diff --git a/README.md b/README.md index ecad95e58..b37348a74 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,6 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ### Hot topics -- ⚠️ Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138 - New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow - Collecting Apple Silicon performance stats: - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167 @@ -291,7 +290,7 @@ In order to build llama.cpp you have three different options. sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \ opencl clblast openblas - gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4 + gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4 ``` **Notes:** With this packages you can build llama.cpp with OPENBLAS and @@ -614,9 +613,9 @@ Building the program with BLAS support may lead to some performance improvements # obtain the original LLaMA model weights and place them in ./models ls ./models 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model - # [Optional] for models using BPE tokenizers - ls ./models - 65B 30B 13B 7B vocab.json +# [Optional] for models using BPE tokenizers +ls ./models +65B 30B 13B 7B vocab.json # install Python dependencies python3 -m pip install -r requirements.txt @@ -624,8 +623,8 @@ python3 -m pip install -r requirements.txt # convert the 7B model to ggml FP16 format python3 convert.py models/7B/ - # [Optional] for models using BPE tokenizers - python convert.py models/7B/ --vocabtype bpe +# [Optional] for models using BPE tokenizers +python convert.py models/7B/ --vocabtype bpe # quantize the model to 4-bits (using q4_0 method) ./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0 diff --git a/common/common.cpp b/common/common.cpp index 288013676..0dd1c50cf 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1521,6 +1521,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); + fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false"); fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index f239415d3..542cc7bb8 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -563,6 +563,7 @@ struct test { static const bool cuda; static const bool opencl; static const bool vulkan; + static const bool kompute; static const bool metal; static const bool gpu_blas; static const bool blas; @@ -647,6 +648,9 @@ struct test { if (vulkan) { return "Vulkan"; } + if (kompute) { + return "Kompute"; + } if (metal) { return "Metal"; } @@ -662,7 +666,7 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { "build_commit", "build_number", - "cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas", + "cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas", "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads", "type_k", "type_v", @@ -686,8 +690,9 @@ struct test { field == "avg_ns" || field == "stddev_ns") { return INT; } - if (field == "cuda" || field == "opencl" || field == "vulkan"|| field == "metal" || field == "gpu_blas" || field == "blas" || - field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") { + if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" || + field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" || + field == "mul_mat_q") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -714,7 +719,8 @@ struct test { } std::vector values = { build_commit, std::to_string(build_number), - std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), + std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan), + std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), @@ -743,6 +749,7 @@ const int test::build_number = LLAMA_BUILD_NUMBER; const bool test::cuda = !!ggml_cpu_has_cublas(); const bool test::opencl = !!ggml_cpu_has_clblast(); const bool test::vulkan = !!ggml_cpu_has_vulkan(); +const bool test::kompute = !!ggml_cpu_has_kompute(); const bool test::metal = !!ggml_cpu_has_metal(); const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); const bool test::blas = !!ggml_cpu_has_blas(); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 58b7f807a..1c6138d23 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -39,6 +39,17 @@ static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; static bool is_interacting = false; +static bool file_exists(const std::string &path) { + std::ifstream f(path.c_str()); + return f.good(); +} + +static bool file_is_empty(const std::string &path) { + std::ifstream f; + f.exceptions(std::ifstream::failbit | std::ifstream::badbit); + f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + return f.tellg() == 0; +} static void write_logfile( const llama_context * ctx, const gpt_params & params, const llama_model * model, @@ -215,12 +226,12 @@ int main(int argc, char ** argv) { if (!path_session.empty()) { LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); - - // fopen to check for existing session - FILE * fp = std::fopen(path_session.c_str(), "rb"); - if (fp != NULL) { - std::fclose(fp); - + if (!file_exists(path_session)) { + LOG_TEE("%s: session file does not exist, will create.\n", __func__); + } else if (file_is_empty(path_session)) { + LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__); + } else { + // The file exists and is not empty session_tokens.resize(n_ctx); size_t n_token_count_out = 0; if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { @@ -229,10 +240,7 @@ int main(int argc, char ** argv) { } session_tokens.resize(n_token_count_out); llama_set_rng_seed(ctx, params.seed); - - LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); - } else { - LOG_TEE("%s: session file does not exist, will create\n", __func__); + LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); } } diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 773024160..6d5f213dc 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -378,6 +378,8 @@ int main(int argc, char ** argv) { printf("testing %s ...\n", ggml_type_name(type)); } + ggml_quantize_init(type); + error_stats global_stats {}; for (const auto& kv_tensor : tensors) { diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index f4786157e..a9673f0d4 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -25,6 +25,7 @@ static const std::vector QUANT_OPTIONS = { { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, + { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, @@ -36,7 +37,7 @@ static const std::vector QUANT_OPTIONS = { { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", }, { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, - { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", }, + { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, diff --git a/examples/server/README.md b/examples/server/README.md index dce4ec47c..fe934dab1 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -4,34 +4,35 @@ This example demonstrates a simple HTTP API server and a simple web front end to Command line options: -- `--threads N`, `-t N`: Set the number of threads to use during generation. -- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. -- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). -- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. -- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. -- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. -- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. -- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`. -- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. -- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. -- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. -- `--numa`: Attempt optimizations that help on some NUMA systems. -- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. -- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. -- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. -- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. -- `--port`: Set the port to listen. Default: `8080`. -- `--path`: path from which to serve static files (default examples/server/public) -- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. -- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s. -- `--embedding`: Enable embedding extraction, Default: disabled. -- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1) -- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled) -- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) -- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. -- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w` -- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` +- `--threads N`, `-t N`: Set the number of threads to use during generation. +- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. +- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). +- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. +- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. +- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. +- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`. +- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. +- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. +- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. +- `--numa`: Attempt optimizations that help on some NUMA systems. +- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. +- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. +- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. +- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. +- `--port`: Set the port to listen. Default: `8080`. +- `--path`: path from which to serve static files (default examples/server/public) +- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. +- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s. +- `--embedding`: Enable embedding extraction, Default: disabled. +- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1) +- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled) +- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) +- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. +- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w` +- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` + ## Build server is build alongside everything else from the root of the project @@ -52,21 +53,23 @@ server is build alongside everything else from the root of the project To get started right away, run the following command, making sure to use the correct path for the model you have: -### Unix-based systems (Linux, macOS, etc.): +### Unix-based systems (Linux, macOS, etc.) ```bash ./server -m models/7B/ggml-model.gguf -c 2048 ``` -### Windows: +### Windows ```powershell server.exe -m models\7B\ggml-model.gguf -c 2048 ``` + The above command will start a server that by default listens on `127.0.0.1:8080`. You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url. -### Docker: +### Docker + ```bash docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 @@ -120,12 +123,13 @@ node index.js ``` ## API Endpoints -- **GET** `/health`: Returns the current state of the server: - - `{"status": "loading model"}` if the model is still being loaded. - - `{"status": "error"}` if the model failed to load. - - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below. -- **POST** `/completion`: Given a `prompt`, it returns the predicted completion. +- **GET** `/health`: Returns the current state of the server: + - `{"status": "loading model"}` if the model is still being loaded. + - `{"status": "error"}` if the model failed to load. + - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below. + +- **POST** `/completion`: Given a `prompt`, it returns the predicted completion. *Options:* @@ -189,14 +193,13 @@ node index.js `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) -### Result JSON: - -* Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion. +### Result JSON +- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion. - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure: -``` +```json { "content": "", "probs": [ @@ -212,6 +215,7 @@ node index.js ] }, ``` + Notice that each `probs` is an array of length `n_probs`. - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string. @@ -228,7 +232,7 @@ Notice that each `probs` is an array of length `n_probs`. - `tokens_evaluated`: Number of tokens evaluated in total from the prompt - `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`) -- **POST** `/tokenize`: Tokenize a given text. +- **POST** `/tokenize`: Tokenize a given text. *Options:* @@ -236,13 +240,13 @@ Notice that each `probs` is an array of length `n_probs`. Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`. -- **POST** `/detokenize`: Convert tokens to text. +- **POST** `/detokenize`: Convert tokens to text. *Options:* `tokens`: Set the tokens to detokenize. -- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. +- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. *Options:* @@ -250,7 +254,7 @@ Notice that each `probs` is an array of length `n_probs`. `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. -- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream. +- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream. *Options:* @@ -260,9 +264,9 @@ Notice that each `probs` is an array of length `n_probs`. It also accepts all the options of `/completion` except `stream` and `prompt`. -- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots. +- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots. -- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served. +- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served. *Options:* @@ -290,6 +294,7 @@ Notice that each `probs` is an array of length `n_probs`. print(completion.choices[0].message) ``` + ... or raw HTTP requests: ```shell @@ -311,6 +316,40 @@ Notice that each `probs` is an array of length `n_probs`. }' ``` +- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API. + + *Options:* + + See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings). + + *Examples:* + + - input as string + + ```shell + curl http://localhost:8080/v1/embeddings \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer no-key" \ + -d '{ + "input": "hello", + "model":"GPT-4", + "encoding_format": "float" + }' + ``` + + - `input` as string array + + ```shell + curl http://localhost:8080/v1/embeddings \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer no-key" \ + -d '{ + "input": ["hello", "world"], + "model":"GPT-4", + "encoding_format": "float" + }' + ``` + ## More examples ### Change system prompt on runtime @@ -362,6 +401,7 @@ python api_like_OAI.py ``` After running the API server, you can use it in Python by setting the API base URL. + ```python openai.api_base = "http://:port" ``` diff --git a/examples/server/chat.sh b/examples/server/chat.sh index 014360121..da0a6ca68 100755 --- a/examples/server/chat.sh +++ b/examples/server/chat.sh @@ -48,6 +48,7 @@ chat_completion() { top_p: 0.9, n_keep: $n_keep, n_predict: 256, + cache_prompt: true, stop: ["\n### Human:"], stream: true }')" diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 11dd82c33..21bdce8ed 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -185,7 +185,7 @@ struct llama_client_slot llama_sampling_context *ctx_sampling = nullptr; int32_t ga_i = 0; // group-attention state - int32_t ga_n = 1;// group-attention factor + int32_t ga_n = 1; // group-attention factor int32_t ga_w = 512; // group-attention width int32_t n_past_se = 0; // self-extend @@ -219,7 +219,8 @@ struct llama_client_slot sent_token_probs_index = 0; infill = false; ga_i = 0; - n_past_se = 0; + n_past_se = 0; + generated_token_probs.clear(); for (slot_image & img : images) @@ -1227,7 +1228,7 @@ struct llama_server_context std::vector append_tokens = tokenize(json_prompt, false); // has next image for (int i = 0; i < (int) append_tokens.size(); ++i) { - llama_batch_add(batch, append_tokens[i], slot.n_past, { slot.id }, true); + llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true); slot.n_past += 1; } } @@ -1295,6 +1296,8 @@ struct llama_server_context for (llama_client_slot &slot : slots) { slot.cache_tokens.clear(); + slot.n_past = 0; + slot.n_past_se = 0; } } @@ -1364,26 +1367,26 @@ struct llama_server_context kv_cache_clear(); } return true; - } else { - task_server task; - task.type = TASK_TYPE_NEXT_RESPONSE; - task.target_id = -1; - queue_tasks.post(task); } + task_server task; + task.type = TASK_TYPE_NEXT_RESPONSE; + task.target_id = -1; + queue_tasks.post(task); + for (llama_client_slot &slot : slots) { if (slot.ga_n == 1) { - if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx) + if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx) { // Shift context - const int n_left = slot.n_past - slot.params.n_keep - 1; + const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1; const int n_discard = n_left / 2; LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard); + llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard); for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++) { @@ -1429,8 +1432,10 @@ struct llama_server_context slot.i_batch = batch.n_tokens; const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; - llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true); + // TODO: we always have to take into account the "system_tokens" + // this is not great and needs to be improved somehow + llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true); slot.n_past += 1; } @@ -1481,8 +1486,8 @@ struct llama_server_context prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); + prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); + prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); prefix_tokens.push_back(llama_token_middle(model)); prompt_tokens = prefix_tokens; } @@ -1582,8 +1587,8 @@ struct llama_server_context } LOG_VERBOSE("prompt ingested", { - {"n_past", slot.n_past}, - {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)}, + {"n_past", slot.n_past}, + {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)}, {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())}, }); @@ -1591,10 +1596,13 @@ struct llama_server_context // process the prefix of first image std::vector prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens; + int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; - int ga_i = slot.ga_i; + + int32_t ga_i = slot.ga_i; int32_t ga_n = slot.ga_n; int32_t ga_w = slot.ga_w; + for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past) { if (slot.ga_n != 1) @@ -1606,7 +1614,7 @@ struct llama_server_context } } llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false); - slot_npast += 1; + slot_npast++; } if (has_images && !ingest_images(slot, n_batch)) @@ -1666,6 +1674,7 @@ struct llama_server_context slot.n_past_se += n_tokens; } } + llama_batch batch_view = { n_tokens, @@ -1782,51 +1791,51 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); if (llama_mlock_supported()) { - printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); + printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } if (llama_mmap_supported()) { - printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } - printf(" --numa attempt optimizations that help on some NUMA systems\n"); + printf(" --numa attempt optimizations that help on some NUMA systems\n"); #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); + printf(" number of layers to store in VRAM\n"); printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); - printf(" how to split the model across multiple GPUs, one of:\n"); - printf(" - none: use one GPU only\n"); - printf(" - layer (default): split layers and KV across GPUs\n"); - printf(" - row: split rows across GPUs\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); - printf(" or for intermediate results and KV (with split-mode = row)\n"); + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row)\n"); #endif printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: %s)\n", params.model.c_str()); + printf(" model path (default: %s)\n", params.model.c_str()); printf(" -a ALIAS, --alias ALIAS\n"); - printf(" set an alias for the model, will be added as `model` field in completion response\n"); - printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); - printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); - printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); - printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n"); - printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n"); - printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); - printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); - printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); - printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); - printf(" -spf FNAME, --system-prompt-file FNAME\n"); - printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); - printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); - printf(" --log-disable disables logging to a file.\n"); + printf(" set an alias for the model, will be added as `model` field in completion response\n"); + printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); + printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); + printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); + printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n"); + printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n"); + printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); + printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); + printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); + printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); + printf(" -spf FNAME, --system-prompt-file FNAME\n"); + printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); + printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); + printf(" --log-disable disables logging to a file.\n"); printf("\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); - printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); - printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf(" -gan N, --grp-attn-n N Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`"); - printf(" -gaw N, --grp-attn-w N Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`"); + printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); + printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); + printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`"); + printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`"); printf("\n"); } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7695b86b2..949bc8a1c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -191,6 +191,10 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) { #endif // __has_builtin(__builtin_elementwise_sub_sat) } +static __device__ __forceinline__ int __vsub4(const int a, const int b) { + return __vsubss4(a, b); +} + static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) c = __builtin_amdgcn_sdot4(a, b, c, false); @@ -505,6 +509,14 @@ typedef struct { } block_iq2_xs; static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); +#define QR3_XXS 8 +#define QI3_XXS (QK_K / (4*QR3_XXS)) +typedef struct { + half d; + uint8_t qs[3*(QK_K/8)]; +} block_iq3_xxs; +static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); + #define WARP_SIZE 32 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses @@ -1613,6 +1625,41 @@ static const __device__ uint64_t iq2xs_grid[512] = { 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, }; +static const __device__ uint32_t iq3xxs_grid[256] = { + 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414, + 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, + 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404, + 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e, + 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, + 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c, + 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34, + 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, + 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, + 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04, + 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, + 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414, + 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434, + 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, + 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e, + 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24, + 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, + 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, + 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c, + 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, + 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, + 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e, + 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, + 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, + 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c, + 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, + 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c, + 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c, + 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, + 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14, + 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c, + 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, +}; + static const __device__ uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, @@ -1624,6 +1671,43 @@ static const __device__ uint8_t ksigns_iq2xs[128] = { 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, }; +//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +static const __device__ uint64_t ksigns64[128] = { + 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff, + 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff, + 0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff, + 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff, + 0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff, + 0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff, + 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff, + 0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff, + 0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff, + 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff, + 0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff, + 0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff, + 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff, + 0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff, + 0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff, + 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff, + 0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff, + 0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff, + 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff, + 0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff, + 0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff, + 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff, + 0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff, + 0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff, + 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff, + 0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff, + 0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff, + 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff, + 0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff, + 0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff, + 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff, + 0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff, +}; +//#endif + static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128}; inline bool ggml_cuda_supports_mmq(enum ggml_type type) { @@ -1690,6 +1774,34 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst } +template +static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_iq3_xxs * x = (const block_iq3_xxs *) vx; + + const int tid = threadIdx.x; +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * q3 = x[i].qs + 8*ib; + const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]); + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f; + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; + for (int j = 0; j < 4; ++j) { + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } +#else + assert(false); +#endif + +} + static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -4313,6 +4425,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics #if QK_K == 256 const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; @@ -4323,20 +4436,22 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( const uint8_t ls2 = bq2->scales[ib32] >> 4; int sumi1 = 0; for (int l = 0; l < 2; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); - const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; - for (int j = 0; j < 8; ++j) { - sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); - } + const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); + const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); + const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]); + const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]); + sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1); + sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1); q8 += 8; } int sumi2 = 0; for (int l = 2; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); - const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; - for (int j = 0; j < 8; ++j) { - sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); - } + const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); + const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); + const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]); + const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]); + sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2); + sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2); q8 += 8; } const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f; @@ -4345,6 +4460,45 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( assert(false); return 0.f; #endif +#else + assert(false); + return 0.f; +#endif +} + +static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +#if QK_K == 256 + const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; + + const int ib32 = iqs; + const uint8_t * q3 = bq2->qs + 8*ib32; + const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + uint32_t aux32 = gas[0] | (gas[1] << 16); + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0]; + const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1]; + const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127)); + const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]); + const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]); + sumi = __dp4a(grid_l, *((int *)q8+0), sumi); + sumi = __dp4a(grid_h, *((int *)q8+1), sumi); + q8 += 8; + aux32 >>= 7; + } + const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f; + return d * sumi; +#else + assert(false); + return 0.f; +#endif +#else + assert(false); + return 0.f; +#endif } template static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, - const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) { + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13) { const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= ne) { return; } - // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor + // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor // then combine those indices with the corresponding byte offsets to get the total offsets - const int i02 = i / (ne00*ne01); - const int i01 = (i - i02*ne01*ne00) / ne00; - const int i00 = i - i02*ne01*ne00 - i01*ne00; - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; + const int i03 = i/(ne00 * ne01 * ne02); + const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); + const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; + const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - const int i12 = i / (ne10*ne11); - const int i11 = (i - i12*ne10*ne11) / ne10; - const int i10 = i - i12*ne10*ne11 - i11*ne10; - const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + const int i13 = i/(ne10 * ne11 * ne12); + const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); + const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; + const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; + const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; cpy_1(cx + x_offset, cdst + dst_offset); } @@ -5471,23 +5635,26 @@ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { template static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, - const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) { + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13) { const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk; if (i >= ne) { return; } - const int i02 = i / (ne00*ne01); - const int i01 = (i - i02*ne01*ne00) / ne00; - const int i00 = (i - i02*ne01*ne00 - i01*ne00); - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02; + const int i03 = i/(ne00 * ne01 * ne02); + const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); + const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; + const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - const int i12 = i / (ne10*ne11); - const int i11 = (i - i12*ne10*ne11) / ne10; - const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk; - const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12; + const int i13 = i/(ne10 * ne11 * ne12); + const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); + const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; + const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; + const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13; cpy_blck(cx + x_offset, cdst + dst_offset); } @@ -6381,6 +6548,12 @@ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, dequantize_block_iq2_xs<<>>(vx, y); } +template +static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq3_xxs<<>>(vx, y); +} + template static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; @@ -6418,6 +6591,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_iq2_xxs_cuda; case GGML_TYPE_IQ2_XS: return dequantize_row_iq2_xs_cuda; + case GGML_TYPE_IQ3_XXS: + return dequantize_row_iq3_xxs_cuda; case GGML_TYPE_F32: return convert_unary_cuda; default: @@ -6451,6 +6626,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq2_xxs_cuda; case GGML_TYPE_IQ2_XS: return dequantize_row_iq2_xs_cuda; + case GGML_TYPE_IQ3_XXS: + return dequantize_row_iq3_xxs_cuda; case GGML_TYPE_F16: return convert_unary_cuda; default: @@ -6663,6 +6840,15 @@ static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float <<>>(vx, vy, dst, ncols, nrows); } +static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + static void ggml_mul_mat_q4_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { @@ -7135,69 +7321,82 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda( (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x); } + +static void ggml_cpy_f16_f32_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); +} + static void ggml_cpy_f32_f32_cuda( const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, - const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } static void ggml_cpy_f32_f16_cuda( const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, - const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } static void ggml_cpy_f32_q8_0_cuda( const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, - const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { GGML_ASSERT(ne % QK8_0 == 0); const int num_blocks = ne / QK8_0; cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } static void ggml_cpy_f32_q4_0_cuda( const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, - const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { GGML_ASSERT(ne % QK4_0 == 0); const int num_blocks = ne / QK4_0; cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } static void ggml_cpy_f32_q4_1_cuda( const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, - const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { GGML_ASSERT(ne % QK4_1 == 0); const int num_blocks = ne / QK4_1; cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } static void ggml_cpy_f16_f16_cuda( const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int nb00, const int nb01, const int nb02, - const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) { + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } + + static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; scale_f32<<>>(x, dst, scale, k); @@ -8213,6 +8412,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= CC_RDNA2 ? 128 : 64; default: GGML_ASSERT(false); @@ -8235,6 +8435,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= CC_VOLTA ? 128 : 64; case GGML_TYPE_Q6_K: return 64; @@ -8306,6 +8507,9 @@ static void ggml_cuda_op_mul_mat_vec_q( case GGML_TYPE_IQ2_XS: mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; + case GGML_TYPE_IQ3_XXS: + mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; default: GGML_ASSERT(false); break; @@ -9941,19 +10145,25 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; - GGML_ASSERT(src0->ne[3] == 1); + const int64_t ne02 = src0->ne[2]; + + //GGML_ASSERT(src0->ne[3] == 1); const int64_t nb00 = src0->nb[0]; const int64_t nb01 = src0->nb[1]; const int64_t nb02 = src0->nb[2]; + const int64_t nb03 = src0->nb[3]; const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; - GGML_ASSERT(src1->ne[3] == 1); + const int64_t ne12 = src1->ne[2]; + + //GGML_ASSERT(src1->ne[3] == 1); const int64_t nb10 = src1->nb[0]; const int64_t nb11 = src1->nb[1]; const int64_t nb12 = src1->nb[2]; + const int64_t nb13 = src1->nb[3]; ggml_cuda_set_device(g_main_device); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; @@ -9965,17 +10175,19 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { - ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { - ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { - ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else { fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); @@ -10934,7 +11146,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons return false; } ggml_type a_type = a->type; - if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS) { + if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS) { if (b->ne[1] == 1 && ggml_nrows(b) > 1) { return false; } @@ -10978,6 +11190,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { return true; } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return true; + } return false; } break; case GGML_OP_DUP: diff --git a/ggml-metal.h b/ggml-metal.h index df83a1807..a5c542189 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -57,6 +57,9 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(voi // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family); +// capture all command buffers committed the next time `ggml_backend_graph_compute` is called +GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend); + #ifdef __cplusplus } #endif diff --git a/ggml-metal.m b/ggml-metal.m index bbeedaae0..f87859552 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -60,6 +60,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, + GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, GGML_METAL_KERNEL_TYPE_RMS_NORM, GGML_METAL_KERNEL_TYPE_GROUP_NORM, @@ -81,6 +82,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, @@ -98,6 +100,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, @@ -112,6 +115,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, @@ -126,6 +130,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, GGML_METAL_KERNEL_TYPE_ROPE_F32, GGML_METAL_KERNEL_TYPE_ROPE_F16, GGML_METAL_KERNEL_TYPE_ALIBI_F32, @@ -163,6 +168,8 @@ struct ggml_metal_context { bool support_simdgroup_reduction; bool support_simdgroup_mm; + + bool should_capture_next_compute; }; // MSL code @@ -349,6 +356,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false"); GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + ctx->should_capture_next_compute = false; + #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6); @@ -422,6 +431,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, get_rows_q6_K, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction); @@ -443,6 +453,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction); @@ -460,6 +471,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm); @@ -474,6 +486,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm); @@ -488,6 +501,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true); @@ -677,6 +691,20 @@ static bool ggml_metal_graph_compute( const int n_cb = ctx->n_cb; const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; + const bool should_capture = ctx->should_capture_next_compute; + if (should_capture) { + ctx->should_capture_next_compute = false; + + MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new]; + descriptor.captureObject = ctx->queue; + + NSError * error = nil; + if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) { + GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]); + GGML_ASSERT(!"capture failed"); + } + } + id command_buffer_builder[n_cb]; for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; @@ -685,6 +713,7 @@ static bool ggml_metal_graph_compute( // enqueue the command buffers in order to specify their execution order [command_buffer enqueue]; } + const id *command_buffers = command_buffer_builder; dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) { @@ -731,9 +760,9 @@ static bool ggml_metal_graph_compute( GGML_ASSERT(!"unsupported op"); } -#ifndef GGML_METAL_NDEBUG - [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; -#endif + if (should_capture) { + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; + } const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; @@ -1260,6 +1289,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break; case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break; case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; + case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break; default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); } @@ -1388,6 +1418,12 @@ static bool ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline; } break; + case GGML_TYPE_IQ3_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline; + } break; default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); @@ -1430,6 +1466,11 @@ static bool ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } + else if (src0t == GGML_TYPE_IQ3_XXS) { + const int mem_size = 256*4+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else if (src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1524,6 +1565,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break; case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; + case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break; default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } @@ -1655,6 +1697,12 @@ static bool ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline; } break; + case GGML_TYPE_IQ3_XXS: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline; + } break; default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); @@ -1713,6 +1761,11 @@ static bool ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } + else if (src2t == GGML_TYPE_IQ3_XXS) { + const int mem_size = 256*4+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + } else if (src2t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1753,6 +1806,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K ].pipeline; break; case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break; case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; + case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; default: GGML_ASSERT(false && "not implemented"); } @@ -2183,9 +2237,9 @@ static bool ggml_metal_graph_compute( } } -#ifndef GGML_METAL_NDEBUG - [encoder popDebugGroup]; -#endif + if (should_capture) { + [encoder popDebugGroup]; + } } [encoder endEncoding]; @@ -2207,6 +2261,10 @@ static bool ggml_metal_graph_compute( } } + if (should_capture) { + [[MTLCaptureManager sharedCaptureManager] stopCapture]; + } + return true; } @@ -2578,6 +2636,13 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) { return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)]; } +void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) { + GGML_ASSERT(ggml_backend_is_metal(backend)); + + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + ctx->should_capture_next_compute = true; +} + GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) { diff --git a/ggml-metal.metal b/ggml-metal.metal index 029578dc5..2614d82e8 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -2459,6 +2459,12 @@ typedef struct { } block_iq2_xs; // 74 bytes / block for QK_K = 256, so 2.3125 bpw +typedef struct { + half d; + uint8_t qs[3*QK_K/8]; +} block_iq3_xxs; +// 98 bytes / block for QK_K = 256, so 3.0625 bpw + //====================================== dot products ========================= void kernel_mul_mv_q2_K_f32_impl( @@ -3681,6 +3687,42 @@ constexpr constant static uint64_t iq2xs_grid[512] = { 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, }; +constexpr constant static uint32_t iq3xxs_grid[256] = { + 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414, + 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, + 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404, + 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e, + 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, + 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c, + 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34, + 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, + 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, + 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04, + 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, + 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414, + 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434, + 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, + 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e, + 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24, + 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, + 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, + 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c, + 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, + 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, + 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e, + 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, + 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, + 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c, + 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, + 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c, + 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c, + 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, + 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14, + 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c, + 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, +}; + + constexpr constant static uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, @@ -3970,6 +4012,143 @@ kernel void kernel_mul_mv_iq2_xs_f32( kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } +void kernel_mul_mv_iq3_xxs_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + + device const block_iq3_xxs * x = (device const block_iq3_xxs *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float yl[32]; + float sumf[N_DST]={0.f}, all_sum; + + const int nb32 = nb * (QK_K / 32); + + threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values; + threadgroup uint8_t * shared_signs = (threadgroup uint8_t *)(values + 256); + { + int nval = 4; + int pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) values[pos + i] = iq3xxs_grid[pos + i]; + nval = 2; + pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i]; + threadgroup_barrier(mem_flags::mem_threadgroup); + } + +#if QK_K == 256 + const int ix = tiisg; + + device const float * y4 = y + 32 * ix; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + + for (int i = 0; i < 32; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq3_xxs * xr = x + ibl; + device const uint8_t * q3 = xr->qs + 8 * ib; + device const uint16_t * gas = (device const uint16_t *)(xr->qs + QK_K/4) + 2 * ib; + device const half * dh = &xr->d; + + for (int row = 0; row < N_DST; row++) { + + const float db = dh[0]; + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float d = db * (0.5f + (aux32 >> 28)); + + float2 sum = {0}; + for (int l = 0; l < 4; ++l) { + const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + q3[2*l+0]); + const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + q3[2*l+1]); + const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sum[0] += yl[8*l + j + 0] * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + sum[1] += yl[8*l + j + 4] * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } + } + sumf[row] += d * (sum[0] + sum[1]); + + dh += nb*sizeof(block_iq3_xxs)/2; + q3 += nb*sizeof(block_iq3_xxs); + gas += nb*sizeof(block_iq3_xxs)/2; + } + + y4 += 32 * 32; + } +#else + // TODO +#endif + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f; + } + } +} + +[[host_name("kernel_mul_mv_iq3_xxs_f32")]] +kernel void kernel_mul_mv_iq3_xxs_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +} + + //============================= templates and their specializations ============================= // NOTE: this is not dequantizing - we are simply fitting the template @@ -4287,6 +4466,33 @@ void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 } } +template +void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint8_t * q3 = xb->qs + 8*ib32; + device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32; + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f; + constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]); + constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]); + uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127]; + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); + reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); + } + grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]); + grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]); + signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127]; + for (int i = 0; i < 4; ++i) { + reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f); + reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f); + } +} + template kernel void kernel_get_rows( device const void * src0, @@ -4828,6 +5034,7 @@ template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows; // // matrix-matrix multiplication @@ -4866,6 +5073,7 @@ template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; // // indirect matrix-matrix multiplication @@ -4916,6 +5124,7 @@ template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mu template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; // // matrix-vector multiplication @@ -5818,3 +6027,68 @@ kernel void kernel_mul_mv_id_iq2_xs_f32( tiisg, sgitg); } + +[[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] +kernel void kernel_mul_mv_id_iq3_xxs_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq3_xxs_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + shared_values, + tgpig, + tiisg, + sgitg); +} diff --git a/ggml-quants.c b/ggml-quants.c index 7d2f033e9..8236385bc 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3441,6 +3441,41 @@ static const uint64_t iq2xs_grid[512] = { 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, }; +static const uint32_t iq3xxs_grid[256] = { + 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414, + 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, + 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404, + 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e, + 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, + 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c, + 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34, + 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, + 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, + 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04, + 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, + 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414, + 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434, + 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, + 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e, + 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24, + 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, + 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, + 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c, + 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, + 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, + 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e, + 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, + 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, + 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c, + 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, + 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c, + 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c, + 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, + 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14, + 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c, + 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, +}; + static const uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, @@ -3507,6 +3542,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, } } +// ====================== 3.0625 bpw (de)-quantization + +void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + uint32_t aux32; + + for (int i = 0; i < nb; i++) { + + const float d = GGML_FP16_TO_FP32(x[i].d); + const uint8_t * qs = x[i].qs; + const uint8_t * scales_and_signs = qs + QK_K/4; + + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, scales_and_signs + 4*ib32, sizeof(uint32_t)); + const float db = d * (0.5f + (aux32 >> 28)) * 0.5f; + for (int l = 0; l < 4; ++l) { + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + qs[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + qs[2*l+1]); + for (int j = 0; j < 4; ++j) { + y[j+0] = db * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = db * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } + y += 8; + } + qs += 8; + } + } +} + //===================================== Q8_K ============================================== void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { @@ -8458,17 +8525,36 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest const __m128i m4 = _mm_set1_epi8(0xf); const __m128i m1 = _mm_set1_epi8(1); - const __m128i m511 = _mm_set1_epi16(511); - const __m128i m127 = _mm_set1_epi16(127); + const __m256i m511 = _mm256_set1_epi16(511); + const __m256i mone = _mm256_set1_epi8(1); - const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + static const uint8_t k_bit_helper[32] = { + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00, + }; + static const char block_sign_shuffle_mask_1[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, + }; + static const char block_sign_shuffle_mask_2[32] = { + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, + 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, + }; + static const uint8_t bit_selector_mask_bytes[32] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper); + const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes); + const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1); + const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2); uint64_t aux64; // somewhat hacky, but gives a significant boost in performance - __m128i aux_gindex, aux_sindex; + __m256i aux_gindex; const uint16_t * gindex = (const uint16_t *)&aux_gindex; - const uint16_t * sindex = (const uint16_t *)&aux_sindex; __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { @@ -8483,26 +8569,68 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); - for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { + + const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2); q2 += 16; + aux_gindex = _mm256_and_si256(q2_data, m511); + + const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9); + const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13); + const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper); + + const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting); + const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits); + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2); q2 += 8; - aux_gindex = _mm_and_si128(q2_data, m511); - aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127); - const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]); - const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]); - const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]); - const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]); - const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); - const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + + const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]], + iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]); + const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]], + iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]); + const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]], + iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]); + const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]], + iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]); + + const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits); + const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1); + const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l); + const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h); + + __m256i signs; + signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone)); + + signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone)); + + signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone)); + + signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2); + signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask); + const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone)); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3); + const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4); const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1))); + const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2))); + const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3))); sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2)); + sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4)); } accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); @@ -8551,6 +8679,136 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest #endif } +// TODO +void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_iq3_xxs * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[2]; + + ggml_int8x16x4_t q3s; + ggml_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict gas = x[i].qs + QK_K/4; + const int8_t * restrict q8 = y[i].qs; + float sumf1 = 0, sumf2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t); + const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]}; + const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]}; + const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]}; + const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]}; + q3 += 16; + q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127)))); + q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127)))); + q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127)))); + q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127)))); + q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0)); + q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1)); + q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2)); + q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3)); + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); + sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28)); + sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28)); + } + sumf += d*(sumf1 + sumf2); + } + *s = 0.5f * sumf; + +#elif defined(__AVX2__) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint32_t aux32[2]; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict gas = x[i].qs + QK_K/4; + const int8_t * restrict q8 = y[i].qs; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]], + iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]); + q3 += 8; + memcpy(aux32, gas, 8); gas += 8; + const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127], + signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]); + const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127], + signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const uint16_t ls1 = aux32[0] >> 28; + const uint16_t ls2 = aux32[1] >> 28; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.25f * hsum_float_8(accumf); + +#else + + uint32_t aux32; + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict q3 = x[i].qs; + const uint8_t * restrict gas = x[i].qs + QK_K/4; + const int8_t * restrict q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t); + const uint32_t ls = 2*(aux32 >> 28) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]); + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127]; + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + q3 += 8; + bsum += sumi * ls; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +#endif +} + // ================================ IQ2 quantization ============================================= typedef struct { @@ -9189,3 +9447,436 @@ size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, i return nrow * nblock * sizeof(block_iq2_xs); } +// +// ============================================= 3-bit using D4 lattice +// + +typedef struct { + uint32_t * grid; + int * map; + uint16_t * neighbours; +} iq3_entry_t; + +static iq3_entry_t iq3_data[1] = { + {NULL, NULL, NULL}, +}; + +static inline int iq3_data_index(int grid_size) { + (void)grid_size; + GGML_ASSERT(grid_size == 256); + return 0; +} + +static int iq3_compare_func(const void * left, const void * right) { + const int * l = (const int *)left; + const int * r = (const int *)right; + return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0; +} + +void iq3xs_init_impl(int grid_size) { + const int gindex = iq3_data_index(grid_size); + if (iq3_data[gindex].grid) { + return; + } + static const uint16_t kgrid_256[256] = { + 0, 2, 4, 9, 11, 15, 16, 18, 25, 34, 59, 61, 65, 67, 72, 74, + 81, 85, 88, 90, 97, 108, 120, 128, 130, 132, 137, 144, 146, 153, 155, 159, + 169, 175, 189, 193, 199, 200, 202, 213, 248, 267, 287, 292, 303, 315, 317, 321, + 327, 346, 362, 413, 436, 456, 460, 462, 483, 497, 513, 515, 520, 522, 529, 531, + 536, 538, 540, 551, 552, 576, 578, 585, 592, 594, 641, 643, 648, 650, 657, 664, + 698, 704, 706, 720, 729, 742, 758, 769, 773, 808, 848, 852, 870, 889, 901, 978, + 992, 1024, 1026, 1033, 1035, 1040, 1042, 1046, 1049, 1058, 1089, 1091, 1093, 1096, 1098, 1105, + 1112, 1139, 1143, 1144, 1152, 1154, 1161, 1167, 1168, 1170, 1183, 1184, 1197, 1217, 1224, 1228, + 1272, 1276, 1309, 1323, 1347, 1367, 1377, 1404, 1473, 1475, 1486, 1509, 1537, 1544, 1546, 1553, + 1555, 1576, 1589, 1594, 1600, 1602, 1616, 1625, 1636, 1638, 1665, 1667, 1672, 1685, 1706, 1722, + 1737, 1755, 1816, 1831, 1850, 1856, 1862, 1874, 1901, 1932, 1950, 1971, 2011, 2032, 2052, 2063, + 2077, 2079, 2091, 2095, 2172, 2192, 2207, 2208, 2224, 2230, 2247, 2277, 2308, 2345, 2356, 2389, + 2403, 2424, 2501, 2504, 2506, 2520, 2570, 2593, 2616, 2624, 2630, 2646, 2669, 2700, 2714, 2746, + 2754, 2795, 2824, 2835, 2839, 2874, 2882, 2905, 2984, 3028, 3042, 3092, 3108, 3110, 3124, 3153, + 3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610, + 3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992, + }; + const int kmap_size = 4096; + const int nwant = 2; + const uint16_t * kgrid = kgrid_256; + uint32_t * kgrid_q3xs; + int * kmap_q3xs; + uint16_t * kneighbors_q3xs; + + printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); + uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t)); + for (int k = 0; k < grid_size; ++k) { + int8_t * pos = (int8_t *)(the_grid + k); + for (int i = 0; i < 4; ++i) { + int l = (kgrid[k] >> 3*i) & 0x7; + pos[i] = 2*l + 1; + } + } + kgrid_q3xs = the_grid; + iq3_data[gindex].grid = the_grid; + kmap_q3xs = (int *)malloc(kmap_size*sizeof(int)); + iq3_data[gindex].map = kmap_q3xs; + for (int i = 0; i < kmap_size; ++i) kmap_q3xs[i] = -1; + uint32_t aux32; + uint8_t * aux8 = (uint8_t *)&aux32; + for (int i = 0; i < grid_size; ++i) { + aux32 = kgrid_q3xs[i]; + uint16_t index = 0; + for (int k=0; k<4; ++k) { + uint16_t q = (aux8[k] - 1)/2; + index |= (q << 3*k); + } + kmap_q3xs[index] = i; + } + int8_t pos[4]; + int * dist2 = (int *)malloc(2*grid_size*sizeof(int)); + int num_neighbors = 0, num_not_in_map = 0; + for (int i = 0; i < kmap_size; ++i) { + if (kmap_q3xs[i] >= 0) continue; + ++num_not_in_map; + for (int k = 0; k < 4; ++k) { + int l = (i >> 3*k) & 0x7; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); + int d2 = 0; + for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); + int n = 0; int d2 = dist2[0]; + int nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + ++n; + } + num_neighbors += n; + } + printf("%s: %d neighbours in total\n", __func__, num_neighbors); + kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); + iq3_data[gindex].neighbours = kneighbors_q3xs; + int counter = 0; + for (int i = 0; i < kmap_size; ++i) { + if (kmap_q3xs[i] >= 0) continue; + for (int k = 0; k < 4; ++k) { + int l = (i >> 3*k) & 0x7; + pos[k] = 2*l + 1; + } + for (int j = 0; j < grid_size; ++j) { + const int8_t * pg = (const int8_t *)(kgrid_q3xs + j); + int d2 = 0; + for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]); + dist2[2*j+0] = d2; + dist2[2*j+1] = j; + } + qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func); + kmap_q3xs[i] = -(counter + 1); + int d2 = dist2[0]; + uint16_t * start = &kneighbors_q3xs[counter++]; + int n = 0, nhave = 1; + for (int j = 0; j < grid_size; ++j) { + if (dist2[2*j] > d2) { + if (nhave == nwant) break; + d2 = dist2[2*j]; + ++nhave; + } + kneighbors_q3xs[counter++] = dist2[2*j+1]; + ++n; + } + *start = n; + } + free(dist2); +} + +void iq3xs_free_impl(int grid_size) { + GGML_ASSERT(grid_size == 256); + const int gindex = iq3_data_index(grid_size); + if (iq3_data[gindex].grid) { + free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL; + free(iq3_data[gindex].map); iq3_data[gindex].map = NULL; + free(iq3_data[gindex].neighbours); iq3_data[gindex].neighbours = NULL; + } +} + +static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid, + const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) { + int num_neighbors = neighbours[0]; + GGML_ASSERT(num_neighbors > 0); + float best_d2 = FLT_MAX; + int grid_index = -1; + for (int j = 1; j <= num_neighbors; ++j) { + const int8_t * pg = (const int8_t *)(grid + neighbours[j]); + float d2 = 0; + for (int i = 0; i < 4; ++i) { + float q = pg[i]; + float diff = scale*q - xval[i]; + d2 += weight[i]*diff*diff; + } + if (d2 < best_d2) { + best_d2 = d2; grid_index = neighbours[j]; + } + } + GGML_ASSERT(grid_index >= 0); + const int8_t * pg = (const int8_t *)(grid + grid_index); + for (int i = 0; i < 4; ++i) L[i] = (pg[i] - 1)/2; + return grid_index; +} + +static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { + + const int gindex = iq3_data_index(256); + + const uint32_t * kgrid_q3xs = iq3_data[gindex].grid; + const int * kmap_q3xs = iq3_data[gindex].map; + const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours; + + //GGML_ASSERT(quant_weights && "missing quantization weights"); + GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(n%QK_K == 0); + + const int kMaxQ = 8; + + const int nbl = n/256; + + block_iq3_xxs * y = vy; + + float scales[QK_K/32]; + float weight[32]; + float xval[32]; + int8_t L[32]; + int8_t Laux[32]; + float waux[32]; + bool is_on_grid[8]; + bool is_on_grid_aux[8]; + uint8_t block_signs[8]; + uint8_t q3[3*(QK_K/8)]; + uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4); + + for (int ibl = 0; ibl < nbl; ++ibl) { + + y[ibl].d = GGML_FP32_TO_FP16(0.f); + memset(q3, 0, 3*QK_K/8); + + float max_scale = 0; + + const float * xbl = x + QK_K*ibl; + float sumx2 = 0; + for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; + float sigma2 = sumx2/QK_K; + + for (int ib = 0; ib < QK_K/32; ++ib) { + const float * xb = xbl + 32*ib; + if (quant_weights) { + const float * qw = quant_weights + QK_K*ibl + 32*ib; + for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + } else { + for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i]; + } + for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]); + for (int k = 0; k < 4; ++k) { + int nflip = 0; + uint8_t s = 0; + for (int i = 0; i < 8; ++i) { + if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i]; + else { + xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i); + } + } + if (nflip%2) { + int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin]; + for (int i = 1; i < 8; ++i) { + float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i]; + if (ax < min) { + min = ax; imin = i; + } + } + xval[8*k+imin] = -xval[8*k+imin]; + s ^= (1 << imin); + } + block_signs[k] = s & 127; + } + float max = xval[0]; + for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]); + if (!max) { + scales[ib] = 0; + memset(L, 0, 32); + continue; + } + float best = 0; + float scale = max/(2*kMaxQ-1); + for (int is = -15; is <= 15; ++is) { + float id = (2*kMaxQ-1+is*0.2f)/max; + float this_scale = 1/id; + for (int k = 0; k < 8; ++k) { + for (int i = 0; i < 4; ++i) { + int l = nearest_int(0.5f*(id*xval[4*k+i]-1)); + Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l)); + } + uint16_t u = 0; + for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i); + int grid_index = kmap_q3xs[u]; + is_on_grid_aux[k] = true; + if (grid_index < 0) { + is_on_grid_aux[k] = false; + const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1; + grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k); + } + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 32; ++i) { + float w = weight[i]; + float q = 2*Laux[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + scale = sumqx/sumq2; best = scale*sumqx; + for (int i = 0; i < 32; ++i) L[i] = Laux[i]; + for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k]; + } + } + int n_not_ongrid = 0; + for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid; + if (n_not_ongrid > 0 && scale > 0) { + float id = 1/scale; + for (int k = 0; k < 8; ++k) { + if (is_on_grid[k]) continue; + uint16_t u = 0; + for (int i = 0; i < 4; ++i) { + int l = nearest_int(0.5f*(id*xval[4*k+i]-1)); + l = MAX(0, MIN(kMaxQ-1, l)); + u |= (l << 3*i); + } + int grid_index = kmap_q3xs[u]; + if (grid_index < 0) { + const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1; + grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k); + } + const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index); + for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2; + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 32; ++i) { + float w = weight[i]; + float q = 2*L[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0) scale = sumqx/sumq2; + } + if (scale < 0) { + // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale) + // and correspondingly flip quant signs. + scale = -scale; + for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127; + } + for (int k = 0; k < 8; ++k) { + uint16_t u = 0; + for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i); + int grid_index = kmap_q3xs[u]; + if (grid_index < 0) { + printf("Oops: found point %u not on grid:", u); + for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]); + printf("\n"); + GGML_ASSERT(false); + } + q3[8*ib+k] = grid_index; + } + scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21); + GGML_ASSERT(scale >= 0); + scales[ib] = scale; + max_scale = MAX(max_scale, scale); + } + + if (!max_scale) { + memset(y[ibl].qs, 0, 3*QK_K/8); + continue; + } + + float d = max_scale/31; + y[ibl].d = GGML_FP32_TO_FP16(d); + float id = 1/d; + float sumqx = 0, sumq2 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + int l = nearest_int(0.5f*(id*scales[ib]-1)); + l = MAX(0, MIN(15, l)); + scales_and_signs[ib] |= ((uint32_t)l << 28); + if (false) { + const float * xb = xbl + 32*ib; + if (quant_weights) { + const float * qw = quant_weights + QK_K*ibl + 32*ib; + for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + } else { + for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i]; + } + const float db = 0.25f * d * (1 + 2*l); + for (int k = 0; k < 8; ++k) { + const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2); + const float * xk = xb + 4*k; + const float * wk = weight + 4*k; + //const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]); + const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]); + float best_mse = 0; int best_index = q3[8*ib+k]; + for (int j = 0; j < 4; ++j) { + float diff = db * grid[j] * signs[j] - xk[j]; + best_mse += wk[j] * diff * diff; + } + for (int idx = 0; idx < 256; ++idx) { + //grid = (const uint8_t *)(kgrid_q3xs + idx); + grid = (const uint8_t *)(iq3xxs_grid + idx); + float mse = 0; + for (int j = 0; j < 4; ++j) { + float diff = db * grid[j] * signs[j] - xk[j]; + mse += wk[j] * diff * diff; + } + if (mse < best_mse) { + best_mse = mse; best_index = idx; + } + } + q3[8*ib+k] = best_index; + //grid = (const uint8_t *)(kgrid_q3xs + best_index); + grid = (const uint8_t *)(iq3xxs_grid + best_index); + for (int j = 0; j < 4; ++j) { + float q = db * grid[j] * signs[j]; + sumqx += wk[j] * q * xk[j]; + sumq2 += wk[j] * q * q; + } + } + if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2); + } + } + memcpy(y[ibl].qs, q3, 3*QK_K/8); + } +} + +size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + GGML_ASSERT(n_per_row%QK_K == 0); + int nblock = n_per_row/QK_K; + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_iq3_xxs_impl(src, qrow, n_per_row, quant_weights); + src += n_per_row; + qrow += nblock*sizeof(block_iq3_xxs); + } + return nrow * nblock * sizeof(block_iq3_xxs); +} + +void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_iq3_xxs * restrict y = vy; + quantize_row_iq3_xxs_reference(x, y, k); +} + +void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) { + assert(k % QK_K == 0); + quantize_row_iq3_xxs_impl(x, y, k, NULL); +} diff --git a/ggml-quants.h b/ggml-quants.h index 7d7cf9178..5c9f63bd9 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -166,7 +166,7 @@ typedef struct { static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding"); // (Almost) "true" 2-bit quantization. -// Due to the need to use blocks as per ggml dsign, it ends up using +// Due to the need to use blocks as per ggml design, it ends up using // 2.0625 bpw because of the 16-bit scale for each block of 256. typedef struct { ggml_fp16_t d; @@ -182,6 +182,15 @@ typedef struct { } block_iq2_xs; static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); +// (Almost) "true" 3-bit quantization. +// Due to the need to use blocks as per ggml design, it ends up using +// 3.0625 bpw because of the 16-bit scale for each block of 256. +typedef struct { + ggml_fp16_t d; + uint8_t qs[3*QK_K/8]; +} block_iq3_xxs; +static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); + // Quantization void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); @@ -196,6 +205,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); +void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k); void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); @@ -210,6 +220,7 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); +void quantize_row_iq3_xxs(const float * restrict x, void * restrict y, int k); // Dequantization void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); @@ -227,6 +238,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k); void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k); +void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k); // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); @@ -242,12 +254,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); // // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") // size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); @@ -260,3 +274,5 @@ size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, void iq2xs_init_impl(int grid_size); void iq2xs_free_impl(int grid_size); +void iq3xs_init_impl(int grid_size); +void iq3xs_free_impl(int grid_size); diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 2c2b38fc3..1d93ec6bb 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -116,7 +116,7 @@ struct vk_device { vk_queue transfer_queue; uint32_t descriptor_set_mode; uint32_t subgroup_size; - bool is_igpu; + bool uma; }; struct vk_op_push_constants { @@ -675,7 +675,7 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_ vk::PhysicalDeviceMemoryProperties mem_props = vk_device.physical_device.getMemoryProperties(); - uint32_t memory_type_index = uint32_t(~0); + uint32_t memory_type_index = UINT32_MAX; for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) { vk::MemoryType memory_type = mem_props.memoryTypes[i]; @@ -685,7 +685,18 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_ } } - buf.device_memory = vk_device.device.allocateMemory({ mem_req.size, memory_type_index }); + if (memory_type_index >= mem_props.memoryTypeCount) { + throw vk::OutOfDeviceMemoryError("No suitable memory type found"); + } + + try { + buf.device_memory = vk_device.device.allocateMemory({ mem_req.size, memory_type_index }); + } catch (const vk::SystemError& e) { + // Out of Host/Device memory, clean up buffer + vk_device.device.destroyBuffer(buf.buffer); + buf.size = 0; + throw e; + } buf.memory_property_flags = req_flags; buf.ptr = nullptr; @@ -700,6 +711,47 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_ return buf; } +static vk_buffer ggml_vk_create_buffer_check(size_t size, vk::MemoryPropertyFlags req_flags) { + try { + return ggml_vk_create_buffer(size, req_flags); + } catch (const vk::SystemError& e) { + std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl; + std::cerr << "ggml_vulkan: " << e.what() << std::endl; + throw e; + } +} + +static vk_buffer ggml_vk_create_buffer_device(size_t size) { + vk_buffer buf; + try { + buf = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal); + } catch (const vk::SystemError& e) { + if (vk_device.uma) { + // Fall back to host memory type + buf = ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + } else { + std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl; + std::cerr << "ggml_vulkan: " << e.what() << std::endl; + throw e; + } + } + + return buf; +} + +static void ggml_vk_destroy_buffer(vk_buffer& buf) { + if (buf.size == 0) { + return; + } +#ifdef VK_DEBUG + std::cerr << "ggml_vk_destroy_buffer(" << buf.size << ")" << std::endl; +#endif + + buf.size = 0; + vk_device.device.freeMemory(buf.device_memory); + vk_device.device.destroyBuffer(buf.buffer); +} + static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { return { buf, 0, VK_WHOLE_SIZE }; } @@ -738,19 +790,6 @@ static void ggml_vk_wait_events(vk::CommandBuffer& cmd_buffer, std::vector(vk_pinned_memory[i]); + const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]); + if (ptr >= addr && ptr < endr) { + buf = &std::get<2>(vk_pinned_memory[i]); + buf_offset = ((const uint8_t *)ptr) - addr; + break; + } + } +} + static vk_submission ggml_vk_begin_submission(vk_queue& q, bool one_time = true) { vk_submission s; s.buffer = ggml_vk_create_cmd_buffer(q); @@ -1384,6 +1433,13 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect } } +static void ensure_sync_staging_buffer(size_t size) { + if (vk_sync_staging.size < size) { + ggml_vk_destroy_buffer(vk_sync_staging); + vk_sync_staging = ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + } +} + static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { #ifdef VK_DEBUG std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl; @@ -1391,21 +1447,13 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size GGML_ASSERT(!ggml_is_contiguous(tensor)); // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { - std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl; + std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl; GGML_ASSERT(false); } // Check if src is pinned memory - vk_buffer* buf = nullptr; - size_t buf_offset = 0; - for (size_t i = 0; i < vk_pinned_memory.size(); i++) { - const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]); - const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]); - if (tensor->data >= addr && tensor->data < endr) { - buf = &std::get<2>(vk_pinned_memory[i]); - buf_offset = ((const uint8_t *)tensor->data) - addr; - break; - } - } + vk_buffer * buf = nullptr; + size_t buf_offset; + ggml_vk_host_get(tensor->data, buf, buf_offset); const uint64_t ne0 = tensor->ne[0]; const uint64_t ne1 = tensor->ne[1]; @@ -1463,10 +1511,7 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size if (vk_staging.size < vk_staging_offset + copy_size) { if (sync_staging) { // Create temporary larger buffer - if (vk_sync_staging.size < copy_size) { - ggml_vk_destroy_buffer(vk_sync_staging); - vk_sync_staging = ggml_vk_create_buffer(copy_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); - } + ensure_sync_staging_buffer(copy_size); staging = &vk_sync_staging; staging_offset = 0; @@ -1512,17 +1557,9 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size GGML_ASSERT(false); } // Check if src is pinned memory - vk_buffer* buf = nullptr; - size_t buf_offset = 0; - for (size_t i = 0; i < vk_pinned_memory.size(); i++) { - const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]); - const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]); - if (src >= addr && src < endr) { - buf = &std::get<2>(vk_pinned_memory[i]); - buf_offset = ((const uint8_t *)src) - addr; - break; - } - } + vk_buffer * buf = nullptr; + size_t buf_offset; + ggml_vk_host_get(src, buf, buf_offset); if (buf != nullptr) { // Memory is pinned, use as staging buffer @@ -1555,10 +1592,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size const size_t copy_size = width*height; if (vk_staging.size < vk_staging_offset + copy_size) { if (sync_staging) { - if (vk_sync_staging.size < copy_size) { - ggml_vk_destroy_buffer(vk_sync_staging); - vk_sync_staging = ggml_vk_create_buffer(copy_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); - } + ensure_sync_staging_buffer(copy_size); staging = &vk_sync_staging; staging_offset = 0; @@ -1633,17 +1667,9 @@ static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_ GGML_ASSERT(height > 0); GGML_ASSERT(src->size > 0); // Check if dst is pinned memory - vk_buffer* buf = nullptr; - size_t buf_offset = 0; - for (size_t i = 0; i < vk_pinned_memory.size(); i++) { - const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]); - const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]); - if (dst >= addr && dst < endr) { - buf = &std::get<2>(vk_pinned_memory[i]); - buf_offset = ((const uint8_t *)dst) - addr; - break; - } - } + vk_buffer * buf = nullptr; + size_t buf_offset; + ggml_vk_host_get(dst, buf, buf_offset); std::vector slices(1); if (width == spitch && width == dpitch) { @@ -1677,10 +1703,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_ if (vk_staging.size < vk_staging_offset + copy_size) { if (sync_staging) { // Create temporary larger buffer - if (vk_sync_staging.size < copy_size) { - ggml_vk_destroy_buffer(vk_sync_staging); - vk_sync_staging = ggml_vk_create_buffer(copy_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); - } + ensure_sync_staging_buffer(copy_size); staging = &vk_sync_staging; } else { @@ -1819,7 +1842,7 @@ static void ggml_vk_d2h_tensor_2d(vk_context * ctx, vk_buffer * src, size_t offs static uint32_t ggml_vk_guess_split_k(int m, int n, int k) { #ifdef VK_DEBUG - std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ", " << aligned << ")"; + std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")"; #endif if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) { #ifdef VK_DEBUG @@ -2003,8 +2026,27 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co const uint64_t r2 = ne12 / ne02; const uint64_t r3 = ne13 / ne03; - const bool load_x = src0->backend != GGML_BACKEND_GPU; - const bool load_y = src1->backend != GGML_BACKEND_GPU; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + + vk_buffer * d_Qx = nullptr; + size_t qx_buf_offset = 0; + vk_buffer * d_Qy = nullptr; + size_t qy_buf_offset = 0; + + bool src0_uma = false; + bool src1_uma = false; + + if (vk_device.uma) { + ggml_vk_host_get(src0->data, d_Qx, qx_buf_offset); + ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset); + src0_uma = d_Qx != nullptr; + src1_uma = d_Qy != nullptr; + } + + const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma; + const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); @@ -2034,32 +2076,24 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne; const uint64_t d_sz = sizeof(float) * d_ne; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer* d_D = &extra->buffer_gpu; const uint64_t d_buf_offset = extra->offset; GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03); - vk_buffer * d_Qx; - uint64_t qx_buf_offset = 0; - vk_buffer * d_Qy; - uint64_t qy_buf_offset = 0; vk_buffer* d_X; uint64_t x_buf_offset = 0; vk_buffer* d_Y; uint64_t y_buf_offset = 0; if (load_x) { d_Qx = &vk_prealloc_qx; - } else { + } else if (!src0_uma) { d_Qx = &extra_src0->buffer_gpu; qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); } if (load_y) { d_Qy = &vk_prealloc_qy; - } else { + } else if (!src1_uma) { d_Qy = &extra_src1->buffer_gpu; qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qy != nullptr); @@ -2178,8 +2212,27 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0 const uint64_t r2 = ne12 / ne02; const uint64_t r3 = ne13 / ne03; - const bool load_x = src0->backend != GGML_BACKEND_GPU; - const bool load_y = src1->backend != GGML_BACKEND_GPU; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + + vk_buffer * d_Qx = nullptr; + size_t qx_buf_offset = 0; + vk_buffer * d_Qy = nullptr; + size_t qy_buf_offset = 0; + + bool src0_uma = false; + bool src1_uma = false; + + if (vk_device.uma) { + ggml_vk_host_get(src0->data, d_Qx, qx_buf_offset); + ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset); + src0_uma = d_Qx != nullptr; + src1_uma = d_Qy != nullptr; + } + + const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma; + const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); @@ -2199,31 +2252,23 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0 const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne; const uint64_t d_sz = sizeof(float) * d_ne; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer* d_D = &extra->buffer_gpu; const uint64_t d_buf_offset = extra->offset; GGML_ASSERT(d_D != nullptr); - vk_buffer* d_Qx; - uint32_t qx_buf_offset = 0; - vk_buffer* d_Qy; - uint32_t qy_buf_offset = 0; vk_buffer* d_X; uint64_t x_buf_offset = 0; vk_buffer* d_Y; uint64_t y_buf_offset = 0; if (load_x) { d_Qx = &vk_prealloc_qx; - } else { + } else if(!src1_uma) { d_Qx = &extra_src0->buffer_gpu; qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); } if (load_y) { d_Qy = &vk_prealloc_qy; - } else { + } else if(!src1_uma) { d_Qy = &extra_src1->buffer_gpu; qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qy != nullptr); @@ -2345,7 +2390,21 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor GGML_ASSERT(ne11 == 1); - const bool load_y = src1->backend != GGML_BACKEND_GPU; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + + vk_buffer * d_Qy = nullptr; + size_t qy_buf_offset = 0; + + bool src1_uma = false; + + if (vk_device.uma) { + ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset); + src1_uma = d_Qy != nullptr; + } + + const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; const uint64_t x_ne = ne00 * ne01 * ne02; const uint64_t y_ne = ne10 * ne11 * ne12; @@ -2355,22 +2414,15 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); const uint64_t d_sz = sizeof(float) * d_ne; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer* d_D = &extra->buffer_gpu; const uint64_t d_buf_offset = extra->offset; GGML_ASSERT(d_D != nullptr); - vk_buffer* d_Qx; + vk_buffer* d_Qx = &extra_src0->buffer_gpu; const uint64_t qx_buf_offset = extra_src0->offset; - vk_buffer* d_Qy; - uint64_t qy_buf_offset = 0; - d_Qx = &extra_src0->buffer_gpu; GGML_ASSERT(d_Qx != nullptr); if (load_y) { d_Qy = &vk_prealloc_qy; - } else { + } else if (!src1_uma) { d_Qy = &extra_src1->buffer_gpu; qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qx != nullptr); @@ -2430,7 +2482,21 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor * GGML_ASSERT(ne11 == 1); - const bool load_y = src1->backend != GGML_BACKEND_GPU; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + + vk_buffer * d_Qy = nullptr; + size_t qy_buf_offset = 0; + + bool src1_uma = false; + + if (vk_device.uma) { + ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset); + src1_uma = d_Qy != nullptr; + } + + const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; const uint64_t d_ne = ne01 * ne11 * ne12; @@ -2441,18 +2507,11 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor * const uint64_t qy_sz = ggml_nbytes(src1); const uint64_t d_sz = sizeof(float) * d_ne; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer* d_D = &extra->buffer_gpu; const uint64_t d_buf_offset = extra->offset; GGML_ASSERT(d_D != nullptr); - vk_buffer* d_Qx; + vk_buffer* d_Qx = &extra_src0->buffer_gpu; const uint64_t qx_buf_offset = extra_src0->offset; - vk_buffer* d_Qy; - uint64_t qy_buf_offset = 0; - d_Qx = &extra_src0->buffer_gpu; GGML_ASSERT(d_Qx != nullptr); if (load_y) { d_Qy = &vk_prealloc_qy; @@ -2709,7 +2768,8 @@ static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) { } #ifdef GGML_VULKAN_CHECK_RESULTS -void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name); +static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name); +static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor); #endif template @@ -2758,17 +2818,34 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm return; } - const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU; - const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; + + vk_buffer * d_X = nullptr; + size_t x_buf_offset = 0; + vk_buffer * d_Y = nullptr; + size_t y_buf_offset = 0; + + bool src0_uma = false; + bool src1_uma = false; + + if (vk_device.uma) { + ggml_vk_host_get(src0->data, d_X, x_buf_offset); + src0_uma = d_X != nullptr; + if (use_src1) { + ggml_vk_host_get(src1->data, d_Y, y_buf_offset); + src1_uma = d_Y != nullptr; + } + } + + const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma; + const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma; uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, vk_device.properties.limits.minStorageBufferOffsetAlignment); uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, vk_device.properties.limits.minStorageBufferOffsetAlignment) : 0; uint64_t d_sz = ggml_type_size(dst->type) * ne0; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - // Workaround for tiny tensor inputs on ROPE if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > extra_src1->buffer_gpu.size) { y_sz = VK_WHOLE_SIZE; @@ -2778,20 +2855,16 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm GGML_ASSERT(d_D != nullptr); uint64_t d_buf_offset = (extra->offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT - vk_buffer* d_X = nullptr; - uint64_t x_buf_offset = 0; - vk_buffer* d_Y = nullptr; - uint64_t y_buf_offset = 0; if (transfer_src0) { d_X = &vk_prealloc_qx; - } else { + } else if(!src0_uma) { d_X = &extra_src0->buffer_gpu; x_buf_offset = extra_src0->offset; GGML_ASSERT(d_X != nullptr); } if (transfer_src1) { d_Y = &vk_prealloc_qy; - } else if (use_src1) { + } else if (use_src1 && !src1_uma) { d_Y = &extra_src1->buffer_gpu; y_buf_offset = extra_src1->offset; GGML_ASSERT(d_Y != nullptr); @@ -3148,13 +3221,13 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size if (vk_prealloc_split_k.size > 0) { ggml_vk_destroy_buffer(vk_prealloc_split_k); } - vk_prealloc_split_k = ggml_vk_create_buffer(sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_prealloc_split_k = ggml_vk_create_buffer_check(sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal); } } - vk_buffer d_X = ggml_vk_create_buffer(sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); - vk_buffer d_Y = ggml_vk_create_buffer(sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); - vk_buffer d_D = ggml_vk_create_buffer(sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer d_X = ggml_vk_create_buffer_check(sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer d_Y = ggml_vk_create_buffer_check(sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer d_D = ggml_vk_create_buffer_check(sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne); Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne); @@ -3315,6 +3388,10 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1 if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) { return; } + i0 = std::max(i0, 5); + i1 = std::max(i1, 5); + i2 = std::max(i2, 0); + i3 = std::max(i3, 0); fprintf(stderr, " "); for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { fprintf(stderr, "%7d ", idx1); @@ -3376,7 +3453,7 @@ static void ggml_vk_test_h2d_nc(size_t ne0, size_t ne1, size_t ne2, size_t ne3) vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue); ggml_vk_ctx_begin(ctx); - vk_buffer buffer = ggml_vk_create_buffer(ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer buffer = ggml_vk_create_buffer_check(ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal); ggml_vk_h2d_tensor_2d(ctx, &buffer, 0, tensor, 0, 0, ggml_nrows(tensor)); @@ -3439,7 +3516,7 @@ static void ggml_vk_test_transfer(size_t ne, bool pinned) { std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl; #endif // Check transfers are correct - vk_buffer buffer = ggml_vk_create_buffer(sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer buffer = ggml_vk_create_buffer_check(sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal); float * x; float * y; @@ -3666,7 +3743,7 @@ void ggml_vk_preallocate_buffers() { std::cerr << "qx_size: " << vk_prealloc_size_qx << " qy_size: " << vk_prealloc_size_qy << " x_size: " << vk_prealloc_size_x << " y_size: " << vk_prealloc_size_y << " split_k_size: " << vk_prealloc_size_split_k << std::endl; #endif #if defined(VK_RUN_TESTS) - vk_staging = ggml_vk_create_buffer(100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + vk_staging = ggml_vk_create_buffer_check(100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); ggml_vk_test_transfer(8192 * 1000, false); ggml_vk_test_transfer(8192 * 1000, true); @@ -3712,42 +3789,42 @@ void ggml_vk_preallocate_buffers() { if (vk_prealloc_qx.size > 0) { ggml_vk_destroy_buffer(vk_prealloc_qx); } - vk_prealloc_qx = ggml_vk_create_buffer(vk_prealloc_size_qx, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_prealloc_qx = ggml_vk_create_buffer_device(vk_prealloc_size_qx); } if (vk_prealloc_size_qy > 0 && vk_prealloc_qy.size < vk_prealloc_size_qy) { // Resize buffer if (vk_prealloc_qy.size > 0) { ggml_vk_destroy_buffer(vk_prealloc_qy); } - vk_prealloc_qy = ggml_vk_create_buffer(vk_prealloc_size_qy, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_prealloc_qy = ggml_vk_create_buffer_device(vk_prealloc_size_qy); } if (vk_prealloc_size_x > 0 && vk_prealloc_x.size < vk_prealloc_size_x) { // Resize buffer if (vk_prealloc_x.size > 0) { ggml_vk_destroy_buffer(vk_prealloc_x); } - vk_prealloc_x = ggml_vk_create_buffer(vk_prealloc_size_x, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_prealloc_x = ggml_vk_create_buffer_device(vk_prealloc_size_x); } if (vk_prealloc_size_y > 0 && vk_prealloc_y.size < vk_prealloc_size_y) { // Resize buffer if (vk_prealloc_y.size > 0) { ggml_vk_destroy_buffer(vk_prealloc_y); } - vk_prealloc_y = ggml_vk_create_buffer(vk_prealloc_size_y, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_prealloc_y = ggml_vk_create_buffer_device(vk_prealloc_size_y); } if (vk_prealloc_size_split_k > 0 && vk_prealloc_split_k.size < vk_prealloc_size_split_k) { // Resize buffer if (vk_prealloc_split_k.size > 0) { ggml_vk_destroy_buffer(vk_prealloc_split_k); } - vk_prealloc_split_k = ggml_vk_create_buffer(vk_prealloc_size_split_k, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_prealloc_split_k = ggml_vk_create_buffer_device(vk_prealloc_size_split_k); } if (vk_staging_size > 0 && vk_staging.size < vk_staging_size) { // Resize buffer if (vk_staging.size > 0) { ggml_vk_destroy_buffer(vk_staging); } - vk_staging = ggml_vk_create_buffer(vk_staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + vk_staging = ggml_vk_create_buffer_check(vk_staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); } } @@ -4138,6 +4215,7 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; + ggml_vk_destroy_buffer(ctx->dev_buffer); delete ctx; } @@ -4163,14 +4241,6 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; } - if (extra->offset + ggml_nbytes(tensor) > extra->buffer_gpu.size) { - std::cerr << "ERROR: Trying to assign tensor " << tensor << " outside of buffer size " << ctx->dev_buffer.size << " requested offset: " << extra->offset << " tensor size: " << ggml_nbytes(tensor) << std::endl; - if (tensor->view_src != nullptr) { - std::cerr << "view_src: " << tensor->view_src << " extra: " << tensor->view_src->extra << std::endl; - } - GGML_ASSERT(false); - } - tensor->backend = GGML_BACKEND_GPU; tensor->extra = extra; } @@ -4248,7 +4318,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer( #ifdef VK_DEBUG std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl; #endif - vk_buffer dev_buffer = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer dev_buffer = ggml_vk_create_buffer_device(size); ggml_backend_vk_buffer_context * ctx = new ggml_backend_vk_buffer_context(dev_buffer); @@ -4326,9 +4396,12 @@ GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffe } GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - void * ptr = ggml_vk_host_malloc(size); - - if (ptr == nullptr) { + void * ptr = nullptr; + try { + ptr = ggml_vk_host_malloc(size); + } catch (vk::SystemError& e) { + std::cerr << "ggml_vulkan: Failed to allocate pinned memory." << std::endl; + std::cerr << "ggml_vulkan: " << e.what() << std::endl; // fallback to cpu buffer return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); } @@ -4389,7 +4462,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g #ifdef VK_DEBUG std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl; #endif - GGML_ASSERT(tensor->buffer->buft == ggml_backend_vk_buffer_type() && "unsupported buffer type"); + GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; @@ -4409,7 +4482,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c #ifdef VK_DEBUG std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl; #endif - GGML_ASSERT(tensor->buffer->buft == ggml_backend_vk_buffer_type() && "unsupported buffer type"); + GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; @@ -4429,7 +4502,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c #ifdef VK_DEBUG std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl; #endif - if (dst->buffer->buft == ggml_backend_vk_buffer_type() && ggml_backend_buffer_is_vk(src->buffer)) { + if ((dst->buffer->buft == ggml_backend_vk_buffer_type() || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) { ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; @@ -4499,7 +4572,6 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml bool ok = ggml_vk_compute_forward(¶ms, node); if (!ok) { - std::cerr << "Vulkan disable: " << vk_disable << std::endl; fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } #ifdef GGML_VULKAN_CHECK_RESULTS @@ -4665,7 +4737,7 @@ GGML_CALL int ggml_backend_vk_reg_devices() { // checks #ifdef GGML_VULKAN_CHECK_RESULTS -void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector& done, int level = 0) { +static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector& done, int level = 0) { if (std::find(done.begin(), done.end(), tensor) != done.end() || level > 10) { return; } @@ -4683,10 +4755,14 @@ void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vectortype != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) { return; } + i0 = std::max(i0, 5); + i1 = std::max(i1, 5); + i2 = std::max(i2, 0); + i3 = std::max(i3, 0); fprintf(stderr, " "); for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) { fprintf(stderr, "%7d ", idx1); @@ -4698,9 +4774,9 @@ void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, in if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) { float val; if (tensor->type == GGML_TYPE_F32) { - val = *(float *) ((char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); + val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); } else if (tensor->type == GGML_TYPE_F16) { - val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); + val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); } fprintf(stderr, "% 7.2f ", val); } else { @@ -4711,14 +4787,16 @@ void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, in } } -void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) { +static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) { void * tensor_data = tensor->data; if (tensor->backend == GGML_BACKEND_GPU) { const size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); - ggml_vk_buffer_read((vk_buffer *)tensor->data, 0, tensor_data, tensor_size, vk_device.transfer_queue); + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + + ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size); } std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl; @@ -4730,10 +4808,10 @@ void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) { std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " backend=" << tensor->src[1]->backend << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl; } std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(tensor, tensor->data, 5, 5, 0, 0); + ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0); std::cerr << std::endl; std::cerr << std::endl << "Result:" << std::endl; - ggml_vk_print_tensor_area(tensor, tensor->data, 5, 5, 1, 0); + ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0); std::cerr << std::endl; std::vector done; ggml_vk_print_graph_origin(tensor, done); @@ -4743,7 +4821,7 @@ void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) { } } -void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) { +static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) { return; GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU); if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) { @@ -4779,7 +4857,7 @@ void * comp_result; size_t comp_size; size_t comp_nb[GGML_MAX_DIMS]; size_t check_counter = 0; -void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) { +static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) { if (params->ith != 0) { return; } @@ -4796,8 +4874,9 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) ggml_tensor * src1 = tensor->src[1]; struct ggml_init_params iparams = { - .mem_size = 1024*1024*1024, - .mem_buffer = NULL, + /*.mem_size =*/ 1024*1024*1024, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, }; struct ggml_context * ctx = ggml_init(iparams); @@ -4829,7 +4908,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) for (int i3 = 0; i3 < src0->ne[3]; i3++) { for (int i2 = 0; i2 < src0->ne[2]; i2++) { const int idx = i3*src0->ne[2] + i2; - ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1], vk_device.transfer_queue); + ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]); } } @@ -4842,7 +4921,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) if (offset + src0_size >= extra->buffer_gpu.size) { src0_size = extra->buffer_gpu.size - offset; } - ggml_vk_buffer_read(&extra->buffer_gpu, offset, src0_clone->data, src0_size, vk_device.transfer_queue); + ggml_vk_buffer_read(&extra->buffer_gpu, offset, src0_clone->data, src0_size); memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { @@ -4872,7 +4951,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) for (int i3 = 0; i3 < src1->ne[3]; i3++) { for (int i2 = 0; i2 < src1->ne[2]; i2++) { const int idx = i3*src1->ne[2] + i2; - ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1], vk_device.transfer_queue); + ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]); } } @@ -4885,7 +4964,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) if (offset + src1_size >= extra->buffer_gpu.size) { src1_size = extra->buffer_gpu.size - offset; } - ggml_vk_buffer_read(&extra->buffer_gpu, offset, src1_clone->data, src1_size, vk_device.transfer_queue); + ggml_vk_buffer_read(&extra->buffer_gpu, offset, src1_clone->data, src1_size); memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { @@ -4969,7 +5048,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { if (src1 == nullptr) { tensor_clone = ggml_dup(ctx, src0_clone); - tensor_clone->type == tensor->type; + tensor_clone->type = tensor->type; } else { tensor_clone = ggml_cpy(ctx, src0_clone, src1_clone); } @@ -5046,7 +5125,7 @@ void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor) tensor_size = extra->buffer_gpu.size - (extra->offset); } - ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size, vk_device.transfer_queue); + ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size); } float first_error_result = -1.0f; diff --git a/ggml.c b/ggml.c index 5b37487f7..b2c8baaa8 100644 --- a/ggml.c +++ b/ggml.c @@ -218,6 +218,7 @@ inline static void * ggml_aligned_malloc(size_t size) { break; } GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); + GGML_ASSERT(false); return NULL; } return aligned_memory; @@ -230,6 +231,38 @@ inline static void * ggml_aligned_malloc(size_t size) { #endif #endif +inline static void * ggml_malloc(size_t size) { + if (size == 0) { + GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n"); + return NULL; + } + void * result = malloc(size); + if (result == NULL) { + GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); + GGML_ASSERT(false); + } + return result; +} + +// calloc +inline static void * ggml_calloc(size_t num, size_t size) { + if (num == 0 || size == 0) { + GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n"); + return NULL; + } + void * result = calloc(num, size); + if (result == NULL) { + GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); + GGML_ASSERT(false); + } + return result; +} + +#define GGML_MALLOC(size) ggml_malloc(size) +#define GGML_CALLOC(num, size) ggml_calloc(num, size) + +#define GGML_FREE(ptr) free(ptr) + #define UNUSED GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) @@ -599,6 +632,17 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq2_xs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, + [GGML_TYPE_IQ3_XXS] = { + .type_name = "iq3_xxs", + .blck_size = QK_K, + .type_size = sizeof(block_iq3_xxs), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs, + .from_float = quantize_row_iq3_xxs, + .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference, + .vec_dot = ggml_vec_dot_iq3_xxs_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, [GGML_TYPE_Q8_K] = { .type_name = "q8_K", .blck_size = QK_K, @@ -2144,6 +2188,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break; case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break; case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break; + case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -7537,6 +7582,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: { ggml_compute_forward_add_q_f32(params, src0, src1, dst); } break; @@ -7803,6 +7849,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: { ggml_compute_forward_add1_q_f32(params, src0, src1, dst); } break; @@ -7922,6 +7969,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: default: { GGML_ASSERT(false); @@ -10673,6 +10721,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: { ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); } break; @@ -10852,6 +10901,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: default: { GGML_ASSERT(false); @@ -11048,6 +11098,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: { ggml_compute_forward_get_rows_q(params, src0, src1, dst); } break; @@ -11695,6 +11746,7 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -11771,6 +11823,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -15129,13 +15182,13 @@ struct ggml_hash_set ggml_hash_set_new(size_t size) { size = ggml_hash_size(size); struct ggml_hash_set result; result.size = size; - result.keys = malloc(sizeof(struct ggml_tensor *) * size); + result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size); memset(result.keys, 0, sizeof(struct ggml_tensor *) * size); return result; } static void ggml_hash_set_free(struct ggml_hash_set hash_set) { - free(hash_set.keys); + GGML_FREE(hash_set.keys); } struct hash_map { @@ -15144,17 +15197,17 @@ struct hash_map { }; static struct hash_map * ggml_new_hash_map(size_t size) { - struct hash_map * result = malloc(sizeof(struct hash_map)); + struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map)); result->set = ggml_hash_set_new(size); - result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size); + result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size); memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size); return result; } static void ggml_hash_map_free(struct hash_map * map) { ggml_hash_set_free(map->set); - free(map->vals); - free(map); + GGML_FREE(map->vals); + GGML_FREE(map); } // gradient checkpointing @@ -18827,6 +18880,7 @@ void ggml_quantize_init(enum ggml_type type) { switch (type) { case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break; case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break; + case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; default: // nothing break; } @@ -19089,6 +19143,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); GGML_ASSERT(result == row_size * nrows); } break; + case GGML_TYPE_IQ3_XXS: + { + GGML_ASSERT(start % QK_K == 0); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); + } break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); @@ -19215,6 +19278,25 @@ struct gguf_context { void * data; }; +static size_t gguf_type_size(enum gguf_type type) { + GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT); + return GGUF_TYPE_SIZE[type]; +} + +static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) { + GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS); + GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT); + + for (uint32_t i = 0; i < info->n_dims; ++i) { + GGML_ASSERT(info->ne[i] > 0); + } + + // prevent overflow for total number of elements + GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]); + GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]); + GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]); +} + static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) { const size_t n = fread(dst, 1, size, file); *offset += n; @@ -19227,8 +19309,17 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { bool ok = true; - ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); - ok = ok && gguf_fread_el(file, p->data, p->n, offset); + ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); + + // early exit if string length is invalid, prevents from integer overflow + if (p->n == SIZE_MAX) { + fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n); + return false; + } + + p->data = GGML_CALLOC(p->n + 1, 1); + + ok = ok && gguf_fread_el(file, p->data, p->n, offset); return ok; } @@ -19300,6 +19391,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return NULL; } + // sanity-checks to prevent from integer/buffer overflows + + ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info)); + ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead()); + ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv)); + if (!ok) { fprintf(stderr, "%s: failed to read header\n", __func__); fclose(file); @@ -19310,7 +19407,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the kv pairs { - ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); + ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv)); for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { struct gguf_kv * kv = &ctx->kv[i]; @@ -19338,7 +19435,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p case GGUF_TYPE_ARRAY: { ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); - ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); + ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); switch (kv->value.arr.type) { case GGUF_TYPE_UINT8: @@ -19353,21 +19450,39 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p case GGUF_TYPE_FLOAT64: case GGUF_TYPE_BOOL: { - kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); - ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset); + // prevent from integer overflow in the malloc below + if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) { + fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n); + fclose(file); + gguf_free(ctx); + return NULL; + } + + kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type)); + + ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset); } break; case GGUF_TYPE_STRING: { - kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str)); + // prevent from integer overflow in the malloc below + if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) { + fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n); + fclose(file); + gguf_free(ctx); + return NULL; + } + + kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str)); + for (uint64_t j = 0; j < kv->value.arr.n; ++j) { ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); } } break; case GGUF_TYPE_ARRAY: - case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + default: GGML_ASSERT(false && "invalid type"); break; } } break; - case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); + default: GGML_ASSERT(false && "invalid type"); } if (!ok) { @@ -19385,7 +19500,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the tensor infos { - ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; @@ -19396,12 +19511,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ok = ok && gguf_fread_str(file, &info->name, &offset); ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); + + ok = ok && (info->n_dims <= GGML_MAX_DIMS); + for (uint32_t j = 0; j < info->n_dims; ++j) { ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); } + ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); + gguf_tensor_info_sanitize(info); + if (!ok) { fprintf(stderr, "%s: failed to read tensor info\n", __func__); fclose(file); @@ -19555,12 +19676,12 @@ void gguf_free(struct gguf_context * ctx) { struct gguf_kv * kv = &ctx->kv[i]; if (kv->key.data) { - free(kv->key.data); + GGML_FREE(kv->key.data); } if (kv->type == GGUF_TYPE_STRING) { if (kv->value.str.data) { - free(kv->value.str.data); + GGML_FREE(kv->value.str.data); } } @@ -19570,16 +19691,16 @@ void gguf_free(struct gguf_context * ctx) { for (uint64_t j = 0; j < kv->value.arr.n; ++j) { struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j]; if (str->data) { - free(str->data); + GGML_FREE(str->data); } } } - free(kv->value.arr.data); + GGML_FREE(kv->value.arr.data); } } } - free(ctx->kv); + GGML_FREE(ctx->kv); } if (ctx->infos) { @@ -19587,11 +19708,11 @@ void gguf_free(struct gguf_context * ctx) { struct gguf_tensor_info * info = &ctx->infos[i]; if (info->name.data) { - free(info->name.data); + GGML_FREE(info->name.data); } } - free(ctx->infos); + GGML_FREE(ctx->infos); } GGML_ALIGNED_FREE(ctx); @@ -19892,8 +20013,8 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty ctx->kv[idx].type = GGUF_TYPE_ARRAY; ctx->kv[idx].value.arr.type = type; ctx->kv[idx].value.arr.n = n; - ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]); - memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]); + ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type)); + memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type)); } void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) { @@ -19902,7 +20023,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** ctx->kv[idx].type = GGUF_TYPE_ARRAY; ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING; ctx->kv[idx].value.arr.n = n; - ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str)); + ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str)); for (int i = 0; i < n; i++) { struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i]; str->n = strlen(data[i]); @@ -19929,19 +20050,19 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { case GGUF_TYPE_ARRAY: { if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { - const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *)); + const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *)); for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; } gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); - free((void *)data); + GGML_FREE((void *)data); } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) { GGML_ASSERT(false && "nested arrays not supported"); } else { gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n); } } break; - case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + default: GGML_ASSERT(false && "invalid type"); break; } } } @@ -20017,7 +20138,7 @@ struct gguf_buf { static struct gguf_buf gguf_buf_init(size_t size) { struct gguf_buf buf = { - /*buf.data =*/ size == 0 ? NULL : malloc(size), + /*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size), /*buf.size =*/ size, /*buf.offset =*/ 0, }; @@ -20027,7 +20148,7 @@ static struct gguf_buf gguf_buf_init(size_t size) { static void gguf_buf_free(struct gguf_buf buf) { if (buf.data) { - free(buf.data); + GGML_FREE(buf.data); } } @@ -20108,7 +20229,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * case GGUF_TYPE_FLOAT64: case GGUF_TYPE_BOOL: { - gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); + gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type)); } break; case GGUF_TYPE_STRING: { @@ -20117,10 +20238,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * } } break; case GGUF_TYPE_ARRAY: - case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + default: GGML_ASSERT(false && "invalid type"); break; } } break; - case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); + default: GGML_ASSERT(false && "invalid type"); } } @@ -20352,6 +20473,14 @@ int ggml_cpu_has_vulkan(void) { #endif } +int ggml_cpu_has_kompute(void) { +#if defined(GGML_USE_KOMPUTE) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_sycl(void) { #if defined(GGML_USE_SYCL) return 1; @@ -20361,7 +20490,8 @@ int ggml_cpu_has_sycl(void) { } int ggml_cpu_has_gpublas(void) { - return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_sycl(); + return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || + ggml_cpu_has_sycl(); } int ggml_cpu_has_sse3(void) { diff --git a/ggml.h b/ggml.h index d697fd2bb..afc87b843 100644 --- a/ggml.h +++ b/ggml.h @@ -353,6 +353,7 @@ extern "C" { GGML_TYPE_Q8_K = 15, GGML_TYPE_IQ2_XXS = 16, GGML_TYPE_IQ2_XS = 17, + GGML_TYPE_IQ3_XXS = 18, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, @@ -389,6 +390,7 @@ extern "C" { GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors }; // available tensor operations: @@ -2264,6 +2266,7 @@ extern "C" { GGML_API int ggml_cpu_has_cublas (void); GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_vulkan (void); + GGML_API int ggml_cpu_has_kompute (void); GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_ssse3 (void); diff --git a/llama.cpp b/llama.cpp index 796aaa895..a490eeab2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2367,6 +2367,7 @@ struct llama_model_loader { case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; + case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -2715,6 +2716,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small"; + case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XSS - 3.0625 bpw"; default: return "unknown, may not work"; } @@ -6876,11 +6878,6 @@ static int llama_decode_internal( n_threads = std::min(4, n_threads); } - const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1; - if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) { - n_threads = 1; - } - #ifdef GGML_USE_MPI const int64_t n_layer = hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); @@ -9237,6 +9234,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (new_type != GGML_TYPE_Q8_0) { new_type = GGML_TYPE_Q6_K; } + } else if (name == "token_embd.weight") { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + new_type = GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + new_type = GGML_TYPE_Q4_K; + } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; @@ -9247,7 +9251,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K; ++qs.i_ffn_down; } - else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K; } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; @@ -9255,6 +9258,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 4) { + new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } @@ -9292,6 +9298,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; } + //else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + // if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K; + //} else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K @@ -9323,13 +9332,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert == 8) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { new_type = GGML_TYPE_Q5_K; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; } @@ -9372,7 +9382,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty bool convert_incompatible_tensor = false; if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || - new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) { + new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || + new_type == GGML_TYPE_IQ3_XXS) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { @@ -9386,6 +9397,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty switch (new_type) { case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break; case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break; case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; @@ -9427,6 +9439,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break; + case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } diff --git a/llama.h b/llama.h index 01b293e64..17d43d039 100644 --- a/llama.h +++ b/llama.h @@ -112,6 +112,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 34db9df98..7b6c17915 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -f2a9472b23cf27e672ed70a2a6eb078f7b060f18 +475cbad5c1c834e31e26a2283bc1413181644360 diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 775147d42..1d29070b6 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1890,6 +1890,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, + GGML_TYPE_IQ3_XXS, }; // unary ops @@ -1926,8 +1927,10 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3})); - for (ggml_type type : all_types) { - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1})); + for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { + for (ggml_type type_dst : all_types) { + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); + } } test_cases.emplace_back(new test_cont()); diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 31a78c632..43df8022d 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -17,7 +17,9 @@ constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; +constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f; constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f; +constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f; static const char* RESULT_STR[] = {"ok", "FAILED"}; @@ -135,18 +137,21 @@ int main(int argc, char * argv[]) { } const ggml_type ei = (ggml_type)i; + if (ei == GGML_TYPE_IQ2_XXS || ei == GGML_TYPE_IQ2_XS) { printf("Skip %s due to missing quantization functionality\n", ggml_type_name(ei)); continue; } printf("Testing %s\n", ggml_type_name((ggml_type) i)); + ggml_quantize_init(ei); if (qfns.from_float && qfns.to_float) { const float total_error = total_quantization_error(qfns, test_size, test_data.data()); const float max_quantization_error = - type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : - type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : MAX_QUANTIZATION_TOTAL_ERROR; + type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : + type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : + type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR; failed = !(total_error < max_quantization_error); num_failed += failed; if (failed || verbose) { @@ -161,7 +166,9 @@ int main(int argc, char * argv[]) { } const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data()); - failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR); + const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS || + type == GGML_TYPE_IQ3_XXS ? MAX_DOT_PRODUCT_ERROR_LOWBIT : MAX_DOT_PRODUCT_ERROR; + failed = !(vec_dot_error < max_allowed_error); num_failed += failed; if (failed || verbose) { printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error); diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp index 09d410b7f..8ec817344 100644 --- a/tests/test-quantize-perf.cpp +++ b/tests/test-quantize-perf.cpp @@ -278,6 +278,8 @@ int main(int argc, char * argv[]) { if (qfns.from_float && qfns.to_float) { printf("%s\n", ggml_type_name(type)); + ggml_quantize_init(type); + if (params.op_quantize_row_q_reference) { printf(" quantize_row_q_reference\n"); for (size_t size : params.test_sizes) {