From 39a41a53b067b159bb6633739296c51a1df3a3c4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 Jul 2024 20:44:32 +0300 Subject: [PATCH 1/7] py : switch to snake_case ggml-ci --- README.md | 8 +++--- ci/run.sh | 4 +-- convert_hf_to_gguf.py | 10 +++---- convert_hf_to_gguf_update.py | 26 +++++++++---------- docs/HOWTO-add-model.md | 2 +- ...egacy-llama.py => convert_legacy_llama.py} | 0 ...=> convert_finetune_checkpoint_to_gguf.py} | 0 ...ple.py => json_schema_pydantic_example.py} | 0 examples/llava/MobileVLM-README.md | 14 +++++----- examples/llava/README.md | 20 +++++++------- ...uf.py => convert_image_encoder_to_gguf.py} | 0 .../{llava-surgery.py => llava_surgery.py} | 0 ...lava-surgery-v2.py => llava_surgery_v2.py} | 0 ...=> pydantic_models_to_grammar_examples.py} | 0 ...egex-to-grammar.py => regex_to_grammar.py} | 0 examples/{server-embd.py => server_embd.py} | 0 ...py => convert_train_checkpoint_to_gguf.py} | 0 gguf-py/README.md | 2 +- ...nvert-endian.py => gguf_convert_endian.py} | 0 .../scripts/{gguf-dump.py => gguf_dump.py} | 0 ...f-new-metadata.py => gguf_new_metadata.py} | 0 ...f-set-metadata.py => gguf_set_metadata.py} | 0 scripts/check-requirements.sh | 10 +++---- scripts/convert-gg.sh | 26 ------------------- scripts/pod-llama.sh | 14 +++++----- 25 files changed, 55 insertions(+), 81 deletions(-) rename examples/{convert-legacy-llama.py => convert_legacy_llama.py} (100%) rename examples/finetune/{convert-finetune-checkpoint-to-gguf.py => convert_finetune_checkpoint_to_gguf.py} (100%) rename examples/{json-schema-pydantic-example.py => json_schema_pydantic_example.py} (100%) rename examples/llava/{convert-image-encoder-to-gguf.py => convert_image_encoder_to_gguf.py} (100%) rename examples/llava/{llava-surgery.py => llava_surgery.py} (100%) rename examples/llava/{llava-surgery-v2.py => llava_surgery_v2.py} (100%) rename examples/{pydantic-models-to-grammar-examples.py => pydantic_models_to_grammar_examples.py} (100%) rename examples/{regex-to-grammar.py => regex_to_grammar.py} (100%) rename examples/{server-embd.py => server_embd.py} (100%) rename examples/train-text-from-scratch/{convert-train-checkpoint-to-gguf.py => convert_train_checkpoint_to_gguf.py} (100%) rename gguf-py/scripts/{gguf-convert-endian.py => gguf_convert_endian.py} (100%) rename gguf-py/scripts/{gguf-dump.py => gguf_dump.py} (100%) rename gguf-py/scripts/{gguf-new-metadata.py => gguf_new_metadata.py} (100%) rename gguf-py/scripts/{gguf-set-metadata.py => gguf_set_metadata.py} (100%) delete mode 100755 scripts/convert-gg.sh diff --git a/README.md b/README.md index 3569b2bbb..0cc1d10f4 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Hot topics -- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430 +- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430 - Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021 - BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387 @@ -636,8 +636,8 @@ Building the program with BLAS support may lead to some performance improvements To 
obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. -Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives. -It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face. +Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives. +It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face. ```bash # obtain the official LLaMA model weights and place them in ./models @@ -654,7 +654,7 @@ ls ./models python3 -m pip install -r requirements.txt # convert the model to ggml FP16 format -python3 convert-hf-to-gguf.py models/mymodel/ +python3 convert_hf_to_gguf.py models/mymodel/ # quantize the model to 4-bits (using Q4_K_M method) ./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M diff --git a/ci/run.sh b/ci/run.sh index 067ac405b..f03fd72ca 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -421,7 +421,7 @@ function gg_run_pythia_1_4b { (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf + python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf model_f16="${path_models}/ggml-model-f16.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf" @@ -553,7 +553,7 @@ function gg_run_pythia_2_8b { (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf + python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf model_f16="${path_models}/ggml-model-f16.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf" diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index bdc336644..7261c1736 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -404,7 +404,7 @@ class Model: return tokens, toktypes, tokpre - # NOTE: this function is generated by convert-hf-to-gguf-update.py + # NOTE: this function is generated by convert_hf_to_gguf_update.py # do not modify it manually! # ref: https://github.com/ggerganov/llama.cpp/pull/6920 # Marker: Start get_vocab_base_pre @@ -424,7 +424,7 @@ class Model: res = None - # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script + # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script # or pull the latest version of the model from Huggingface # don't edit the hashes manually! 
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": @@ -499,9 +499,9 @@ class Model: logger.warning("**************************************************************************************") logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") logger.warning("** There are 2 possible reasons for this:") - logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") + logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") - logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {chkhsh}") @@ -1161,7 +1161,7 @@ class FalconModel(Model): # So we rearrange them here,, so that we have n_head query weights # followed by n_head_kv key weights followed by n_head_kv value weights, # in contiguous fashion. - # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert_hf_to_gguf.py if "query_key_value" in name: n_head = self.find_hparam(["num_attention_heads", "n_head"]) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 21a306255..344d034fc 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # This script downloads the tokenizer models of the specified models from Huggingface and -# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py +# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py # # This is necessary in order to analyze the type of pre-tokenizer used by the model and # provide the necessary information to llama.cpp via the GGUF header in order to implement @@ -15,9 +15,9 @@ # - Add a new model to the "models" list # - Run the script with your huggingface token: # -# python3 convert-hf-to-gguf-update.py +# python3 convert_hf_to_gguf-update.py # -# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py +# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py # - Update llama.cpp with the new pre-tokenizer if necessary # # TODO: generate tokenizer tests for llama.cpp @@ -37,7 +37,7 @@ from enum import IntEnum, auto from transformers import AutoTokenizer logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger("convert-hf-to-gguf-update") +logger = logging.getLogger("convert_hf_to_gguf-update") sess = requests.Session() @@ -56,10 +56,10 @@ if len(sys.argv) == 2: token = sys.argv[1] if not token.startswith("hf_"): logger.info("Huggingface token seems invalid") - logger.info("Usage: python convert-hf-to-gguf-update.py ") + logger.info("Usage: python convert_hf_to_gguf-update.py ") sys.exit(1) else: - logger.info("Usage: python convert-hf-to-gguf-update.py ") + logger.info("Usage: python convert_hf_to_gguf-update.py ") sys.exit(1) # TODO: add models here, base models preferred @@ -134,7 +134,7 @@ for model in models: logger.error(f"Failed to download model {model['name']}. 
Error: {e}") -# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function: +# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function: src_ifs = "" for model in models: @@ -201,7 +201,7 @@ src_func = f""" res = None - # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script + # NOTE: if you get an error here, you need to update the convert_hf_to_gguf-update.py script # or pull the latest version of the model from Huggingface # don't edit the hashes manually! {src_ifs} @@ -210,9 +210,9 @@ src_func = f""" logger.warning("**************************************************************************************") logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") logger.warning("** There are 2 possible reasons for this:") - logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") + logger.warning("** - the model has not been added to convert_hf_to_gguf_update.py yet") logger.warning("** - the pre-tokenization config has changed upstream") - logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + logger.warning("** Check your model files and convert_hf_to_gguf_update.py and update them accordingly.") logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") logger.warning("**") logger.warning(f"** chkhsh: {{chkhsh}}") @@ -226,7 +226,7 @@ src_func = f""" return res """ -convert_py_pth = pathlib.Path("convert-hf-to-gguf.py") +convert_py_pth = pathlib.Path("convert_hf_to_gguf.py") convert_py = convert_py_pth.read_text(encoding="utf-8") convert_py = re.sub( r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)", @@ -237,7 +237,7 @@ convert_py = re.sub( convert_py_pth.write_text(convert_py, encoding="utf-8") -logger.info("+++ convert-hf-to-gguf.py was updated") +logger.info("+++ convert_hf_to_gguf.py was updated") # generate tests for each tokenizer model @@ -343,6 +343,6 @@ logger.info("\nRun the following commands to generate the vocab files for testin for model in models: name = model["name"] - print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100 + print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100 logger.info("\n") diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md index 3eec077ea..a2a816804 100644 --- a/docs/HOWTO-add-model.md +++ b/docs/HOWTO-add-model.md @@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M ### 1. Convert the model to GGUF This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library. -Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format). +Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format). The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors. 
diff --git a/examples/convert-legacy-llama.py b/examples/convert_legacy_llama.py similarity index 100% rename from examples/convert-legacy-llama.py rename to examples/convert_legacy_llama.py diff --git a/examples/finetune/convert-finetune-checkpoint-to-gguf.py b/examples/finetune/convert_finetune_checkpoint_to_gguf.py similarity index 100% rename from examples/finetune/convert-finetune-checkpoint-to-gguf.py rename to examples/finetune/convert_finetune_checkpoint_to_gguf.py diff --git a/examples/json-schema-pydantic-example.py b/examples/json_schema_pydantic_example.py similarity index 100% rename from examples/json-schema-pydantic-example.py rename to examples/json_schema_pydantic_example.py diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index f6c619c87..06a65fba4 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -30,16 +30,16 @@ git clone https://huggingface.co/mtgv/MobileVLM-1.7B git clone https://huggingface.co/openai/clip-vit-large-patch14-336 ``` -2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: +2. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: ```sh -python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B +python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B ``` -3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: +3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: ```sh -python ./examples/llava/convert-image-encoder-to-gguf \ +python ./examples/llava/convert_image_encoder_to_gguf \ -m path/to/clip-vit-large-patch14-336 \ --llava-projector path/to/MobileVLM-1.7B/llava.projector \ --output-dir path/to/MobileVLM-1.7B \ @@ -47,17 +47,17 @@ python ./examples/llava/convert-image-encoder-to-gguf \ ``` ```sh -python ./examples/llava/convert-image-encoder-to-gguf \ +python ./examples/llava/convert_image_encoder_to_gguf \ -m path/to/clip-vit-large-patch14-336 \ --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \ --output-dir path/to/MobileVLM-1.7B_V2 \ --projector-type ldpv2 ``` -4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF: +4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: ```sh -python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B +python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B ``` 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` diff --git a/examples/llava/README.md b/examples/llava/README.md index f4554de67..012451361 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -38,22 +38,22 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336 pip install -r examples/llava/requirements.txt ``` -3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: +3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: ```sh -python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b +python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b ``` -4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF: +4. 
Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF: ```sh -python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b +python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b ``` -5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF: +5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: ```sh -python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown +python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown ``` Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory. @@ -70,9 +70,9 @@ git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b pip install -r examples/llava/requirements.txt ``` -3) Use `llava-surgery-v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: +3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: ```console -python examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b/ +python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/ ``` - you will find a llava.projector and a llava.clip file in your model directory @@ -86,13 +86,13 @@ curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.jso 5) Create the visual gguf model: ```console -python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision +python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision ``` - This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP 6) Then convert the model to gguf format: ```console -python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown +python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown ``` 7) And finally we can run the llava cli using the 1.6 model version: diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert_image_encoder_to_gguf.py similarity index 100% rename from examples/llava/convert-image-encoder-to-gguf.py rename to examples/llava/convert_image_encoder_to_gguf.py diff --git a/examples/llava/llava-surgery.py b/examples/llava/llava_surgery.py similarity index 100% rename from examples/llava/llava-surgery.py rename to examples/llava/llava_surgery.py diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava_surgery_v2.py similarity index 100% rename from examples/llava/llava-surgery-v2.py rename to examples/llava/llava_surgery_v2.py diff --git a/examples/pydantic-models-to-grammar-examples.py b/examples/pydantic_models_to_grammar_examples.py similarity index 100% rename from examples/pydantic-models-to-grammar-examples.py rename to examples/pydantic_models_to_grammar_examples.py diff --git a/examples/regex-to-grammar.py b/examples/regex_to_grammar.py similarity index 100% rename from examples/regex-to-grammar.py rename to examples/regex_to_grammar.py diff --git a/examples/server-embd.py b/examples/server_embd.py similarity index 100% rename from examples/server-embd.py rename to examples/server_embd.py diff --git 
a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py similarity index 100% rename from examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py rename to examples/train-text-from-scratch/convert_train_checkpoint_to_gguf.py diff --git a/gguf-py/README.md b/gguf-py/README.md index a04c22759..2d8d03eee 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -3,7 +3,7 @@ This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302) (GGML Universal File) format. -See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) +See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py) as an example for its usage. ## Installation diff --git a/gguf-py/scripts/gguf-convert-endian.py b/gguf-py/scripts/gguf_convert_endian.py similarity index 100% rename from gguf-py/scripts/gguf-convert-endian.py rename to gguf-py/scripts/gguf_convert_endian.py diff --git a/gguf-py/scripts/gguf-dump.py b/gguf-py/scripts/gguf_dump.py similarity index 100% rename from gguf-py/scripts/gguf-dump.py rename to gguf-py/scripts/gguf_dump.py diff --git a/gguf-py/scripts/gguf-new-metadata.py b/gguf-py/scripts/gguf_new_metadata.py similarity index 100% rename from gguf-py/scripts/gguf-new-metadata.py rename to gguf-py/scripts/gguf_new_metadata.py diff --git a/gguf-py/scripts/gguf-set-metadata.py b/gguf-py/scripts/gguf_set_metadata.py similarity index 100% rename from gguf-py/scripts/gguf-set-metadata.py rename to gguf-py/scripts/gguf_set_metadata.py diff --git a/scripts/check-requirements.sh b/scripts/check-requirements.sh index 69a08c841..27f264999 100755 --- a/scripts/check-requirements.sh +++ b/scripts/check-requirements.sh @@ -97,9 +97,9 @@ check_requirements() { } check_convert_script() { - local py=$1 # e.g. ./convert-hf-to-gguf.py - local pyname=${py##*/} # e.g. convert-hf-to-gguf.py - pyname=${pyname%.py} # e.g. convert-hf-to-gguf + local py=$1 # e.g. convert_hf_to_gguf.py + local pyname=${py##*/} # e.g. convert_hf_to_gguf.py + pyname=${pyname%.py} # e.g. 
convert_hf_to_gguf info "$py: beginning check" @@ -166,9 +166,9 @@ if (( do_cleanup )); then rm -rf -- "$all_venv" fi -check_convert_script examples/convert-legacy-llama.py +check_convert_script examples/convert_legacy_llama.py for py in convert_*.py; do - # skip convert-hf-to-gguf-update.py + # skip convert_hf_to_gguf_update.py # TODO: the check is failing for some reason: # https://github.com/ggerganov/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920 [[ $py == convert_hf_to_gguf_update.py ]] && continue diff --git a/scripts/convert-gg.sh b/scripts/convert-gg.sh deleted file mode 100755 index 8a0168432..000000000 --- a/scripts/convert-gg.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -set -e - -# LLaMA v1 -python3 examples/convert-legacy-llama.py ../llama1/7B --outfile models/llama-7b/ggml-model-f16.gguf --outtype f16 -python3 examples/convert-legacy-llama.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16 -python3 examples/convert-legacy-llama.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16 -python3 examples/convert-legacy-llama.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16 - -# LLaMA v2 -python3 examples/convert-legacy-llama.py ../llama2/llama-2-7b --outfile models/llama-7b-v2/ggml-model-f16.gguf --outtype f16 -python3 examples/convert-legacy-llama.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16 -python3 examples/convert-legacy-llama.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16 - -# Code Llama -python3 examples/convert-legacy-llama.py ../codellama/CodeLlama-7b/ --outfile models/codellama-7b/ggml-model-f16.gguf --outtype f16 -python3 examples/convert-legacy-llama.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16 -python3 examples/convert-legacy-llama.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16 - -# Falcon -python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b 1 -mv -v ../falcon/falcon-7b/ggml-model-f16.gguf models/falcon-7b/ggml-model-f16.gguf - -python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-40b 1 -mv -v ../falcon/falcon-40b/ggml-model-f16.gguf models/falcon-40b/ggml-model-f16.gguf diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh index 0d6d4032d..6e56e1ed0 100644 --- a/scripts/pod-llama.sh +++ b/scripts/pod-llama.sh @@ -75,7 +75,7 @@ if [ "$1" -eq "1" ]; then cd /workspace/llama.cpp - python3 examples/convert-legacy-llama.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16 + python3 examples/convert_legacy_llama.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16 ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0 ./llama-quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k @@ -90,7 +90,7 @@ if [ "$1" -eq "2" ]; then cd /workspace/llama.cpp - python3 examples/convert-legacy-llama.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16 + python3 examples/convert_legacy_llama.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16 ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0 ./llama-quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k @@ 
-105,7 +105,7 @@ if [ "$1" -eq "3" ]; then cd /workspace/llama.cpp - python3 examples/convert-legacy-llama.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16 + python3 examples/convert_legacy_llama.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16 ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0 ./llama-quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k @@ -120,7 +120,7 @@ if [ "$1" -eq "4" ]; then cd /workspace/llama.cpp - python3 examples/convert-legacy-llama.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16 + python3 examples/convert_legacy_llama.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16 ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0 ./llama-quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k @@ -135,7 +135,7 @@ if [ "$1" -eq "5" ]; then cd /workspace/llama.cpp - python3 examples/convert-legacy-llama.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16 + python3 examples/convert_legacy_llama.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16 ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0 ./llama-quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k @@ -150,7 +150,7 @@ if [ "$1" -eq "6" ]; then cd /workspace/llama.cpp - python3 examples/convert-legacy-llama.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16 + python3 examples/convert_legacy_llama.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16 ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0 ./llama-quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k @@ -165,7 +165,7 @@ if [ "$1" -eq "7" ]; then cd /workspace/llama.cpp - python3 examples/convert-legacy-llama.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16 + python3 examples/convert_legacy_llama.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16 ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0 ./llama-quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k From d8f2da6b9fcd5a60b83ac4f0b626a72d969ab053 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 Jul 2024 20:47:03 +0300 Subject: [PATCH 2/7] cont ggml-ci --- gguf-py/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gguf-py/README.md b/gguf-py/README.md index 2d8d03eee..bc46d6e1d 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -3,7 +3,7 @@ This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302) (GGML Universal File) format. 
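For orientation while reading this README hunk, the write path the package exposes follows the pattern of `examples/writer.py` referenced further down; a minimal sketch (file name, architecture string, and tensor contents are placeholders, and the exact `GGUFWriter` method names may vary between gguf versions):

```python
import numpy as np
import gguf

# Write a tiny GGUF file: one piece of model metadata plus one tensor.
writer = gguf.GGUFWriter("example.gguf", "llama")
writer.add_block_count(1)  # stored as the llama.block_count metadata key
writer.add_tensor("tensor0", np.ones((16, 16), dtype=np.float32))

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```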
-See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py) +See [convert_hf_to_gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py) as an example for its usage. ## Installation @@ -15,13 +15,13 @@ pip install gguf [examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model. -[scripts/gguf-dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-dump.py) — Dumps a GGUF file's metadata to the console. +[scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console. -[scripts/gguf-set-metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-set-metadata.py) — Allows changing simple metadata values in a GGUF file by key. +[scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key. -[scripts/gguf-convert-endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-convert-endian.py) — Allows converting the endianness of GGUF files. +[scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files. -[scripts/gguf-new-metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-new-metadata.py) — Copies a GGUF file with added/modified/removed metadata values. +[scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values. ## Development Maintainers who participate in development of this package are advised to install it in editable mode: From c172b322c2e915c3bbc8f49fce52d9bc23640565 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 Jul 2024 22:28:19 +0300 Subject: [PATCH 3/7] cont ggml-ci --- ci/run.sh | 2 +- convert_hf_to_gguf.py | 2 +- convert_hf_to_gguf_update.py | 10 +++++----- docs/HOWTO-add-model.md | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index f03fd72ca..9703b77ce 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -287,7 +287,7 @@ function gg_run_open_llama_7b_v2 { (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf + python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf model_f16="${path_models}/ggml-model-f16.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf" diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7261c1736..d1422c413 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1161,7 +1161,7 @@ class FalconModel(Model): # So we rearrange them here,, so that we have n_head query weights # followed by n_head_kv key weights followed by n_head_kv value weights, # in contiguous fashion. 
- # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert_hf_to_gguf.py + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-gguf.py if "query_key_value" in name: n_head = self.find_hparam(["num_attention_heads", "n_head"]) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 344d034fc..e4165ae2d 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -15,7 +15,7 @@ # - Add a new model to the "models" list # - Run the script with your huggingface token: # -# python3 convert_hf_to_gguf-update.py +# python3 convert_hf_to_gguf_update.py # # - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py # - Update llama.cpp with the new pre-tokenizer if necessary @@ -37,7 +37,7 @@ from enum import IntEnum, auto from transformers import AutoTokenizer logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger("convert_hf_to_gguf-update") +logger = logging.getLogger("convert_hf_to_gguf_update") sess = requests.Session() @@ -56,10 +56,10 @@ if len(sys.argv) == 2: token = sys.argv[1] if not token.startswith("hf_"): logger.info("Huggingface token seems invalid") - logger.info("Usage: python convert_hf_to_gguf-update.py ") + logger.info("Usage: python convert_hf_to_gguf_update.py ") sys.exit(1) else: - logger.info("Usage: python convert_hf_to_gguf-update.py ") + logger.info("Usage: python convert_hf_to_gguf_update.py ") sys.exit(1) # TODO: add models here, base models preferred @@ -201,7 +201,7 @@ src_func = f""" res = None - # NOTE: if you get an error here, you need to update the convert_hf_to_gguf-update.py script + # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script # or pull the latest version of the model from Huggingface # don't edit the hashes manually! {src_ifs} diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md index a2a816804..87093cedd 100644 --- a/docs/HOWTO-add-model.md +++ b/docs/HOWTO-add-model.md @@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M ### 1. Convert the model to GGUF This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library. -Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format). +Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](../examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format). The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors. From 3e3cc7102ff8c1ef2fcdf4897759958665398e16 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 Jul 2024 22:36:36 +0300 Subject: [PATCH 4/7] cont : fix link --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d1422c413..ed5490593 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1161,7 +1161,7 @@ class FalconModel(Model): # So we rearrange them here,, so that we have n_head query weights # followed by n_head_kv key weights followed by n_head_kv value weights, # in contiguous fashion. 
- # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-gguf.py + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py if "query_key_value" in name: n_head = self.find_hparam(["num_attention_heads", "n_head"]) From 902de8826b325afbdb5eda4c6895b6e588bede09 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 4 Jul 2024 16:08:15 -0400 Subject: [PATCH 5/7] gguf-py : use snake_case in scripts entrypoint export --- examples/json_schema_pydantic_example.py | 2 +- gguf-py/scripts/__init__.py | 17 ++++------------- scripts/check-requirements.sh | 2 +- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/examples/json_schema_pydantic_example.py b/examples/json_schema_pydantic_example.py index 2a24f8118..c7ca7b8d9 100644 --- a/examples/json_schema_pydantic_example.py +++ b/examples/json_schema_pydantic_example.py @@ -1,7 +1,7 @@ # Usage: #! ./llama-server -m some-model.gguf & #! pip install pydantic -#! python json-schema-pydantic-example.py +#! python json_schema_pydantic_example.py from pydantic import BaseModel, Extra, TypeAdapter from annotated_types import MinLen diff --git a/gguf-py/scripts/__init__.py b/gguf-py/scripts/__init__.py index 1ad45639a..f9d29cb69 100644 --- a/gguf-py/scripts/__init__.py +++ b/gguf-py/scripts/__init__.py @@ -1,13 +1,4 @@ -import os - -from importlib import import_module - - -os.environ["NO_LOCAL_GGUF"] = "TRUE" - -gguf_convert_endian_entrypoint = import_module("scripts.gguf-convert-endian").main -gguf_dump_entrypoint = import_module("scripts.gguf-dump").main -gguf_set_metadata_entrypoint = import_module("scripts.gguf-set-metadata").main -gguf_new_metadata_entrypoint = import_module("scripts.gguf-new-metadata").main - -del import_module, os +from .gguf_convert_endian import main as gguf_convert_endian_entrypoint +from .gguf_dump import main as gguf_dump_entrypoint +from .gguf_set_metadata import main as gguf_set_metadata_entrypoint +from .gguf_new_metadata import main as gguf_new_metadata_entrypoint diff --git a/scripts/check-requirements.sh b/scripts/check-requirements.sh index 27f264999..48f924c02 100755 --- a/scripts/check-requirements.sh +++ b/scripts/check-requirements.sh @@ -97,7 +97,7 @@ check_requirements() { } check_convert_script() { - local py=$1 # e.g. convert_hf_to_gguf.py + local py=$1 # e.g. ./convert_hf_to_gguf.py local pyname=${py##*/} # e.g. convert_hf_to_gguf.py pyname=${pyname%.py} # e.g. 
convert_hf_to_gguf From 91deef460613255c815320b1c567628b189a50c5 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 4 Jul 2024 16:16:05 -0400 Subject: [PATCH 6/7] py : rename requirements for convert_legacy_llama.py Needed for scripts/check-requirements.sh --- examples/llava/requirements.txt | 2 +- requirements.txt | 2 +- requirements/requirements-convert_hf_to_gguf.txt | 2 +- requirements/requirements-convert_hf_to_gguf_update.txt | 2 +- ...t-legacy-llama.txt => requirements-convert_legacy_llama.txt} | 0 requirements/requirements-convert_llama_ggml_to_gguf.txt | 2 +- 6 files changed, 5 insertions(+), 5 deletions(-) rename requirements/{requirements-convert-legacy-llama.txt => requirements-convert_legacy_llama.txt} (100%) diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt index 21149d6fe..4713f0a34 100644 --- a/examples/llava/requirements.txt +++ b/examples/llava/requirements.txt @@ -1,3 +1,3 @@ --r ../../requirements/requirements-convert-legacy-llama.txt +-r ../../requirements/requirements-convert_legacy_llama.txt pillow~=10.2.0 torch~=2.2.1 diff --git a/requirements.txt b/requirements.txt index 1eca1a13f..52456c2e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ # Package versions must stay compatible across all top-level python scripts. # --r ./requirements/requirements-convert-legacy-llama.txt +-r ./requirements/requirements-convert_legacy_llama.txt -r ./requirements/requirements-convert_hf_to_gguf.txt -r ./requirements/requirements-convert_hf_to_gguf_update.txt diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt index a7112f396..653355c07 100644 --- a/requirements/requirements-convert_hf_to_gguf.txt +++ b/requirements/requirements-convert_hf_to_gguf.txt @@ -1,2 +1,2 @@ --r ./requirements-convert-legacy-llama.txt +-r ./requirements-convert_legacy_llama.txt torch~=2.2.1 diff --git a/requirements/requirements-convert_hf_to_gguf_update.txt b/requirements/requirements-convert_hf_to_gguf_update.txt index a7112f396..653355c07 100644 --- a/requirements/requirements-convert_hf_to_gguf_update.txt +++ b/requirements/requirements-convert_hf_to_gguf_update.txt @@ -1,2 +1,2 @@ --r ./requirements-convert-legacy-llama.txt +-r ./requirements-convert_legacy_llama.txt torch~=2.2.1 diff --git a/requirements/requirements-convert-legacy-llama.txt b/requirements/requirements-convert_legacy_llama.txt similarity index 100% rename from requirements/requirements-convert-legacy-llama.txt rename to requirements/requirements-convert_legacy_llama.txt diff --git a/requirements/requirements-convert_llama_ggml_to_gguf.txt b/requirements/requirements-convert_llama_ggml_to_gguf.txt index e80c29012..afe2747d4 100644 --- a/requirements/requirements-convert_llama_ggml_to_gguf.txt +++ b/requirements/requirements-convert_llama_ggml_to_gguf.txt @@ -1 +1 @@ --r ./requirements-convert-legacy-llama.txt +-r ./requirements-convert_legacy_llama.txt From 117f7adbd9de97651a4c375e5fb34621fcc3f808 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 10 Jul 2024 15:23:12 +0300 Subject: [PATCH 7/7] ggml : remove K_QUANTS_PER_ITERATION (#8306) ggml-ci --- Makefile | 8 - README.md | 2 - ggml/CMakeLists.txt | 2 - ggml/src/CMakeLists.txt | 2 - ggml/src/ggml-cuda/dmmv.cu | 120 ++++--------- ggml/src/ggml-sycl/dmmv.cpp | 163 ++++++------------ ggml/src/ggml-sycl/presets.hpp | 6 - ggml/src/ggml-vulkan.cpp | 6 - ggml/src/vulkan-shaders/mul_mat_vec_base.comp | 2 - ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp | 12 +- 
ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp | 12 +- ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp | 32 +--- ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp | 25 +-- 13 files changed, 110 insertions(+), 282 deletions(-) diff --git a/Makefile b/Makefile index 2730d8b60..d2c0562a3 100644 --- a/Makefile +++ b/Makefile @@ -640,12 +640,6 @@ ifdef GGML_CUDA_DMMV_F16 MK_NVCCFLAGS += -DGGML_CUDA_F16 endif # GGML_CUDA_DMMV_F16 -ifdef GGML_CUDA_KQUANTS_ITER - MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER) -else - MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 -endif - ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE) else @@ -734,7 +728,6 @@ ifdef GGML_HIPBLAS GGML_CUDA_DMMV_X ?= 32 GGML_CUDA_MMV_Y ?= 1 - GGML_CUDA_KQUANTS_ITER ?= 2 MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA @@ -751,7 +744,6 @@ endif # GGML_HIP_UMA HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS)) HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X) HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y) - HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER) ifdef GGML_CUDA_FORCE_DMMV HIPFLAGS += -DGGML_CUDA_FORCE_DMMV diff --git a/README.md b/README.md index 0cc1d10f4..3b5ff665f 100644 --- a/README.md +++ b/README.md @@ -521,7 +521,6 @@ Building the program with BLAS support may lead to some performance improvements | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. | | GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models | | GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | - | GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | @@ -582,7 +581,6 @@ Building the program with BLAS support may lead to some performance improvements |------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. 
Power of 2 heavily recommended. Does not affect k-quants. | | GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | - | GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### Vulkan diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 0d0d52d57..7080d65e6 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -113,8 +113,6 @@ option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels") set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels") option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF) -set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING - "ggml: iters./thread per block for Q2_K/Q6_K") set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "ggml: max. batch size for using peer access") option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 08b71d410..a4894b7e3 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -297,7 +297,6 @@ if (GGML_CUDA) add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) - add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) if (GGML_CUDA_USE_GRAPHS) @@ -426,7 +425,6 @@ if (GGML_HIPBLAS) add_compile_definitions(GGML_USE_HIPBLAS) add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) - add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) if (GGML_HIP_UMA) add_compile_definitions(GGML_HIP_UMA) diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu index 174489e06..2775080dd 100644 --- a/ggml/src/ggml-cuda/dmmv.cu +++ b/ggml/src/ggml-cuda/dmmv.cu @@ -2,16 +2,7 @@ #include "dequantize.cuh" #include "convert.cuh" -#ifndef K_QUANTS_PER_ITERATION -#define K_QUANTS_PER_ITERATION 2 -#else -static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); -#endif - static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { - - static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - const int row = blockIdx.x*blockDim.y + threadIdx.y; if (row > nrows) return; @@ -22,15 +13,15 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, float tmp = 0; // partial sum for thread in warp - const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 - const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + const int tid = threadIdx.x/2; // 0...15 + const int ix = threadIdx.x%2; // 0,1 - const int step = 16/K_QUANTS_PER_ITERATION; + const int step = 8; const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
const int in = tid - step*im; // 0...15 or 0...7 - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int l0 = 2*in; // 0...14 in steps of 2 const int q_offset = 32*im + l0; const int s_offset = 8*im; const int y_offset = 128*im + l0; @@ -39,7 +30,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const uint8_t * d = (const uint8_t *)aux; const uint8_t * m = (const uint8_t *)(aux + 2); - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + for (int i = ix; i < num_blocks_per_row; i += 2) { const float * y = yy + i * QK_K + y_offset; const uint8_t * q = x[i].qs + q_offset; @@ -54,7 +45,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, aux[3] = (a[1] >> 4) & 0x0f0f0f0f; float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + for (int l = 0; l < 2; ++l) { sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) @@ -94,17 +85,17 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const uint16_t kmask1 = 0x0303; const uint16_t kmask2 = 0x0f0f; - const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + const int tid = threadIdx.x/2; // 0...16 + const int ix = threadIdx.x%2; // 0,1 - const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop - const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0....15 or 0...7 + const int n = 2; // iterations in the inner loop + const int step = 8; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0....15 or 0...7 const uint8_t m = 1 << (4*im); - const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 const int q_offset = 32*im + l0; const int y_offset = 128*im + l0; @@ -113,7 +104,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const uint16_t s_shift = 4*im; - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + for (int i = ix; i < num_blocks_per_row; i += 2) { const float * y = yy + i * QK_K + y_offset; const uint8_t * q = x[i].qs + q_offset; @@ -163,14 +154,14 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const uint16_t kmask2 = 0x0f0f; const uint16_t kmask3 = 0xc0c0; - const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + const int tid = threadIdx.x/2; // 0...16 + const int ix = threadIdx.x%2; // 0,1 - const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + const int step = 4; - const int il = tid/step; // 0...3 - const int ir = tid - step*il; // 0...7 or 0...3 - const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + const int il = tid/step; // 0...3 + const int ir = tid - step*il; // 0...7 or 0...3 + const int n = 4; const int im = il/2; // 0 or 1. 
0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 const int in = il%2; @@ -182,17 +173,12 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, uint16_t aux[4]; const uint8_t * sc = (const uint8_t *)aux; -#if K_QUANTS_PER_ITERATION == 2 uint32_t q32[4]; const uint8_t * q4 = (const uint8_t *)q32; -#else - uint16_t q16[4]; - const uint8_t * q4 = (const uint8_t *)q16; -#endif float tmp = 0; // partial sum for thread in warp - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + for (int i = ix; i < num_blocks_per_row; i += 2) { const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; @@ -206,7 +192,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); -#if K_QUANTS_PER_ITERATION == 2 const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); const uint32_t * q2 = q1 + 16; @@ -223,25 +208,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; } tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; -#else - const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); - const uint16_t * q2 = q1 + 32; - - q16[0] = q1[0] & 0x0f0f; - q16[1] = q1[0] & 0xf0f0; - q16[2] = q2[0] & 0x0f0f; - q16[3] = q2[0] & 0xf0f0; - - float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 2; ++l) { - s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; - s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; -#endif - } // sum up partial sums and write back result @@ -341,9 +307,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, } static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { - - static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - const int row = blockIdx.x*blockDim.y + threadIdx.y; if (row > nrows) return; @@ -352,21 +315,17 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const block_q6_K * x = (const block_q6_K *)vx + ib0; - const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + const int tid = threadIdx.x/2; // 0...16 + const int ix = threadIdx.x%2; // 0, 1 - const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + const int step = 8; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
+ const int in = tid - step*im; // 0...15 or 0...7 -#if K_QUANTS_PER_ITERATION == 1 - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 - const int is = 0; -#else - const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int l0 = 4 * in; // 0, 4, 8, ..., 28 const int is = in / 4; -#endif + const int ql_offset = 64*im + l0; const int qh_offset = 32*im + l0; const int s_offset = 8*im + is; @@ -374,7 +333,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, float tmp = 0; // partial sum for thread in warp - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + for (int i = ix; i < num_blocks_per_row; i += 2) { const float * y = yy + i * QK_K + y_offset; const uint8_t * ql = x[i].ql + ql_offset; @@ -383,17 +342,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float d = x[i].d; -#if K_QUANTS_PER_ITERATION == 1 - float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) - + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) - + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) - + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) - + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) - + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) - + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) - +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); - tmp += sum; -#else float sum = 0; for (int l = 0; l < 4; ++l) { sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) @@ -402,8 +350,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); } tmp += sum; -#endif - } // sum up partial sums and write back result @@ -547,7 +493,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 + const int ny = 2; const int block_num_y = (nrows + ny - 1) / ny; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(32, ny, 1); @@ -556,7 +502,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; + const int ny = 1; const int block_num_y = (nrows + ny - 1) / ny; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(32, ny, 1); @@ -565,7 +511,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; + const int ny = 1; const int block_num_y = (nrows + ny - 1) / ny; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(32, ny, 1); @@ -580,7 +526,7 @@ static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, f static void 
dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; + const int ny = 1; const int block_num_y = (nrows + ny - 1) / ny; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(32, ny, 1); diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index 927819281..b5562c504 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -123,9 +123,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, float *__restrict__ dst, const int ncols, int nrows, const sycl::nd_item<3> &item_ct1) { - - static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); if (row > nrows) return; @@ -139,16 +136,16 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, #if QK_K == 256 const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + item_ct1.get_local_id(2) / 2; // 0...15 const int ix = - item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + item_ct1.get_local_id(2) % 2; // 0,1 - const int step = 16/K_QUANTS_PER_ITERATION; + const int step = 8; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int l0 = 2*in; // 0...14 in steps of 2 const int q_offset = 32*im + l0; const int s_offset = 8*im; const int y_offset = 128*im + l0; @@ -157,7 +154,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, const uint8_t * d = (const uint8_t *)aux; const uint8_t * m = (const uint8_t *)(aux + 2); - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + for (int i = ix; i < num_blocks_per_row; i += 2) { const float * y = yy + i * QK_K + y_offset; const uint8_t * q = x[i].qs + q_offset; @@ -172,7 +169,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, aux[3] = (a[1] >> 4) & 0x0f0f0f0f; float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + for (int l = 0; l < 2; ++l) { sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) @@ -189,18 +186,15 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, } #else - const int tid = item_ct1.get_local_id(2) / - (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7 - const int ix = item_ct1.get_local_id(2) % - (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3 - const int offset = tid * K_QUANTS_PER_ITERATION; + const int tid = item_ct1.get_local_id(2) / 4; // 0...7 + const int ix = item_ct1.get_local_id(2) % 4; // 0...3 + const int offset = tid * 2; uint32_t uaux[2]; const uint8_t * d = (const uint8_t *)uaux; - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - + for (int i = ix; i < num_blocks_per_row; i += 4) { const float * y = yy + i * QK_K + offset; const uint8_t * q = x[i].qs + offset; const uint32_t * s = (const uint32_t *)x[i].scales; @@ -212,7 +206,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, x[i].dm.convert(); float sum1 = 0, sum2 = 0; - for (int l = 0; l < 
K_QUANTS_PER_ITERATION; ++l) { + for (int l = 0; l < 2; ++l) { const uint8_t ql = q[l]; sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) + y[l+16] * d[1] * ((ql >> 2) & 3) @@ -267,14 +261,14 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, const uint16_t kmask2 = 0x0f0f; const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + item_ct1.get_local_id(2) / 2; // 0...16 const int ix = - item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + item_ct1.get_local_id(2) % 2; // 0,1 - const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop - const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0....15 or 0...7 + const int n = 2; // iterations in the inner loop + const int step = 8; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0....15 or 0...7 const uint8_t m = 1 << (4*im); @@ -287,7 +281,7 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, const uint16_t s_shift = 4*im; - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + for (int i = ix; i < num_blocks_per_row; i += 2) { const float * y = yy + i * QK_K + y_offset; const uint8_t * q = x[i].qs + q_offset; @@ -317,13 +311,13 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, } #else - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 - const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 - const int in = offset/8; // 0 or 1 - const int im = offset%8; // 0...7 + const int tid = item_ct1.get_local_id(2)/4; // 0...7 + const int ix = item_ct1.get_local_id(2)%4; // 0...3 + const int offset = tid * 2; // 0...14 + const int in = offset/8; // 0 or 1 + const int im = offset%8; // 0...7 - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + for (int i = ix; i < num_blocks_per_row; i += 4) { const float * y = yy + i * QK_K + offset; const uint8_t * q = x[i].qs + offset; @@ -332,7 +326,7 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, const float dall = (float)x[i].d; float sum = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + for (int l = 0; l < 2; ++l) { const uint8_t hl = x[i].hmask[im+l] >> in; const uint8_t ql = q[l]; sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) @@ -383,15 +377,15 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, const uint16_t kmask3 = 0xc0c0; const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + item_ct1.get_local_id(2) / 2; // 0...16 const int ix = - item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + item_ct1.get_local_id(2) % 2; // 0,1 - const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + const int step = 4; const int il = tid/step; // 0...3 const int ir = tid - step*il; // 0...7 or 0...3 - const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + const int n = 4; const int im = il/2; // 0 or 1. 
0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 const int in = il%2; @@ -403,17 +397,12 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, uint16_t aux[4]; const uint8_t * sc = (const uint8_t *)aux; -#if K_QUANTS_PER_ITERATION == 2 uint32_t q32[4]; const uint8_t * q4 = (const uint8_t *)q32; -#else - uint16_t q16[4]; - const uint8_t * q4 = (const uint8_t *)q16; -#endif float tmp = 0; // partial sum for thread in warp - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + for (int i = ix; i < num_blocks_per_row; i += 2) { const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; @@ -427,7 +416,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); -#if K_QUANTS_PER_ITERATION == 2 const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); const uint32_t * q2 = q1 + 16; @@ -446,38 +434,19 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - dmin * smin; -#else - const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); - const uint16_t * q2 = q1 + 32; - - q16[0] = q1[0] & 0x0f0f; - q16[1] = q1[0] & 0xf0f0; - q16[2] = q2[0] & 0x0f0f; - q16[3] = q2[0] & 0xf0f0; - - float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 2; ++l) { - s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; - s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; -#endif - } #else - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); + const int tid = item_ct1.get_local_id(2)/4; // 0...15 + const int ix = item_ct1.get_local_id(2)%4; - const int step = tid * K_QUANTS_PER_ITERATION; + const int step = tid * 2; uint16_t aux16[2]; const uint8_t * s = (const uint8_t *)aux16; float tmp = 0; - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + for (int i = ix; i < num_blocks_per_row; i += 4) { const uint8_t * q = x[i].qs + step; const float * y = yy + i*QK_K + step; const uint16_t * a = (const uint16_t *)x[i].scales; @@ -486,7 +455,7 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, const float d = (float)x[i].dm[0]; const float m = (float)x[i].dm[1]; float sum = 0.f; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + for (int j = 0; j < 2; ++j) { sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) @@ -608,19 +577,19 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, } #else - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); - const int step = tid * K_QUANTS_PER_ITERATION; + const int tid = item_ct1.get_local_id(2)/4; // 0...15 + const int ix = item_ct1.get_local_id(2)%4; + const int step = tid * 2; const int im = step/8; const int in = step%8; - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + for (int i = ix; i < num_blocks_per_row; i += 4) { const uint8_t * q = x[i].qs + step; const int8_t * s 
= x[i].scales; const float * y = yy + i*QK_K + step; const float d = x[i].d; float sum = 0.f; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + for (int j = 0; j < 2; ++j) { const uint8_t h = x[i].qh[in+j] >> im; sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) @@ -645,9 +614,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows, const sycl::nd_item<3> &item_ct1) { - - static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); if (row > nrows) return; @@ -660,22 +626,18 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa #if QK_K == 256 const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + item_ct1.get_local_id(2) / 2; // 0...16 const int ix = - item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1 + item_ct1.get_local_id(2) % 2; // 0, 1 - const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + const int step = 8; const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... const int in = tid - step*im; // 0...15 or 0...7 -#if K_QUANTS_PER_ITERATION == 1 - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 - const int is = 0; -#else const int l0 = 4 * in; // 0, 4, 8, ..., 28 const int is = in / 4; -#endif + const int ql_offset = 64*im + l0; const int qh_offset = 32*im + l0; const int s_offset = 8*im + is; @@ -683,7 +645,7 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa float tmp = 0; // partial sum for thread in warp - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + for (int i = ix; i < num_blocks_per_row; i += 2) { const float * y = yy + i * QK_K + y_offset; const uint8_t * ql = x[i].ql + ql_offset; @@ -692,17 +654,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa const float d = x[i].d; -#if K_QUANTS_PER_ITERATION == 1 - float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) - + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) - + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) - + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) - + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) - + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) - + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) - +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); - tmp += sum; -#else float sum = 0; for (int l = 0; l < 4; ++l) { sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) @@ -711,20 +662,18 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); } tmp += sum; -#endif - } #else - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3 + const int tid = item_ct1.get_local_id(2)/4; // 0...7 + const int ix = item_ct1.get_local_id(2)%4; // 0...3 - const int step = 
tid * K_QUANTS_PER_ITERATION; + const int step = tid * 2; float tmp = 0; // partial sum for thread in warp - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + for (int i = ix; i < num_blocks_per_row; i += 4) { const float * y = yy + i * QK_K + step; const uint8_t * ql = x[i].ql + step; @@ -734,7 +683,7 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa const float d = x[i+0].d; float sum = 0; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + for (int j = 0; j < 2; ++j) { sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) @@ -870,7 +819,7 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 + const int ny = 2; const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, ny, WARP_SIZE); @@ -886,7 +835,7 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; + const int ny = 1; const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, ny, WARP_SIZE); @@ -902,7 +851,7 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; + const int ny = 1; const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, ny, WARP_SIZE); @@ -931,7 +880,7 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; + const int ny = 1; const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, ny, WARP_SIZE); diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp index c09c75dc7..2a7aeb609 100644 --- a/ggml/src/ggml-sycl/presets.hpp +++ b/ggml/src/ggml-sycl/presets.hpp @@ -50,12 +50,6 @@ #define GGML_SYCL_MMV_Y 1 #endif -#ifndef K_QUANTS_PER_ITERATION -#define K_QUANTS_PER_ITERATION 2 -#else -static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); -#endif - #ifndef GGML_SYCL_PEER_MAX_BATCH_SIZE #define GGML_SYCL_PEER_MAX_BATCH_SIZE 128 #endif // GGML_SYCL_PEER_MAX_BATCH_SIZE diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 101781ede..c6d227c66 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -44,12 +44,6 @@ #define MAX_VK_BUFFERS 256 -#ifndef K_QUANTS_PER_ITERATION -#define K_QUANTS_PER_ITERATION 1 -#else -static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); -#endif - #define VK_CHECK(err, msg) \ do { \ vk::Result err_ = (err); \ diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/vulkan-shaders/mul_mat_vec_base.comp index 5920bc936..cac4250fc 100644 --- 
a/ggml/src/vulkan-shaders/mul_mat_vec_base.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec_base.comp @@ -2,8 +2,6 @@ #extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_8bit_storage : require -#define K_QUANTS_PER_ITERATION 2 - #ifdef MUL_MAT_ID #define EXPERT_COUNT 8 #endif diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp index 4cd97799d..0239c1192 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec_q2_k.comp @@ -15,22 +15,22 @@ void main() { const uint num_blocks_per_row = p.ncols / QUANT_K; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; - const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + const uint tid = gl_LocalInvocationID.x/2; // 0...16 + const uint ix = gl_LocalInvocationID.x%2; // 0, 1 - const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + const uint step = 8; const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... const uint v_in = tid - step*v_im; // 0...15 or 0...7 - const uint l0 = K_QUANTS_PER_ITERATION*v_in; // 0...15 + const uint l0 = 2*v_in; // 0...15 const uint q_offset = 32*v_im + l0; const uint s_offset = 8*v_im; const uint y_offset = 128*v_im + l0; tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) { const uint y_idx = i * QUANT_K + y_offset; const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib0 + i].d.x); @@ -38,7 +38,7 @@ void main() { FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + for (int l = 0; l < 2; ++l) { sum1 += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3) + FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3) + FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3) diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp index a6e430ea0..42c26a26e 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec_q3_k.comp @@ -15,17 +15,17 @@ void main() { const uint num_blocks_per_row = p.ncols / QUANT_K; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; - const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + const uint tid = gl_LocalInvocationID.x/2; // 0...16 + const uint ix = gl_LocalInvocationID.x%2; // 0, 1 - const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + const uint step = 8; const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
const uint v_in = tid - step*v_im; // 0...15 or 0...7 const uint8_t m = uint8_t(1 << (4 * v_im)); - const uint l0 = K_QUANTS_PER_ITERATION*v_in; // 0...15 + const uint l0 = 2*v_in; // 0...15 const uint q_offset = 32*v_im + l0; const uint y_offset = 128*v_im + l0; @@ -33,13 +33,13 @@ void main() { const uint s_shift = 4 * v_im; - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) { const uint y_idx = i * QUANT_K + y_offset; const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); FLOAT_TYPE sum = FLOAT_TYPE(0.0); - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + for (int l = 0; l < 2; ++l) { sum += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)) + FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)) + FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)) diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp index 75569363c..a070567c5 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp +++ b/ggml/src/vulkan-shaders/mul_mat_vec_q4_k.comp @@ -15,14 +15,14 @@ void main() { const uint num_blocks_per_row = p.ncols / QUANT_K; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; - const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + const uint tid = gl_LocalInvocationID.x/2; // 0...16 + const uint ix = gl_LocalInvocationID.x%2; // 0, 1 - const uint step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + const uint step = 4; const uint il = tid/step; // 0...3 const uint ir = tid - step*il; // 0...7 or 0...3 - const uint n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + const uint n = 4; const uint v_im = il / 2; // 0 or 1. 
0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 const uint v_in = il % 2; @@ -33,7 +33,7 @@ void main() { tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) { const uint y1_idx = i * QUANT_K + y_offset; const uint y2_idx = y1_idx + 128; @@ -49,7 +49,6 @@ void main() { const uint8_t sc6 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 8] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 4] & 0xc0) >> 2)); const uint8_t sc7 = uint8_t(((data_a[ib0 + i].scales[v_im * 2 + 9] >> 4) & 0x0f) | ((data_a[ib0 + i].scales[v_im * 2 + 5] & 0xc0) >> 2)); -#if K_QUANTS_PER_ITERATION == 2 const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf); const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf); const uint8_t q4_2 = uint8_t(data_a[ib0 + i].qs[q_offset + 2] & 0xf); @@ -78,27 +77,6 @@ void main() { + FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * sc7 ); tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin); -#else - const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf); - const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf); - const uint8_t q4_2 = uint8_t(data_a[ib0 + i].qs[q_offset ] >> 4); - const uint8_t q4_3 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] >> 4); - const uint8_t q4_4 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] & 0xf); - const uint8_t q4_5 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] & 0xf); - const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4); - const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4); - - const FLOAT_TYPE sx = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx ]) * q4_0 + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1); - const FLOAT_TYPE sy = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * q4_2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_3); - const FLOAT_TYPE sz = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx ]) * q4_4 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_5); - const FLOAT_TYPE sw = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * q4_6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_7); - const FLOAT_TYPE smin = FLOAT_TYPE( - FLOAT_TYPE(data_b[b_offset + y1_idx]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * sc7 - + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7 - ); - - tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f) + sy * FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f) + sz * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)) + sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))) - dmin * smin); -#endif } // sum up partial sums and write back result diff --git a/ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp index d610cf030..a6bdaa52a 100644 --- a/ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp +++ 
b/ggml/src/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -15,21 +15,16 @@ void main() { const uint num_blocks_per_row = p.ncols / QUANT_K; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; - const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + const uint tid = gl_LocalInvocationID.x/2; // 0...16 + const uint ix = gl_LocalInvocationID.x%2; // 0, 1 - const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + const uint step = 8; const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... const uint v_in = tid - step*v_im; // 0...15 or 0...7 -#if K_QUANTS_PER_ITERATION == 1 - const uint l0 = v_in; // 0...15 - const uint is = 0; -#else const uint l0 = 4 * v_in; // 0, 4, 8, ..., 28 const uint is = v_in / 4; -#endif const uint ql_offset = 64*v_im + l0; const uint qh_offset = 32*v_im + l0; @@ -38,22 +33,11 @@ void main() { tmp[16 * ix + tid] = FLOAT_TYPE(0.0); // partial sum for thread in warp - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) { const uint y_idx = i * QUANT_K + y_offset; const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); -#if K_QUANTS_PER_ITERATION == 1 - FLOAT_TYPE sum = FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32) - + FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32); - tmp[16 * ix + tid] += sum; -#else FLOAT_TYPE sum = FLOAT_TYPE(0.0); [[unroll]] for (int l = 0; l < 4; ++l) { sum += FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32) @@ -62,7 +46,6 @@ void main() { + FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + 
i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32);
         }
         tmp[16 * ix + tid] += sum;
-#endif
     }

     // sum up partial sums and write back result
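
For reference, every kernel touched above now shares the same indexing scheme with the former K_QUANTS_PER_ITERATION fixed at 2. The following is a small, standalone C++ sketch that is not part of the patch and not the actual kernel; it only mirrors the q6_K thread mapping from dequantize_mul_mat_vec_q6_k with the constant hard-coded, so the resulting lane-to-data assignment can be inspected on the host. The block count of 4 is a made-up value for illustration.

    // Standalone sketch (not part of the patch): reproduces the q6_K dmmv
    // thread mapping with the iteration count fixed at 2, as in the diff above.
    #include <cstdio>

    int main() {
        const int WARP = 32;               // lanes cooperating on one row
        const int num_blocks_per_row = 4;  // hypothetical small row, for illustration only

        for (int lane = 0; lane < WARP; ++lane) {
            const int tid  = lane / 2;       // 0...15
            const int ix   = lane % 2;       // 0, 1 -> even or odd blocks of the row
            const int step = 8;              // was 16/K_QUANTS_PER_ITERATION
            const int im   = tid / step;     // 0 or 1: lower or upper 128 values of the block
            const int in   = tid - step*im;  // 0...7
            const int l0   = 4 * in;         // 0, 4, 8, ..., 28
            const int is   = in / 4;         // scale sub-index

            const int ql_offset = 64*im + l0;
            const int qh_offset = 32*im + l0;
            const int s_offset  = 8*im + is;
            const int y_offset  = 128*im + l0;

            printf("lane %2d: blocks", lane);
            for (int i = ix; i < num_blocks_per_row; i += 2) {
                printf(" %d", i);
            }
            printf("  ql=%3d qh=%3d s=%2d y=%3d\n", ql_offset, qh_offset, s_offset, y_offset);
        }
        return 0;
    }

Each even/odd pair of lanes walks alternating blocks of the same row, which is exactly the single layout that remains once the K_QUANTS_PER_ITERATION == 1 branches are removed.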