diff --git a/README-sycl.md b/README-sycl.md index 7aa4274a9..e3a8e726e 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -311,15 +311,13 @@ Output (example): a. Download & install cmake for Windows: https://cmake.org/download/ -b. Download & install make for Windows provided by mingw-w64 +b. Download & install mingw-w64 make for Windows provided by w64devkit -- Download binary package for Windows in https://github.com/niXman/mingw-builds-binaries/releases. +- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). - Like [x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z](https://github.com/niXman/mingw-builds-binaries/releases/download/13.2.0-rt_v11-rev1/x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z). +- Extract `w64devkit` on your pc. -- Unzip the binary package. In the **bin** sub-folder and rename **xxx-make.exe** to **make.exe**. - -- Add the **bin** folder path in the Windows system PATH environment. +- Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`. ### Build locally: diff --git a/README.md b/README.md index 34f2021f9..0509b0ba1 100644 --- a/README.md +++ b/README.md @@ -33,17 +33,14 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
  • Get the Code
  • Build
  • BLAS Build
- • Prepare Data & Run
+ • Prepare and Quantize
+ • Run the quantized model
  • Memory/Disk Requirements
  • Quantization
  • Interactive mode
  • Constrained output with grammars
- • Instruction mode with Alpaca
- • Using OpenLLaMA
- • Using GPT4All
- • Using Pygmalion 7B & Metharme 7B
- • Obtaining the Facebook LLaMA original model and Stanford Alpaca model data
- • Verifying the model files
+ • Instruct mode
+ • Obtaining and using the Facebook LLaMA 2 model
  • Seminal papers and background on the models
  • Perplexity (measuring model quality)
  • Android
  • @@ -83,20 +80,16 @@ improved significantly thanks to many contributions. It is the main playground f **Supported models:** +Typically finetunes of the base models below are supported as well. + - [X] LLaMA 🦙 - [x] LLaMA 2 🦙🦙 -- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) +- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral) - [X] Falcon -- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca) -- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) -- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) -- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy) -- [X] [Pygmalion/Metharme](#using-pygmalion-7b--metharme-7b) -- [X] [WizardLM](https://github.com/nlpxucan/WizardLM) - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) @@ -149,6 +142,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [iohub/collama](https://github.com/iohub/coLLaMA) - [janhq/jan](https://github.com/janhq/jan) (AGPL) - [nat/openplayground](https://github.com/nat/openplayground) +- [Faraday](https://faraday.dev/) (proprietary) - [LMStudio](https://lmstudio.ai/) (proprietary) - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) @@ -165,7 +159,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: Here is a typical run using LLaMA v2 13B on M2 Ultra: -```java +``` $ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e I llama.cpp build info: I UNAME_S: Darwin @@ -249,7 +243,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8 ## Usage -Here are the end-to-end binary build and model conversion steps for the LLaMA-7B model. +Here are the end-to-end binary build and model conversion steps for most supported models. ### Get the Code @@ -634,7 +628,7 @@ Building the program with BLAS support may lead to some performance improvements **Without docker**: - Firstly, you need to make sure you installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html) + Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html) For example, on Ubuntu 22.04 (jammy), use the command below: @@ -647,6 +641,8 @@ Building the program with BLAS support may lead to some performance improvements vulkaninfo ``` + Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead. 
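For instance, the package-manager route mentioned above could look like the following sketch. Package names are assumptions for Ubuntu 22.04 and may differ on other distributions; `vulkan-tools` is only assumed here to provide the `vulkaninfo` check used earlier in this section.

```bash
# Hedged sketch: install the Vulkan development files from the distro archive
# instead of the LunarG SDK repository (Ubuntu 22.04 package names assumed).
sudo apt update
sudo apt install libvulkan-dev vulkan-tools

# Sanity check that a Vulkan driver and device are visible.
vulkaninfo
```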
+ Then, build llama.cpp using the cmake command below: ```bash @@ -661,34 +657,42 @@ Building the program with BLAS support may lead to some performance improvements # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 ``` -### Prepare Data & Run +### Prepare and Quantize + +To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. ```bash -# obtain the original LLaMA model weights and place them in ./models +# obtain the official LLaMA model weights and place them in ./models ls ./models -65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model +llama-2-7b tokenizer_checklist.chk tokenizer.model # [Optional] for models using BPE tokenizers ls ./models -65B 30B 13B 7B vocab.json + vocab.json +# [Optional] for PyTorch .bin models like Mistral-7B +ls ./models + # install Python dependencies python3 -m pip install -r requirements.txt -# convert the 7B model to ggml FP16 format -python3 convert.py models/7B/ +# convert the model to ggml FP16 format +python3 convert.py models/mymodel/ # [Optional] for models using BPE tokenizers -python convert.py models/7B/ --vocabtype bpe +python convert.py models/mymodel/ --vocabtype bpe -# quantize the model to 4-bits (using q4_0 method) -./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0 +# quantize the model to 4-bits (using Q4_K_M method) +./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M -# update the gguf filetype to current if older version is unsupported by another application -./quantize ./models/7B/ggml-model-q4_0.gguf ./models/7B/ggml-model-q4_0-v2.gguf COPY +# update the gguf filetype to current version if older version is now unsupported +./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY +``` +### Run the quantized model -# run the inference -./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 +```bash +# start inference on a gguf model +./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128 ``` When running the larger models, make sure you have enough disk space to store all the intermediate files. @@ -709,7 +713,7 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. -| Model | Original size | Quantized size (4-bit) | +| Model | Original size | Quantized size (Q4_0) | |------:|--------------:|-----------------------:| | 7B | 13 GB | 3.9 GB | | 13B | 24 GB | 7.8 GB | @@ -825,9 +829,9 @@ The `grammars/` folder contains a handful of sample grammars. To write your own, For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. -### Instruction mode with Alpaca +### Instruct mode -1. First, download the `ggml` Alpaca model into the `./models` folder +1. First, download and place the `ggml` model into the `./models` folder 2. 
Run the `main` tool like this: ``` @@ -853,50 +857,6 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. > ``` -### Using [OpenLLaMA](https://github.com/openlm-research/open_llama) - -OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights. - -- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face. -- Convert the model to ggml FP16 format using `python convert.py ` - -### Using [GPT4All](https://github.com/nomic-ai/gpt4all) - -*Note: these instructions are likely obsoleted by the GGUF update* - -- Obtain the `tokenizer.model` file from LLaMA model and put it to `models` -- Obtain the `added_tokens.json` file from Alpaca model and put it to `models` -- Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B` -- It is distributed in the old `ggml` format which is now obsoleted -- You have to convert it to the new format using `convert.py`: - -```bash -python3 convert.py models/gpt4all-7B/gpt4all-lora-quantized.bin -``` - -- You can now use the newly generated `models/gpt4all-7B/ggml-model-q4_0.bin` model in exactly the same way as all other models - -- The newer GPT4All-J model is not yet supported! - -### Using Pygmalion 7B & Metharme 7B - -- Obtain the [LLaMA weights](#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data) -- Obtain the [Pygmalion 7B](https://huggingface.co/PygmalionAI/pygmalion-7b/) or [Metharme 7B](https://huggingface.co/PygmalionAI/metharme-7b) XOR encoded weights -- Convert the LLaMA model with [the latest HF convert script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py) -- Merge the XOR files with the converted LLaMA weights by running the [xor_codec](https://huggingface.co/PygmalionAI/pygmalion-7b/blob/main/xor_codec.py) script -- Convert to `ggml` format using the `convert.py` script in this repo: -```bash -python3 convert.py pygmalion-7b/ --outtype q4_1 -``` -> The Pygmalion 7B & Metharme 7B weights are saved in [bfloat16](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format) precision. If you wish to convert to `ggml` without quantizating, please specify the `--outtype` as `f32` instead of `f16`. - - -### Obtaining the Facebook LLaMA original model and Stanford Alpaca model data - -- **Under no circumstances should IPFS, magnet links, or any other links to model downloads be shared anywhere in this repository, including in issues, discussions, or pull requests. They will be immediately deleted.** -- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. -- Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data. - ### Obtaining and using the Facebook LLaMA 2 model - Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data. 
@@ -908,20 +868,6 @@ python3 convert.py pygmalion-7b/ --outtype q4_1 - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF) - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF) -### Verifying the model files - -Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files. -- The following python script will verify if you have all possible latest files in your self-installed `./models` subdirectory: - -```bash -# run the verification script -./scripts/verify-checksum-models.py -``` - -- On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory: - - On Linux: `sha256sum --ignore-missing -c SHA256SUMS` - - on macOS: `shasum -a 256 --ignore-missing -c SHA256SUMS` - ### Seminal papers and background on the models If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: diff --git a/SHA256SUMS b/SHA256SUMS deleted file mode 100644 index ca4d5a4a5..000000000 --- a/SHA256SUMS +++ /dev/null @@ -1,40 +0,0 @@ -700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth -666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin -ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf models/7B/ggml-model-q4_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin -7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json -745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth -d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth -2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin -fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5 models/13B/ggml-model-q4_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin -4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json -e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth -4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth -24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth -1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth -7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin -d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d models/30B/ggml-model-q4_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin 
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin -2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json -135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth -9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth -e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770 models/65B/consolidated.02.pth -73176ffb426b40482f2aa67ae1217ef79fbbd1fff5482bae5060cdc5a24ab70e models/65B/consolidated.03.pth -882e6431d0b08a8bc66261a0d3607da21cbaeafa96a24e7e59777632dbdac225 models/65B/consolidated.04.pth -a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/consolidated.05.pth -72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth -d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth -60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin -cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92 models/65B/ggml-model-q4_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin -999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json -9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model diff --git a/common/common.cpp b/common/common.cpp index 8c1a60583..e0082a823 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -46,6 +46,10 @@ #define GGML_USE_CUBLAS_SYCL #endif +#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) +#define GGML_USE_CUBLAS_SYCL_VULKAN +#endif + int32_t get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores @@ -660,8 +664,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.tensor_split[i] = 0.0f; } } -#ifndef GGML_USE_CUBLAS_SYCL - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n"); +#ifndef GGML_USE_CUBLAS_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. 
Setting a tensor split has no effect.\n"); #endif // GGML_USE_CUBLAS_SYCL } else if (arg == "--no-mmap") { params.use_mmap = false; diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5e343742d..829d68368 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -22,6 +22,8 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf +from convert import HfVocab + # check for any of the given keys in the dictionary and return the value of the first key found def get_key_opts(d, keys): @@ -205,6 +207,8 @@ class Model: return OrionModel if model_architecture == "InternLM2ForCausalLM": return InternLM2Model + if model_architecture == "MiniCPMForCausalLM": + return MiniCPMModel return Model def _is_model_safetensors(self) -> bool: @@ -258,6 +262,8 @@ class Model: return gguf.MODEL_ARCH.ORION if arch == "InternLM2ForCausalLM": return gguf.MODEL_ARCH.INTERNLM2 + if arch == "MiniCPMForCausalLM": + return gguf.MODEL_ARCH.MINICPM raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -402,6 +408,31 @@ class Model: special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_hf(self): + path = self.dir_model + added_tokens_path = self.dir_model + vocab = HfVocab( + path, added_tokens_path if added_tokens_path.exists() else None + ) + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + class GPTNeoXModel(Model): def set_gguf_parameters(self): @@ -1041,6 +1072,24 @@ class MixtralModel(Model): self._set_vocab_sentencepiece() +class MiniCPMModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + self.gguf_writer.add_name("MiniCPM") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + + def set_vocab(self): + self._set_vocab_hf() + + class QwenModel(Model): @staticmethod def token_bytes_to_string(b): diff --git a/convert.py b/convert.py index 4a2847a27..323e8058d 100755 --- a/convert.py +++ b/convert.py @@ -334,9 +334,9 @@ class Params: class BpeVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) - try: + if isinstance(self.bpe_tokenizer.get('model'), dict): self.vocab = self.bpe_tokenizer["model"]["vocab"] - except KeyError: + else: self.vocab = self.bpe_tokenizer added_tokens: dict[str, int] if fname_added_tokens is not None: diff --git 
a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 6ac70ba69..031e9806d 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -34,7 +34,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ std::string str2 = str; - std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos); + std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); eval_tokens(ctx_llama, embd_inp, n_batch, n_past); return true; } @@ -152,20 +152,8 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ size_t image_pos = prompt.find(""); if (image_pos != std::string::npos) { // new templating mode: Provide the full prompt including system message and use as a placeholder for the image - system_prompt = prompt.substr(0, image_pos); user_prompt = prompt.substr(image_pos + std::string("").length()); - // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string - size_t pos = 0; - while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) { - user_prompt.replace(pos, 2, "\n"); - pos += 1; // Advance past the replaced newline - } - while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) { - system_prompt.replace(pos, 2, "\n"); - pos += 1; // Advance past the replaced newline - } - printf("system_prompt: %s\n", system_prompt.c_str()); printf("user_prompt: %s\n", user_prompt.c_str()); } else { diff --git a/examples/server/README.md b/examples/server/README.md index 46d8f85ae..1db7cdf21 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -276,13 +276,15 @@ Notice that each `probs` is an array of length `n_probs`. { "assistant_name": "", "user_name": "", - "default_generation_settings": { ... } + "default_generation_settings": { ... }, + "total_slots": 1 } ``` - `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots. - `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint. +- `total_slots` - the total number of slots for process requests (defined by `--parallel` option) - **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served. 
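For reference, a minimal call to that OpenAI-compatible endpoint might look like the sketch below. The model path, port, and prompt are illustrative assumptions and not part of this patch; it only shows the shape of the request described above.

```bash
# Hedged sketch: assumes the server was started with a ChatML-tuned gguf model,
# e.g. ./server -m ./models/mymodel/ggml-model-Q4_K_M.gguf
# and is listening on the assumed default port 8080.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "messages": [
          {"role": "system", "content": "You are a helpful assistant."},
          {"role": "user",   "content": "Hello!"}
        ]
      }'
```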
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9481ce6b1..eceda30d0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -432,7 +432,6 @@ struct llama_server_context } default_generation_settings_for_props = get_formated_generation(slots.front()); - default_generation_settings_for_props["num_slots"] = params.n_parallel; default_generation_settings_for_props["seed"] = -1; batch = llama_batch_init(n_ctx, 0, params.n_parallel); @@ -2639,7 +2638,8 @@ int main(int argc, char **argv) json data = { { "user_name", llama.name_user.c_str() }, { "assistant_name", llama.name_assistant.c_str() }, - { "default_generation_settings", llama.default_generation_settings_for_props } + { "default_generation_settings", llama.default_generation_settings_for_props }, + { "total_slots", llama.params.n_parallel } }; res.set_content(data.dump(), "application/json; charset=utf-8"); }); diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 14fb89e09..9e2846ee4 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include "ggml.h" #include "ggml-backend-impl.h" @@ -37,6 +38,8 @@ #define GGML_VK_MAX_NODES 8192 +#define MAX_VK_BUFFERS 256 + #ifndef K_QUANTS_PER_ITERATION #define K_QUANTS_PER_ITERATION 1 #else @@ -53,15 +56,68 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA } \ } while (0) -struct vk_buffer { +struct ggml_backend_vk_context; + +struct vk_queue { + uint32_t queue_family_index; + vk::Queue queue; + vk::CommandPool pool; + uint32_t cmd_buffer_idx; + std::vector cmd_buffers; + + vk::PipelineStageFlags stage_flags; +}; + +struct vk_device { + vk::PhysicalDevice physical_device; + vk::PhysicalDeviceProperties properties; + std::string name; + uint64_t max_memory_allocation_size; + bool fp16; + vk::Device device; + uint32_t vendor_id; + vk_queue compute_queue; + vk_queue transfer_queue; + bool single_queue; + uint32_t descriptor_set_mode; + uint32_t subgroup_size; + bool uma; + + ~vk_device() { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "destroy device " << name << std::endl; +#endif + device.destroy(); + } +}; + +struct vk_buffer_struct { vk::Buffer buffer; vk::DeviceMemory device_memory; vk::MemoryPropertyFlags memory_property_flags; void * ptr; size_t size = 0; - uint32_t qf_owner; + + ggml_backend_vk_context * ctx; + + std::shared_ptr device; + + ~vk_buffer_struct() { + if (size == 0) { + return; + } +#ifdef GGML_VULKAN_DEBUG + std::cerr << "~vk_buffer_struct(" << buffer << ", " << size << ")" << std::endl; +#endif + + device->device.freeMemory(device_memory); + device->device.destroyBuffer(buffer); + } }; +typedef std::shared_ptr vk_buffer; +typedef std::weak_ptr vk_buffer_ref; + struct vk_subbuffer { vk_buffer buffer; uint64_t offset; @@ -70,6 +126,7 @@ struct vk_subbuffer { struct vk_pipeline { std::string name; + vk::ShaderModule shader_module; vk::DescriptorSetLayout dsl; std::vector descriptor_pools; std::vector descriptor_sets; @@ -82,16 +139,6 @@ struct vk_pipeline { uint32_t align; }; -struct vk_queue { - uint32_t queue_family_index; - vk::Queue queue; - vk::CommandPool pool; - uint32_t cmd_buffer_idx; - std::vector cmd_buffers; - - vk::PipelineStageFlags stage_flags; -}; - struct vk_semaphore { vk::Semaphore s; uint64_t value; @@ -105,20 +152,6 @@ struct vk_submission { typedef std::vector vk_sequence; -struct vk_device { - vk::PhysicalDevice physical_device; - vk::PhysicalDeviceProperties properties; - uint64_t max_memory_allocation_size; - bool fp16; - 
vk::Device device; - uint32_t vendor_id; - vk_queue compute_queue; - vk_queue transfer_queue; - uint32_t descriptor_set_mode; - uint32_t subgroup_size; - bool uma; -}; - struct vk_op_push_constants { uint32_t KX; uint32_t KY; @@ -190,13 +223,13 @@ struct ggml_tensor_extra_gpu { size_t ctx_idx; - vk_buffer buffer_gpu; + vk_buffer_ref buffer_gpu; uint64_t offset; void reset() { ready = false; ctx_idx = 0; - buffer_gpu.size = 0; + buffer_gpu.reset(); offset = 0; } }; @@ -210,69 +243,96 @@ struct ggml_vk_garbage_collector { std::vector contexts; }; -typedef void (*ggml_vk_func_t)(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +struct ggml_backend_vk_context { + std::string name; -vk::Instance vk_instance; -vk_device vk_device; -vk_pipeline vk_pipeline_matmul_f32_l, vk_pipeline_matmul_f32_m, vk_pipeline_matmul_f32_s; -vk_pipeline vk_pipeline_matmul_f32_aligned_l, vk_pipeline_matmul_f32_aligned_m, vk_pipeline_matmul_f32_aligned_s; -vk_pipeline vk_pipeline_matmul_f16_l, vk_pipeline_matmul_f16_m, vk_pipeline_matmul_f16_s; -vk_pipeline vk_pipeline_matmul_f16_aligned_l, vk_pipeline_matmul_f16_aligned_m, vk_pipeline_matmul_f16_aligned_s; -vk_pipeline vk_pipeline_matmul_f16_f32_l, vk_pipeline_matmul_f16_f32_m, vk_pipeline_matmul_f16_f32_s; -vk_pipeline vk_pipeline_matmul_f16_f32_aligned_l, vk_pipeline_matmul_f16_f32_aligned_m, vk_pipeline_matmul_f16_f32_aligned_s; -vk_pipeline vk_pipeline_matmul_split_k_reduce; -vk_pipeline vk_pipeline_dequant[VK_NUM_TYPES]; -vk_pipeline vk_pipeline_dequant_mul_mat_vec_f32[VK_NUM_TYPES]; -vk_pipeline vk_pipeline_mul_mat_vec_p021_f16_f32; -vk_pipeline vk_pipeline_mul_mat_vec_nc_f16_f32; -vk_pipeline vk_pipeline_get_rows[VK_NUM_TYPES]; -vk_pipeline vk_pipeline_get_rows_f32[VK_NUM_TYPES]; -vk_pipeline vk_pipeline_mul_f32; -vk_pipeline vk_pipeline_add_f32; -vk_pipeline vk_pipeline_scale_f32; -vk_pipeline vk_pipeline_sqr_f32; -vk_pipeline vk_pipeline_clamp_f32; -vk_pipeline vk_pipeline_cpy_f32_f32, vk_pipeline_cpy_f32_f16, vk_pipeline_cpy_f16_f16; -vk_pipeline vk_pipeline_norm_f32; -vk_pipeline vk_pipeline_rms_norm_f32; -vk_pipeline vk_pipeline_gelu_f32; -vk_pipeline vk_pipeline_silu_f32; -vk_pipeline vk_pipeline_relu_f32; -vk_pipeline vk_pipeline_diag_mask_inf_f32; -vk_pipeline vk_pipeline_soft_max_f32; -vk_pipeline vk_pipeline_rope_f32, vk_pipeline_rope_f16; -vk_pipeline vk_pipeline_rope_neox_f32, vk_pipeline_rope_neox_f16; + std::weak_ptr device; + vk_pipeline pipeline_matmul_f32_l, pipeline_matmul_f32_m, pipeline_matmul_f32_s; + vk_pipeline pipeline_matmul_f32_aligned_l, pipeline_matmul_f32_aligned_m, pipeline_matmul_f32_aligned_s; + vk_pipeline pipeline_matmul_f16_l, pipeline_matmul_f16_m, pipeline_matmul_f16_s; + vk_pipeline pipeline_matmul_f16_aligned_l, pipeline_matmul_f16_aligned_m, pipeline_matmul_f16_aligned_s; + vk_pipeline pipeline_matmul_f16_f32_l, pipeline_matmul_f16_f32_m, pipeline_matmul_f16_f32_s; + vk_pipeline pipeline_matmul_f16_f32_aligned_l, pipeline_matmul_f16_f32_aligned_m, pipeline_matmul_f16_f32_aligned_s; + vk_pipeline pipeline_matmul_split_k_reduce; + vk_pipeline pipeline_dequant[VK_NUM_TYPES]; + vk_pipeline pipeline_dequant_mul_mat_vec_f32[VK_NUM_TYPES]; + vk_pipeline pipeline_mul_mat_vec_p021_f16_f32; + vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; + vk_pipeline pipeline_get_rows[VK_NUM_TYPES]; + vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES]; + vk_pipeline pipeline_mul_f32; + vk_pipeline pipeline_add_f32; + vk_pipeline pipeline_scale_f32; + vk_pipeline pipeline_sqr_f32; + vk_pipeline 
pipeline_clamp_f32; + vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16; + vk_pipeline pipeline_norm_f32; + vk_pipeline pipeline_rms_norm_f32; + vk_pipeline pipeline_gelu_f32; + vk_pipeline pipeline_silu_f32; + vk_pipeline pipeline_relu_f32; + vk_pipeline pipeline_diag_mask_inf_f32; + vk_pipeline pipeline_soft_max_f32; + vk_pipeline pipeline_rope_f32, pipeline_rope_f16; + vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16; -static size_t vk_semaphore_idx, vk_event_idx; -static ggml_vk_garbage_collector vk_gc; -static std::vector> vk_pinned_memory; -static size_t vk_prealloc_size_qx, vk_prealloc_size_qy, vk_prealloc_size_x, vk_prealloc_size_y, vk_prealloc_size_split_k; -static vk_buffer vk_prealloc_qx, vk_prealloc_qy, vk_prealloc_x, vk_prealloc_y, vk_prealloc_split_k; -static vk::Fence vk_fence; -static vk_buffer vk_staging; -static size_t vk_staging_size; -static size_t vk_staging_offset; -static vk_buffer vk_sync_staging; + size_t semaphore_idx, event_idx; + ggml_vk_garbage_collector gc; + std::vector> pinned_memory; + size_t prealloc_size_qx, prealloc_size_qy, prealloc_size_x, prealloc_size_y, prealloc_size_split_k; + vk_buffer prealloc_qx, prealloc_qy, prealloc_x, prealloc_y, prealloc_split_k; + vk::Fence fence; + vk_buffer staging; + size_t staging_size; + size_t staging_offset; + vk_buffer sync_staging; -static vk_context * vk_ctx; -static vk_context * vk_transfer_ctx; + vk_buffer buffer_pool[MAX_VK_BUFFERS]; -static bool vk_disable; + vk_context * compute_ctx; + vk_context * transfer_ctx; + + bool disable; + bool initialized; + + size_t idx; +}; + +struct vk_instance { + vk::Instance instance; + + std::vector device_indices; + + std::shared_ptr devices[GGML_VK_MAX_DEVICES]; + ggml_backend_t backends[GGML_VK_MAX_DEVICES]; + ggml_backend_vk_context contexts[GGML_VK_MAX_DEVICES]; + ggml_backend_buffer_type buffer_types[GGML_VK_MAX_DEVICES]; + bool initialized[GGML_VK_MAX_DEVICES]; +}; #ifdef GGML_VULKAN_CHECK_RESULTS -size_t vk_skip_checks; -size_t vk_output_tensor; +static size_t vk_skip_checks; +static size_t vk_output_tensor; + +static void ggml_vk_print_tensor(ggml_backend * ctx, const ggml_tensor * tensor, const char * name); +static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor); +static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor); #endif -static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector&& specialization_constants, uint32_t align) { +typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +static bool vk_instance_initialized = false; +static vk_instance vk_instance; + +GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend); + +static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, std::vector&& specialization_constants, uint32_t align) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_pipeline(" << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << 
wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")" << std::endl; #endif GGML_ASSERT(parameter_count > 0); GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT - vk_pipeline pipeline; - pipeline.name = name; pipeline.parameter_count = parameter_count; pipeline.push_constant_size = push_constant_size; @@ -280,7 +340,7 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_s pipeline.align = align; vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast(spv_data)); - vk::ShaderModule shader_module = vk_device.device.createShaderModule(shader_module_create_info); + pipeline.shader_module = ctx->device.lock()->device.createShaderModule(shader_module_create_info); std::vector dsl_binding; std::vector dsl_binding_flags; @@ -301,17 +361,17 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_s {}, dsl_binding); descriptor_set_layout_create_info.setPNext(&dslbfci); - pipeline.dsl = vk_device.device.createDescriptorSetLayout(descriptor_set_layout_create_info); + pipeline.dsl = ctx->device.lock()->device.createDescriptorSetLayout(descriptor_set_layout_create_info); // Check if device supports multiple descriptors per pool - if (vk_device.descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) { + if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) { const uint32_t alloc_count = 2; // Try allocating multiple sets from one pool // This fails on AMD for some reason, so add a fall back to allocating one pool per set vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count); vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, alloc_count, descriptor_pool_size); - vk::DescriptorPool pool = vk_device.device.createDescriptorPool(descriptor_pool_create_info); + vk::DescriptorPool pool = ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info); std::vector layouts(alloc_count); for (uint32_t i = 0; i < alloc_count; i++) { @@ -319,24 +379,24 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_s } try { vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pool, alloc_count, layouts.data()); - std::vector sets = vk_device.device.allocateDescriptorSets(descriptor_set_alloc_info); + std::vector sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info); } catch(vk::OutOfPoolMemoryError const&) { - vk_device.descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE; + ctx->device.lock()->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE; } - vk_device.device.destroyDescriptorPool(pool); + ctx->device.lock()->device.destroyDescriptorPool(pool); } - if (vk_device.descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { + if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count); vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 128, descriptor_pool_size); - pipeline.descriptor_pools.push_back(vk_device.device.createDescriptorPool(descriptor_pool_create_info)); + pipeline.descriptor_pools.push_back(ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info)); } pipeline.descriptor_set_idx = 0; vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline.dsl, pcr); - pipeline.layout = 
vk_device.device.createPipelineLayout(pipeline_layout_create_info); + pipeline.layout = ctx->device.lock()->device.createPipelineLayout(pipeline_layout_create_info); std::vector specialization_entries(specialization_constants.size()); @@ -356,41 +416,45 @@ static vk_pipeline ggml_vk_create_pipeline(const std::string& name, size_t spv_s vk::PipelineShaderStageCreateInfo pipeline_shader_create_info( vk::PipelineShaderStageCreateFlags(), vk::ShaderStageFlagBits::eCompute, - shader_module, + pipeline.shader_module, entrypoint.c_str(), &specialization_info); vk::ComputePipelineCreateInfo compute_pipeline_create_info( vk::PipelineCreateFlags(), pipeline_shader_create_info, pipeline.layout); - pipeline.pipeline = vk_device.device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; + pipeline.pipeline = ctx->device.lock()->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; - return pipeline; + ctx->gc.pipelines.push_back(&pipeline); } -static void ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline& pipeline, uint32_t n) { +static void ggml_vk_destroy_pipeline(ggml_backend_vk_context * ctx, vk_pipeline * pipeline) { + for (auto& pool : pipeline->descriptor_pools) { + ctx->device.lock()->device.destroyDescriptorPool(pool); + } + pipeline->descriptor_pools.clear(); + pipeline->descriptor_sets.clear(); + pipeline->descriptor_set_idx = 0; + + ctx->device.lock()->device.destroyDescriptorSetLayout(pipeline->dsl); + + ctx->device.lock()->device.destroyPipelineLayout(pipeline->layout); + + ctx->device.lock()->device.destroyShaderModule(pipeline->shader_module); + + ctx->device.lock()->device.destroyPipeline(pipeline->pipeline); +} + +static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_pipeline_allocate_descriptor_sets(" << pipeline.name << ", " << n << ")" << std::endl; + std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline.name << ", " << n << ")" << std::endl; #endif - // Check if gc already contains pipeline before adding it - bool gc_found = false; - for (auto * pl : vk_gc.pipelines) { - if (&pipeline == pl) { - gc_found = true; - break; - } - } - - if (!gc_found) { - vk_gc.pipelines.push_back(&pipeline); - } - if (pipeline.descriptor_sets.size() >= pipeline.descriptor_set_idx + n) { // Enough descriptors are available return; } - if (vk_device.descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { + if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { const uint32_t alloc_count = pipeline.descriptor_set_idx + n - pipeline.descriptor_sets.size(); std::vector layouts(alloc_count); @@ -398,29 +462,29 @@ static void ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline& pipeline, uin layouts[i] = pipeline.dsl; } vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline.descriptor_pools[0], alloc_count, layouts.data()); - std::vector sets = vk_device.device.allocateDescriptorSets(descriptor_set_alloc_info); + std::vector sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info); pipeline.descriptor_sets.insert(pipeline.descriptor_sets.end(), sets.begin(), sets.end()); } else { for (uint32_t i = pipeline.descriptor_sets.size(); i < pipeline.descriptor_set_idx + n; i++) { vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count); vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 1, 
descriptor_pool_size); - pipeline.descriptor_pools.push_back(vk_device.device.createDescriptorPool(descriptor_pool_create_info)); + pipeline.descriptor_pools.push_back(ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info)); vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline.descriptor_pools[i], 1, &pipeline.dsl); - std::vector sets = vk_device.device.allocateDescriptorSets(descriptor_set_alloc_info); + std::vector sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info); pipeline.descriptor_sets.push_back(sets[0]); } } } -static void ggml_vk_pipeline_cleanup(vk_pipeline& pipeline) { +static void ggml_pipeline_cleanup(vk_pipeline& pipeline) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_pipeline_cleanup(" << pipeline.name << ")" << std::endl; + std::cerr << "ggml_pipeline_cleanup(" << pipeline.name << ")" << std::endl; #endif pipeline.descriptor_set_idx = 0; } -static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) { +static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_cmd_buffer()" << std::endl; #endif @@ -433,7 +497,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) { q.pool, vk::CommandBufferLevel::ePrimary, 1); - const std::vector cmd_buffers = vk_device.device.allocateCommandBuffers(command_buffer_alloc_info); + const std::vector cmd_buffers = ctx->device.lock()->device.allocateCommandBuffers(command_buffer_alloc_info); auto buf = cmd_buffers.front(); q.cmd_buffers.push_back(buf); @@ -442,24 +506,17 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_queue& q) { return buf; } -static vk_submission ggml_vk_create_submission(vk_queue& q, std::vector wait_semaphores, std::vector signal_semaphores) { +static vk_submission ggml_vk_create_submission(ggml_backend_vk_context * ctx, vk_queue& q, std::vector wait_semaphores, std::vector signal_semaphores) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_submission()" << std::endl; #endif vk_submission s; - s.buffer = ggml_vk_create_cmd_buffer(q); + s.buffer = ggml_vk_create_cmd_buffer(ctx, q); s.wait_semaphores = std::move(wait_semaphores); s.signal_semaphores = std::move(signal_semaphores); return s; } -static vk_sequence ggml_vk_create_sequence_1(vk_queue& q, std::vector wait_semaphores, std::vector signal_semaphores) { -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_create_sequence_1()" << std::endl; -#endif - return { ggml_vk_create_submission(q, std::move(wait_semaphores), std::move(signal_semaphores)) }; -} - static void ggml_vk_submit(vk_context * ctx, vk::Fence fence) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_submit(" << ctx->seqs.size() << ", " << fence << ")" << std::endl; @@ -578,89 +635,89 @@ static uint32_t ggml_vk_find_queue_family_index(std::vectordevice.lock()->device.createCommandPool(command_pool_create_info_compute); q.cmd_buffer_idx = 0; - q.queue = vk_device.device.getQueue(queue_family_index, queue_index); + q.queue = ctx->device.lock()->device.getQueue(queue_family_index, queue_index); q.stage_flags = stage_flags; - - return q; } -static vk_context * ggml_vk_create_context(vk_queue& q) { +static vk_context * ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_queue& q) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_context()" << std::endl; #endif - vk_gc.contexts.emplace_back(); - vk_context * result = &vk_gc.contexts[vk_gc.contexts.size() - 1]; + ctx->gc.contexts.emplace_back(); + vk_context * 
result = &ctx->gc.contexts[ctx->gc.contexts.size() - 1]; memset((void *) result, 0, sizeof(vk_context)); - result->idx = vk_gc.contexts.size() - 1; + result->idx = ctx->gc.contexts.size() - 1; result->q = &q; return result; } -static vk_semaphore * ggml_vk_create_binary_semaphore() { +static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * ctx) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl; #endif vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 }; vk::SemaphoreCreateInfo ci{}; ci.setPNext(&tci); - vk::Semaphore semaphore = vk_device.device.createSemaphore(ci); - vk_gc.semaphores.push_back({ semaphore, 0 }); - return &vk_gc.semaphores[vk_gc.semaphores.size() - 1]; + vk::Semaphore semaphore = ctx->device.lock()->device.createSemaphore(ci); + ctx->gc.semaphores.push_back({ semaphore, 0 }); + return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1]; } -static vk_semaphore * ggml_vk_create_timeline_semaphore() { +static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context * ctx) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_timeline_semaphore()" << std::endl; #endif - if (vk_semaphore_idx >= vk_gc.tl_semaphores.size()) { + if (ctx->semaphore_idx >= ctx->gc.tl_semaphores.size()) { vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; vk::SemaphoreCreateInfo ci{}; ci.setPNext(&tci); - vk::Semaphore semaphore = vk_device.device.createSemaphore(ci); - vk_gc.tl_semaphores.push_back({ semaphore, 0 }); + vk::Semaphore semaphore = ctx->device.lock()->device.createSemaphore(ci); + ctx->gc.tl_semaphores.push_back({ semaphore, 0 }); } - return &vk_gc.tl_semaphores[vk_semaphore_idx++]; + return &ctx->gc.tl_semaphores[ctx->semaphore_idx++]; } -static vk::Event ggml_vk_create_event() { - if (vk_event_idx >= vk_gc.events.size()) { - vk_gc.events.push_back(vk_device.device.createEvent({})); +static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) { + if (ctx->event_idx >= ctx->gc.events.size()) { + ctx->gc.events.push_back(ctx->device.lock()->device.createEvent({})); } - return vk_gc.events[vk_event_idx++]; + return ctx->gc.events[ctx->event_idx++]; } -static void ggml_vk_queue_cleanup(vk_queue& q) { +static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_queue_cleanup()" << std::endl; #endif // Requires command buffers to be done - vk_device.device.resetCommandPool(q.pool); + ctx->device.lock()->device.resetCommandPool(q.pool); q.cmd_buffer_idx = 0; } -static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_flags) { +static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl; #endif - GGML_ASSERT(size > 0); + vk_buffer buf = std::make_shared(); - vk_buffer buf; + if (size == 0) { + buf->size = 0; + return buf; + } - buf.size = size; + buf->size = size; vk::BufferCreateInfo buffer_create_info{ vk::BufferCreateFlags(), size, @@ -670,11 +727,11 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_ nullptr, }; - buf.buffer = vk_device.device.createBuffer(buffer_create_info); + buf->buffer = ctx->device.lock()->device.createBuffer(buffer_create_info); - vk::MemoryRequirements mem_req = vk_device.device.getBufferMemoryRequirements(buf.buffer); + vk::MemoryRequirements 
mem_req = ctx->device.lock()->device.getBufferMemoryRequirements(buf->buffer); - vk::PhysicalDeviceMemoryProperties mem_props = vk_device.physical_device.getMemoryProperties(); + vk::PhysicalDeviceMemoryProperties mem_props = ctx->device.lock()->physical_device.getMemoryProperties(); uint32_t memory_type_index = UINT32_MAX; @@ -691,30 +748,36 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_ } try { - buf.device_memory = vk_device.device.allocateMemory({ mem_req.size, memory_type_index }); + buf->device_memory = ctx->device.lock()->device.allocateMemory({ mem_req.size, memory_type_index }); } catch (const vk::SystemError& e) { // Out of Host/Device memory, clean up buffer - vk_device.device.destroyBuffer(buf.buffer); - buf.size = 0; + ctx->device.lock()->device.destroyBuffer(buf->buffer); + buf->size = 0; throw e; } - buf.memory_property_flags = req_flags; - buf.ptr = nullptr; + buf->memory_property_flags = req_flags; + buf->ptr = nullptr; if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) { - buf.ptr = vk_device.device.mapMemory(buf.device_memory, 0, VK_WHOLE_SIZE); + buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE); } - vk_device.device.bindBufferMemory(buf.buffer, buf.device_memory, 0); + ctx->device.lock()->device.bindBufferMemory(buf->buffer, buf->device_memory, 0); - buf.qf_owner = VK_QUEUE_FAMILY_IGNORED; + buf->ctx = ctx; + + buf->device = ctx->device.lock(); + +#ifdef GGML_VULKAN_DEBUG + std::cerr << "Created buffer " << buf->buffer << std::endl; +#endif return buf; } -static vk_buffer ggml_vk_create_buffer_check(size_t size, vk::MemoryPropertyFlags req_flags) { +static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) { try { - return ggml_vk_create_buffer(size, req_flags); + return ggml_vk_create_buffer(ctx, size, req_flags); } catch (const vk::SystemError& e) { std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl; std::cerr << "ggml_vulkan: " << e.what() << std::endl; @@ -722,14 +785,14 @@ static vk_buffer ggml_vk_create_buffer_check(size_t size, vk::MemoryPropertyFlag } } -static vk_buffer ggml_vk_create_buffer_device(size_t size) { +static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) { vk_buffer buf; try { - buf = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal); + buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal); } catch (const vk::SystemError& e) { - if (vk_device.uma) { + if (ctx->device.lock()->uma) { // Fall back to host memory type - buf = ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); + buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); } else { std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." 
<< std::endl; std::cerr << "ggml_vulkan: " << e.what() << std::endl; @@ -741,16 +804,7 @@ static vk_buffer ggml_vk_create_buffer_device(size_t size) { } static void ggml_vk_destroy_buffer(vk_buffer& buf) { - if (buf.size == 0) { - return; - } -#ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_destroy_buffer(" << buf.size << ")" << std::endl; -#endif - - buf.size = 0; - vk_device.device.freeMemory(buf.device_memory); - vk_device.device.destroyBuffer(buf.buffer); + buf.reset(); } static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) { @@ -773,7 +827,7 @@ static void ggml_vk_sync_buffers(vk_context * ctx) { ); } -static void ggml_vk_wait_events(vk::CommandBuffer& cmd_buffer, std::vector&& events, vk::PipelineStageFlags src_stages, vk::PipelineStageFlags dst_stages) { +static void ggml_vk_wait_events(vk_context * ctx, std::vector&& events) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_wait_events()" << std::endl; #endif @@ -781,10 +835,10 @@ static void ggml_vk_wait_events(vk::CommandBuffer& cmd_buffer, std::vectors->buffer.waitEvents( events, - src_stages, - dst_stages, + ctx->q->stage_flags, + ctx->q->stage_flags, {}, {}, {} @@ -810,15 +864,15 @@ static bool ggml_vk_build_shader(ggml_type type) { } } -static void ggml_vk_load_shaders() { +static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_load_shaders()" << std::endl; + std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl; #endif // mulmat - std::initializer_list warptile_l = { 128, 128, 128, 16, vk_device.subgroup_size * 2, 64, 2, 4, 4, vk_device.subgroup_size }; - std::initializer_list warptile_m = { 128, 64, 64, 16, vk_device.subgroup_size, 32, 2, 4, 2, vk_device.subgroup_size }; - std::initializer_list warptile_s = { vk_device.subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, vk_device.subgroup_size }; + std::initializer_list warptile_l = { 128, 128, 128, 16, ctx->device.lock()->subgroup_size * 2, 64, 2, 4, 4, ctx->device.lock()->subgroup_size }; + std::initializer_list warptile_m = { 128, 64, 64, 16, ctx->device.lock()->subgroup_size, 32, 2, 4, 2, ctx->device.lock()->subgroup_size }; + std::initializer_list warptile_s = { ctx->device.lock()->subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, ctx->device.lock()->subgroup_size }; std::array l_wg_denoms = {128, 128, 1 }; std::array m_wg_denoms = { 64, 64, 1 }; @@ -828,145 +882,208 @@ static void ggml_vk_load_shaders() { uint32_t m_align = 64; uint32_t s_align = 32; - if (vk_device.fp16) { - vk_pipeline_matmul_f32_l = ggml_vk_create_pipeline("matmul_f32_l", matmul_f32_l_len, matmul_f32_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - vk_pipeline_matmul_f32_m = ggml_vk_create_pipeline("matmul_f32_m", matmul_f32_m_len, matmul_f32_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - vk_pipeline_matmul_f32_s = ggml_vk_create_pipeline("matmul_f32_s", matmul_f32_s_len, matmul_f32_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - vk_pipeline_matmul_f32_aligned_l = ggml_vk_create_pipeline("matmul_f32_aligned_l", matmul_f32_aligned_l_len, matmul_f32_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - vk_pipeline_matmul_f32_aligned_m = ggml_vk_create_pipeline("matmul_f32_aligned_m", matmul_f32_aligned_m_len, matmul_f32_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - vk_pipeline_matmul_f32_aligned_s = ggml_vk_create_pipeline("matmul_f32_aligned_s", matmul_f32_aligned_s_len, 
matmul_f32_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + if (ctx->device.lock()->fp16) { + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_l_len, matmul_f32_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_m_len, matmul_f32_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_s_len, matmul_f32_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_l_len, matmul_f32_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_m_len, matmul_f32_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_s_len, matmul_f32_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - vk_pipeline_matmul_f16_l = ggml_vk_create_pipeline("matmul_f16_l", matmul_f16_l_len, matmul_f16_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - vk_pipeline_matmul_f16_m = ggml_vk_create_pipeline("matmul_f16_m", matmul_f16_m_len, matmul_f16_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - vk_pipeline_matmul_f16_s = ggml_vk_create_pipeline("matmul_f16_s", matmul_f16_s_len, matmul_f16_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_l_len, matmul_f16_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_m_len, matmul_f16_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_s_len, matmul_f16_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_l_len, matmul_f16_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_m_len, matmul_f16_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_s_len, matmul_f16_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - vk_pipeline_matmul_f16_aligned_l = ggml_vk_create_pipeline("matmul_f16_aligned_l", matmul_f16_aligned_l_len, matmul_f16_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - vk_pipeline_matmul_f16_aligned_m = ggml_vk_create_pipeline("matmul_f16_aligned_m", matmul_f16_aligned_m_len, matmul_f16_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - vk_pipeline_matmul_f16_aligned_s = ggml_vk_create_pipeline("matmul_f16_aligned_s", matmul_f16_aligned_s_len, matmul_f16_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), 
s_wg_denoms, warptile_s, s_align); - - vk_pipeline_matmul_f16_f32_l = ggml_vk_create_pipeline("matmul_f16_f32_l", matmul_f16_f32_l_len, matmul_f16_f32_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - vk_pipeline_matmul_f16_f32_m = ggml_vk_create_pipeline("matmul_f16_f32_m", matmul_f16_f32_m_len, matmul_f16_f32_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - vk_pipeline_matmul_f16_f32_s = ggml_vk_create_pipeline("matmul_f16_f32_s", matmul_f16_f32_s_len, matmul_f16_f32_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline("matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_len, matmul_f16_f32_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline("matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_len, matmul_f16_f32_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline("matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_len, matmul_f16_f32_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_l_len, matmul_f16_f32_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_m_len, matmul_f16_f32_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_s_len, matmul_f16_f32_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_len, matmul_f16_f32_aligned_l_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_len, matmul_f16_f32_aligned_m_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_len, matmul_f16_f32_aligned_s_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); } else { - vk_pipeline_matmul_f32_l = ggml_vk_create_pipeline("matmul_f32_l", matmul_f32_l_fp32_len, matmul_f32_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - vk_pipeline_matmul_f32_m = ggml_vk_create_pipeline("matmul_f32_m", matmul_f32_m_fp32_len, matmul_f32_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - vk_pipeline_matmul_f32_s = ggml_vk_create_pipeline("matmul_f32_s", matmul_f32_s_fp32_len, matmul_f32_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - vk_pipeline_matmul_f32_aligned_l = ggml_vk_create_pipeline("matmul_f32_aligned_l", matmul_f32_aligned_l_fp32_len, matmul_f32_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - vk_pipeline_matmul_f32_aligned_m = ggml_vk_create_pipeline("matmul_f32_aligned_m", matmul_f32_aligned_m_fp32_len, matmul_f32_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - vk_pipeline_matmul_f32_aligned_s = 
ggml_vk_create_pipeline("matmul_f32_aligned_s", matmul_f32_aligned_s_fp32_len, matmul_f32_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_l_fp32_len, matmul_f32_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_m_fp32_len, matmul_f32_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_s_fp32_len, matmul_f32_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_l_fp32_len, matmul_f32_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_m_fp32_len, matmul_f32_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_s_fp32_len, matmul_f32_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - vk_pipeline_matmul_f16_l = ggml_vk_create_pipeline("matmul_f16_l", matmul_f16_l_fp32_len, matmul_f16_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - vk_pipeline_matmul_f16_m = ggml_vk_create_pipeline("matmul_f16_m", matmul_f16_m_fp32_len, matmul_f16_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - vk_pipeline_matmul_f16_s = ggml_vk_create_pipeline("matmul_f16_s", matmul_f16_s_fp32_len, matmul_f16_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_l_fp32_len, matmul_f16_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_m_fp32_len, matmul_f16_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_s_fp32_len, matmul_f16_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_l_fp32_len, matmul_f16_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_m_fp32_len, matmul_f16_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_s_fp32_len, matmul_f16_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - vk_pipeline_matmul_f16_aligned_l = ggml_vk_create_pipeline("matmul_f16_aligned_l", matmul_f16_aligned_l_fp32_len, matmul_f16_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - vk_pipeline_matmul_f16_aligned_m = ggml_vk_create_pipeline("matmul_f16_aligned_m", matmul_f16_aligned_m_fp32_len, matmul_f16_aligned_m_fp32_data, "main", 3, 14 * 
sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - vk_pipeline_matmul_f16_aligned_s = ggml_vk_create_pipeline("matmul_f16_aligned_s", matmul_f16_aligned_s_fp32_len, matmul_f16_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - - vk_pipeline_matmul_f16_f32_l = ggml_vk_create_pipeline("matmul_f16_f32_l", matmul_f16_f32_l_fp32_len, matmul_f16_f32_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - vk_pipeline_matmul_f16_f32_m = ggml_vk_create_pipeline("matmul_f16_f32_m", matmul_f16_f32_m_fp32_len, matmul_f16_f32_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - vk_pipeline_matmul_f16_f32_s = ggml_vk_create_pipeline("matmul_f16_f32_s", matmul_f16_f32_s_fp32_len, matmul_f16_f32_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - vk_pipeline_matmul_f16_f32_aligned_l = ggml_vk_create_pipeline("matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_fp32_len, matmul_f16_f32_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - vk_pipeline_matmul_f16_f32_aligned_m = ggml_vk_create_pipeline("matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_fp32_len, matmul_f16_f32_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - vk_pipeline_matmul_f16_f32_aligned_s = ggml_vk_create_pipeline("matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_fp32_len, matmul_f16_f32_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_l_fp32_len, matmul_f16_f32_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_m_fp32_len, matmul_f16_f32_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_s_fp32_len, matmul_f16_f32_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_l_fp32_len, matmul_f16_f32_aligned_l_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_m_fp32_len, matmul_f16_f32_aligned_m_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_s_fp32_len, matmul_f16_f32_aligned_s_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); } - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16] = ggml_vk_create_pipeline("mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("mul_mat_vec_q4_0_f32", mul_mat_vec_q4_0_f32_len, mul_mat_vec_q4_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("mul_mat_vec_q4_1_f32", mul_mat_vec_q4_1_f32_len, mul_mat_vec_q4_1_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_0] = 
ggml_vk_create_pipeline("mul_mat_vec_q5_0_f32", mul_mat_vec_q5_0_f32_len, mul_mat_vec_q5_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("mul_mat_vec_q5_1_f32", mul_mat_vec_q5_1_f32_len, mul_mat_vec_q5_1_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("mul_mat_vec_q8_0_f32", mul_mat_vec_q8_0_f32_len, mul_mat_vec_q8_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q2_K] = ggml_vk_create_pipeline("mul_mat_vec_q2_K_f32", mul_mat_vec_q2_K_f32_len, mul_mat_vec_q2_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q3_K] = ggml_vk_create_pipeline("mul_mat_vec_q3_K_f32", mul_mat_vec_q3_K_f32_len, mul_mat_vec_q3_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_K] = ggml_vk_create_pipeline("mul_mat_vec_q4_K_f32", mul_mat_vec_q4_K_f32_len, mul_mat_vec_q4_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K] = ggml_vk_create_pipeline("mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); - vk_pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K] = ggml_vk_create_pipeline("mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32", mul_mat_vec_q4_0_f32_len, mul_mat_vec_q4_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32", mul_mat_vec_q4_1_f32_len, mul_mat_vec_q4_1_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32", mul_mat_vec_q5_0_f32_len, mul_mat_vec_q5_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32", mul_mat_vec_q5_1_f32_len, mul_mat_vec_q5_1_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32", mul_mat_vec_q8_0_f32_len, mul_mat_vec_q8_0_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32", mul_mat_vec_q2_K_f32_len, mul_mat_vec_q2_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32", mul_mat_vec_q3_K_f32_len, mul_mat_vec_q3_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32", mul_mat_vec_q4_K_f32_len, mul_mat_vec_q4_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K], 
"mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(int), {1, 1, 1}, {}, 1); // dequant shaders - vk_pipeline_dequant[GGML_TYPE_F32] = ggml_vk_create_pipeline("f32_to_f16", f32_to_f16_len, f32_to_f16_data, "main", 2, 4 * sizeof(int), {64, 1, 1}, {}, 1); - - vk_pipeline_dequant[GGML_TYPE_F16] = ggml_vk_create_pipeline("dequant_f16", dequant_f16_len, dequant_f16_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q2_K] = ggml_vk_create_pipeline("dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q3_K] = ggml_vk_create_pipeline("dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q4_K] = ggml_vk_create_pipeline("dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q5_K] = ggml_vk_create_pipeline("dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); - vk_pipeline_dequant[GGML_TYPE_Q6_K] = ggml_vk_create_pipeline("dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", f32_to_f16_len, f32_to_f16_data, "main", 2, 4 * sizeof(int), { 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_F16 ], "dequant_f16", dequant_f16_len, dequant_f16_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, 
dequant_q8_0_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 4 * sizeof(int), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 4 * sizeof(int), {256 * 64, 1, 1}, {}, 1); // get_rows - vk_pipeline_get_rows[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_F16] = ggml_vk_create_pipeline("get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q4_0] = ggml_vk_create_pipeline("get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, 
sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q4_1] = ggml_vk_create_pipeline("get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q5_0] = ggml_vk_create_pipeline("get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q5_1] = ggml_vk_create_pipeline("get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_get_rows_f32[GGML_TYPE_Q8_0] = ggml_vk_create_pipeline("get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_matmul_split_k_reduce = ggml_vk_create_pipeline("split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1); - vk_pipeline_mul_mat_vec_p021_f16_f32 = ggml_vk_create_pipeline("mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1); - vk_pipeline_mul_mat_vec_nc_f16_f32 = ggml_vk_create_pipeline("mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1); - vk_pipeline_norm_f32 = ggml_vk_create_pipeline("norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - vk_pipeline_rms_norm_f32 = ggml_vk_create_pipeline("rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 
1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - vk_pipeline_cpy_f32_f32 = ggml_vk_create_pipeline("cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_cpy_f32_f16 = ggml_vk_create_pipeline("cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_cpy_f16_f16 = ggml_vk_create_pipeline("cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_add_f32 = ggml_vk_create_pipeline("add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_mul_f32 = ggml_vk_create_pipeline("mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_scale_f32 = ggml_vk_create_pipeline("scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_sqr_f32 = ggml_vk_create_pipeline("sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_clamp_f32 = ggml_vk_create_pipeline("clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_gelu_f32 = ggml_vk_create_pipeline("gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_silu_f32 = ggml_vk_create_pipeline("silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_relu_f32 = ggml_vk_create_pipeline("relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, 
ctx->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_diag_mask_inf_f32 = ggml_vk_create_pipeline("diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1); - vk_pipeline_soft_max_f32 = ggml_vk_create_pipeline("soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - vk_pipeline_rope_f32 = ggml_vk_create_pipeline("rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - vk_pipeline_rope_f16 = ggml_vk_create_pipeline("rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - vk_pipeline_rope_neox_f32 = ggml_vk_create_pipeline("rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); - vk_pipeline_rope_neox_f16 = ggml_vk_create_pipeline("rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); } -void ggml_vk_init() { +static void ggml_vk_print_gpu_info(size_t idx) { + GGML_ASSERT(idx < vk_instance.device_indices.size()); + size_t dev_num = vk_instance.device_indices[idx]; #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_init()" << std::endl; + std::cerr << "ggml_vk_print_gpu_info(" << dev_num << ")" << std::endl; #endif - static bool initialized = false; + GGML_ASSERT(vk_instance.initialized); - if (initialized) { - return; + std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); + + if (dev_num >= devices.size()) { + std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist." << std::endl; + throw std::runtime_error("Device not found"); } - initialized = true; + vk::PhysicalDevice physical_device = devices[dev_num]; + std::vector ext_props = physical_device.enumerateDeviceExtensionProperties(); - const char* GGML_VULKAN_DEVICE = getenv("GGML_VULKAN_DEVICE"); - int dev_num = (GGML_VULKAN_DEVICE == NULL ? 
0 : atoi(GGML_VULKAN_DEVICE)); + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceMaintenance3Properties props3; + vk::PhysicalDeviceSubgroupProperties subgroup_props; + props2.pNext = &props3; + props3.pNext = &subgroup_props; + physical_device.getProperties2(&props2); + + const size_t subgroup_size = subgroup_props.subgroupSize; + const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; + + bool fp16_storage = false; + bool fp16_compute = false; + + for (auto properties : ext_props) { + if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { + fp16_storage = true; + } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { + fp16_compute = true; + } + } + + const char* GGML_VULKAN_DISABLE_F16 = getenv("GGML_VULKAN_DISABLE_F16"); + bool force_disable_f16 = GGML_VULKAN_DISABLE_F16 != nullptr; + + bool fp16 = !force_disable_f16 && fp16_storage && fp16_compute; + + vk::PhysicalDeviceFeatures device_features = physical_device.getFeatures(); + + VkPhysicalDeviceFeatures2 device_features2; + device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + device_features2.pNext = nullptr; + device_features2.features = (VkPhysicalDeviceFeatures)device_features; + + VkPhysicalDeviceVulkan11Features vk11_features; + vk11_features.pNext = nullptr; + vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; + device_features2.pNext = &vk11_features; + + VkPhysicalDeviceVulkan12Features vk12_features; + vk12_features.pNext = nullptr; + vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; + vk11_features.pNext = &vk12_features; + + vkGetPhysicalDeviceFeatures2(physical_device, &device_features2); + + fp16 = fp16 && vk12_features.shaderFloat16; + + std::string device_name = props2.properties.deviceName.data(); + std::cerr << GGML_VK_NAME << idx << ": " << device_name << " | uma: " << uma << " | fp16: " << fp16 << " | warp size: " << subgroup_size << std::endl; + + if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { + std::cerr << "ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want." 
<< std::endl; + } +} + +void ggml_vk_instance_init() { + if (vk_instance_initialized) { + return; + } +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_instance_init()" << std::endl; +#endif vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION }; const std::vector<const char*> layers = { @@ -989,12 +1106,55 @@ void ggml_vk_init() { validation_features.setPNext(nullptr); instance_create_info.setPNext(&validation_features); -std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; + std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; #endif - vk_instance = vk::createInstance(instance_create_info); + vk_instance.instance = vk::createInstance(instance_create_info); - vk_device.physical_device = vk_instance.enumeratePhysicalDevices()[dev_num]; - std::vector<vk::ExtensionProperties> ext_props = vk_device.physical_device.enumerateDeviceExtensionProperties(); + memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES); + + size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size(); + + // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan + char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES"); + if (devices_env != nullptr) { + std::string devices(devices_env); + std::replace(devices.begin(), devices.end(), ',', ' '); + + std::stringstream ss(devices); + size_t tmp; + while (ss >> tmp) { + if(tmp >= num_available_devices) { + std::cerr << "ggml_vulkan: Invalid device index " << tmp << " in GGML_VK_VISIBLE_DEVICES." << std::endl; + throw std::runtime_error("Invalid Vulkan device index"); + } + vk_instance.device_indices.push_back(tmp); + } + } else { + vk_instance.device_indices.push_back(0); + } + + vk_instance_initialized = true; +} + +void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { + GGML_ASSERT(idx < vk_instance.device_indices.size()); + size_t dev_num = vk_instance.device_indices[idx]; +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_init(" << ctx->name << ", " << dev_num << ")" << std::endl; +#endif + ggml_vk_instance_init(); + + std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices(); + + if (dev_num >= devices.size()) { + std::cerr << "ggml_vulkan: Device with index " << dev_num << " does not exist."
<< std::endl; + throw std::runtime_error("Device not found"); + } + + vk_instance.devices[idx] = std::make_shared<vk_device>(); + ctx->device = vk_instance.devices[idx]; + ctx->device.lock()->physical_device = devices[dev_num]; + std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties(); bool maintenance4_support = false; @@ -1014,18 +1174,18 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; if (maintenance4_support) { subgroup_props.pNext = &props4; } - vk_device.physical_device.getProperties2(&props2); - vk_device.properties = props2.properties; + ctx->device.lock()->physical_device.getProperties2(&props2); + ctx->device.lock()->properties = props2.properties; if (maintenance4_support) { - vk_device.max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize); + ctx->device.lock()->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize); } else { - vk_device.max_memory_allocation_size = props3.maxMemoryAllocationSize; + ctx->device.lock()->max_memory_allocation_size = props3.maxMemoryAllocationSize; } - vk_device.vendor_id = vk_device.properties.vendorID; - vk_device.subgroup_size = subgroup_props.subgroupSize; - vk_device.uma = vk_device.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; + ctx->device.lock()->vendor_id = ctx->device.lock()->properties.vendorID; + ctx->device.lock()->subgroup_size = subgroup_props.subgroupSize; + ctx->device.lock()->uma = ctx->device.lock()->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; bool fp16_storage = false; bool fp16_compute = false; @@ -1039,31 +1199,31 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; } const char* GGML_VULKAN_DISABLE_F16 = getenv("GGML_VULKAN_DISABLE_F16"); - bool force_disable_f16 = GGML_VULKAN_DISABLE_F16 != NULL; + bool force_disable_f16 = GGML_VULKAN_DISABLE_F16 != nullptr; - vk_device.fp16 = !force_disable_f16 && fp16_storage && fp16_compute; + ctx->device.lock()->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; - std::vector<vk::QueueFamilyProperties> queue_family_props = vk_device.physical_device.getQueueFamilyProperties(); + std::vector<vk::QueueFamilyProperties> queue_family_props = ctx->device.lock()->physical_device.getQueueFamilyProperties(); // Try to find a non-graphics compute queue and transfer-focused queues const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1); const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1); const float priorities[] = { 1.0f, 1.0f }; - const bool single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1; + ctx->device.lock()->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1; std::vector<vk::DeviceQueueCreateInfo> device_queue_create_infos; if (compute_queue_family_index != transfer_queue_family_index) { device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1}); - } else if(!single_queue) { + } else if(!ctx->device.lock()->single_queue) {
device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities}); } else { device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); } vk::DeviceCreateInfo device_create_info; std::vector device_extensions; - vk::PhysicalDeviceFeatures device_features = vk_device.physical_device.getFeatures(); + vk::PhysicalDeviceFeatures device_features = ctx->device.lock()->physical_device.getFeatures(); VkPhysicalDeviceFeatures2 device_features2; device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; @@ -1080,13 +1240,13 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; vk11_features.pNext = &vk12_features; - vkGetPhysicalDeviceFeatures2(vk_device.physical_device, &device_features2); + vkGetPhysicalDeviceFeatures2(ctx->device.lock()->physical_device, &device_features2); - vk_device.fp16 = vk_device.fp16 && vk12_features.shaderFloat16; + ctx->device.lock()->fp16 = ctx->device.lock()->fp16 && vk12_features.shaderFloat16; if (!vk11_features.storageBuffer16BitAccess) { - std::cerr << "ggml_vulkan: device does not support 16-bit storage" << std::endl; - GGML_ASSERT(false); + std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl; + throw std::runtime_error("Unsupported device"); } device_extensions.push_back("VK_KHR_16bit_storage"); @@ -1095,10 +1255,11 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; device_extensions.push_back("VK_KHR_shader_non_semantic_info"); #endif - if (vk_device.fp16) { + if (ctx->device.lock()->fp16) { device_extensions.push_back("VK_KHR_shader_float16_int8"); } - std::cerr << "ggml_vulkan: Using " << vk_device.properties.deviceName << " | uma: " << vk_device.uma << " | fp16: " << vk_device.fp16 << " | warp size: " << vk_device.subgroup_size << std::endl; + ctx->device.lock()->name = ctx->device.lock()->properties.deviceName.data(); + device_create_info = { vk::DeviceCreateFlags(), device_queue_create_infos, @@ -1106,28 +1267,32 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; device_extensions }; device_create_info.setPNext(&device_features2); - vk_device.device = vk_device.physical_device.createDevice(device_create_info); + ctx->device.lock()->device = ctx->device.lock()->physical_device.createDevice(device_create_info); - vk_device.descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN; + ctx->device.lock()->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN; // Shaders - ggml_vk_load_shaders(); + ggml_vk_load_shaders(ctx); // Queues - vk_device.compute_queue = ggml_vk_create_queue(compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }); - if (!single_queue) { + ggml_vk_create_queue(ctx, ctx->device.lock()->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }); + if (!ctx->device.lock()->single_queue) { const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 
1 : 0; - vk_device.transfer_queue = ggml_vk_create_queue(transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }); + ggml_vk_create_queue(ctx, ctx->device.lock()->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }); } else { - vk_device.transfer_queue = vk_device.compute_queue; + // TODO: Use pointer or reference to avoid copy + ctx->device.lock()->transfer_queue = ctx->device.lock()->compute_queue; } - vk_fence = vk_device.device.createFence({}); + ctx->fence = ctx->device.lock()->device.createFence({}); - vk_ctx = nullptr; - vk_transfer_ctx = nullptr; + ctx->compute_ctx = nullptr; + ctx->transfer_ctx = nullptr; - vk_disable = false; + ctx->disable = false; + ctx->initialized = true; + + ctx->idx = idx; #ifdef GGML_VULKAN_CHECK_RESULTS const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS"); @@ -1137,7 +1302,7 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl; #endif } -static vk_pipeline* ggml_vk_get_to_fp16(ggml_type type) { +static vk_pipeline* ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_get_to_fp16()" << std::endl; #endif @@ -1158,10 +1323,10 @@ static vk_pipeline* ggml_vk_get_to_fp16(ggml_type type) { return nullptr; } - return &vk_pipeline_dequant[type]; + return &ctx->pipeline_dequant[type]; } -static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_type type) { +static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type type) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl; #endif @@ -1182,15 +1347,10 @@ static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_type type) { return nullptr; } - return &vk_pipeline_dequant_mul_mat_vec_f32[type]; + return &ctx->pipeline_dequant_mul_mat_vec_f32[type]; } -// buffer pool for vulkan -#define MAX_VK_BUFFERS 256 - -static vk_buffer g_vk_buffer_pool[MAX_VK_BUFFERS]; - -static vk_buffer ggml_vk_pool_malloc(size_t size) { +static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_pool_malloc(" << size << ")" << std::endl; #endif @@ -1199,98 +1359,95 @@ static vk_buffer ggml_vk_pool_malloc(size_t size) { int worst_i = -1; size_t worst_size = 0; //largest unused buffer seen so far for (int i = 0; i < MAX_VK_BUFFERS; ++i) { - vk_buffer &b = g_vk_buffer_pool[i]; - if (b.size > 0 && b.size >= size && b.size < best_size) { + vk_buffer &b = ctx->buffer_pool[i]; + if (b != nullptr && b->size >= size && b->size < best_size) { best_i = i; - best_size = b.size; + best_size = b->size; } - if (b.size > 0 && b.size > worst_size) { + if (b != nullptr && b->size > worst_size) { worst_i = i; - worst_size = b.size; + worst_size = b->size; } } if(best_i != -1) { //found the smallest buffer that fits our needs - vk_buffer b = g_vk_buffer_pool[best_i]; - g_vk_buffer_pool[best_i].size = 0; + vk_buffer b = ctx->buffer_pool[best_i]; + ctx->buffer_pool[best_i].reset(); return b; } if(worst_i != -1) { //no buffer that fits our needs, resize largest one to save memory - vk_buffer& b = g_vk_buffer_pool[worst_i]; + vk_buffer& b = ctx->buffer_pool[worst_i]; ggml_vk_destroy_buffer(b); } - return ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eDeviceLocal); + return ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal); } -static void ggml_vk_pool_free(vk_buffer& buffer) { +static void 
ggml_vk_pool_free(ggml_backend_vk_context * ctx, vk_buffer& buffer) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_pool_free(" << buffer.size << ")" << std::endl; + std::cerr << "ggml_vk_pool_free(" << buffer->size << ")" << std::endl; #endif for (int i = 0; i < MAX_VK_BUFFERS; ++i) { - vk_buffer& b = g_vk_buffer_pool[i]; - if (b.size == 0) { + vk_buffer& b = ctx->buffer_pool[i]; + if (b == nullptr) { b = buffer; - // Set owning queue family index to ignored to avoid synchronization on next use - b.qf_owner = VK_QUEUE_FAMILY_IGNORED; return; } } - fprintf(stderr, "WARNING: vk buffer pool full, increase MAX_VK_BUFFERS\n"); + std::cerr << "ggml_vulkan: WARNING: vk buffer pool full, increase MAX_VK_BUFFERS" << std::endl; ggml_vk_destroy_buffer(buffer); } // Returns an available temporary buffer that may only be used temporarily, it will be reused -static vk_buffer ggml_vk_create_buffer_temp(size_t size) { +static vk_buffer ggml_vk_create_buffer_temp(ggml_backend_vk_context * ctx, size_t size) { // Try to find existing temp buffer with enough capacity - for (auto& buffer : vk_gc.temp_buffers) { - if (buffer.size >= size) { + for (auto& buffer : ctx->gc.temp_buffers) { + if (buffer->size >= size) { return buffer; } } // Otherwise create new buffer - vk_buffer buf = ggml_vk_pool_malloc(size); - vk_gc.temp_buffers.push_back(buf); + vk_buffer buf = ggml_vk_pool_malloc(ctx, size); + ctx->gc.temp_buffers.push_back(buf); return buf; } -static void * ggml_vk_host_malloc(size_t size) { +static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl; #endif - vk_buffer buf = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); - if(!(buf.memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) { + if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) { fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n", size/1024.0/1024.0); - buf.size = 0; - vk_device.device.freeMemory(buf.device_memory); - vk_device.device.destroyBuffer(buf.buffer); + ctx->device.lock()->device.freeMemory(buf->device_memory); + ctx->device.lock()->device.destroyBuffer(buf->buffer); return nullptr; } - vk_pinned_memory.push_back(std::make_tuple(buf.ptr, size, buf)); + ctx->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf)); - return buf.ptr; + return buf->ptr; } -static void ggml_vk_host_free(void* ptr) { +static void ggml_vk_host_free(ggml_backend_vk_context * ctx, void* ptr) { if (ptr == nullptr) { return; } #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_host_free(" << ptr << ")" << std::endl; #endif - vk_buffer* buf = nullptr; + vk_buffer buf; size_t index; - for (size_t i = 0; i < vk_pinned_memory.size(); i++) { - const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]); - const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]); + for (size_t i = 0; i < ctx->pinned_memory.size(); i++) { + const uint8_t* addr = (const uint8_t*) std::get<0>(ctx->pinned_memory[i]); + const uint8_t* endr = addr + std::get<1>(ctx->pinned_memory[i]); if (ptr >= addr && ptr < endr) { - buf = &std::get<2>(vk_pinned_memory[i]); + buf = std::get<2>(ctx->pinned_memory[i]); index = i; 
break; } @@ -1300,28 +1457,28 @@ static void ggml_vk_host_free(void* ptr) { return; } - ggml_vk_destroy_buffer(*buf); + ggml_vk_destroy_buffer(buf); - vk_pinned_memory.erase(vk_pinned_memory.begin() + index); + ctx->pinned_memory.erase(ctx->pinned_memory.begin() + index); } -static void ggml_vk_host_get(const void * ptr, vk_buffer *& buf, size_t& buf_offset) { +static void ggml_vk_host_get(ggml_backend_vk_context * ctx, const void * ptr, vk_buffer& buf, size_t& buf_offset) { buf = nullptr; buf_offset = 0; - for (size_t i = 0; i < vk_pinned_memory.size(); i++) { - const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]); - const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]); + for (size_t i = 0; i < ctx->pinned_memory.size(); i++) { + const uint8_t* addr = (const uint8_t*) std::get<0>(ctx->pinned_memory[i]); + const uint8_t* endr = addr + std::get<1>(ctx->pinned_memory[i]); + if (ptr >= addr && ptr < endr) { - buf = &std::get<2>(vk_pinned_memory[i]); + buf = std::get<2>(ctx->pinned_memory[i]); buf_offset = ((const uint8_t *)ptr) - addr; break; } } } -static vk_submission ggml_vk_begin_submission(vk_queue& q, bool one_time = true) { +static vk_submission ggml_vk_begin_submission(ggml_backend_vk_context * ctx, vk_queue& q, bool one_time = true) { vk_submission s; - s.buffer = ggml_vk_create_cmd_buffer(q); + s.buffer = ggml_vk_create_cmd_buffer(ctx, q); if (one_time) { s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit }); } else { @@ -1331,7 +1488,7 @@ static vk_submission ggml_vk_begin_submission(vk_queue& q, bool one_time = true) return s; } -static void ggml_vk_dispatch_pipeline(vk_context * ctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) { +static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) { const uint32_t wg0 = CEIL_DIV(elements[0], pipeline.wg_denoms[0]); const uint32_t wg1 = CEIL_DIV(elements[1], pipeline.wg_denoms[1]); const uint32_t wg2 = CEIL_DIV(elements[2], pipeline.wg_denoms[2]); @@ -1344,22 +1501,22 @@ static void ggml_vk_dispatch_pipeline(vk_context * ctx, vk_pipeline& pipeline, s GGML_ASSERT(buffers.size() == pipeline.parameter_count); vk::DescriptorSet& descriptor_set = pipeline.descriptor_sets[pipeline.descriptor_set_idx++]; for (uint32_t i = 0; i < pipeline.parameter_count; i++) { - descriptor_buffer_infos.push_back({buffers[i].buffer.buffer, buffers[i].offset, buffers[i].size}); + descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size}); } for (uint32_t i = 0; i < pipeline.parameter_count; i++) { write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]}); } - vk_device.device.updateDescriptorSets(write_descriptor_sets, {}); + ctx->device.lock()->device.updateDescriptorSets(write_descriptor_sets, {}); - ctx->s->buffer.pushConstants(pipeline.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); - ctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline.pipeline); - ctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, + subctx->s->buffer.pushConstants(pipeline.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); + subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline.pipeline); +
subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipeline.layout, 0, { descriptor_set }, {}); - ctx->s->buffer.dispatch(wg0, wg1, wg2); + subctx->s->buffer.dispatch(wg0, wg1, wg2); } static void ggml_vk_end_submission(vk_submission& s, std::vector wait_semaphores, std::vector signal_semaphores) { @@ -1381,16 +1538,16 @@ static void ggml_vk_ctx_end(vk_context * ctx) { ctx->s = nullptr; } -static void ggml_vk_ctx_begin(vk_context * ctx) { +static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_ctx_begin(" << ctx << ")" << std::endl; #endif - if (ctx->s != nullptr) { - ggml_vk_ctx_end(ctx); + if (subctx->s != nullptr) { + ggml_vk_ctx_end(subctx); } - ctx->seqs.push_back({ ggml_vk_begin_submission(*ctx->q) }); - ctx->s = ctx->seqs[ctx->seqs.size() - 1].data(); + subctx->seqs.push_back({ ggml_vk_begin_submission(ctx, *subctx->q) }); + subctx->s = subctx->seqs[subctx->seqs.size() - 1].data(); } static size_t ggml_vk_align_size(size_t width, size_t align) { @@ -1405,14 +1562,14 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect } } -static void ensure_sync_staging_buffer(size_t size) { - if (vk_sync_staging.size < size) { - ggml_vk_destroy_buffer(vk_sync_staging); - vk_sync_staging = ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); +static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) { + if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) { + ggml_vk_destroy_buffer(ctx->sync_staging); + ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); } } -static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { +static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl; #endif @@ -1423,9 +1580,9 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size GGML_ASSERT(false); } // Check if src is pinned memory - vk_buffer * buf = nullptr; + vk_buffer buf; size_t buf_offset; - ggml_vk_host_get(tensor->data, buf, buf_offset); + ggml_vk_host_get(ctx, tensor->data, buf, buf_offset); const uint64_t ne0 = tensor->ne[0]; const uint64_t ne1 = tensor->ne[1]; @@ -1471,21 +1628,21 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size } } - ggml_vk_sync_buffers(ctx); - ctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices); + ggml_vk_sync_buffers(subctx); + subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices); return; } // Staging buffer required - vk_buffer * staging = &vk_staging; - size_t staging_offset = vk_staging_offset; + vk_buffer staging = ctx->staging; + size_t staging_offset = ctx->staging_offset; const size_t copy_size = ts*ne/bs; - if (vk_staging.size < vk_staging_offset + copy_size) { + if (ctx->staging->size < ctx->staging_offset + copy_size) { if (sync_staging) { // Create temporary larger buffer - ensure_sync_staging_buffer(copy_size); + ggml_vk_ensure_sync_staging_buffer(ctx, copy_size); - staging = 
&vk_sync_staging; + staging = ctx->sync_staging; staging_offset = 0; } else { GGML_ASSERT(false); @@ -1494,23 +1651,23 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size VkBufferCopy buf_copy{ staging_offset, offset, copy_size }; - ggml_vk_sync_buffers(ctx); - vkCmdCopyBuffer(ctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy); + ggml_vk_sync_buffers(subctx); + vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy); for (uint64_t i3 = 0; i3 < ne3; i3++) { for (uint64_t i2 = 0; i2 < ne2; i2++) { // Find longest contiguous slice if (ne1*nb1 == dstnb2) { - deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &ctx->in_memcpys); + deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2, dstnb2, &subctx->in_memcpys); } else { for (uint64_t i1 = 0; i1 < ne1; i1++) { if (ne0*nb0/bs == dstnb1) { - deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &ctx->in_memcpys); + deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1, (const uint8_t *) tensor->data + buf_offset + i3*nb3 + i2*nb2 + i1*nb1, dstnb1, &subctx->in_memcpys); } else { const uint64_t s_off = buf_offset + i3*nb3 + i2*nb2 + i1*nb1; const uint64_t d_off = staging_offset + i3*dstnb3 + i2*dstnb2 + i1*dstnb1; for (uint64_t i0 = 0; i0 < ne0; i0++) { - deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &ctx->in_memcpys); + deferred_memcpy((uint8_t *)staging->ptr + d_off + i0*dstnb0, (const uint8_t *) tensor->data + s_off + i0*nb0, dstnb0, &subctx->in_memcpys); } } } @@ -1519,19 +1676,22 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size } } -static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) { +static void ggml_vk_buffer_write_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, bool sync_staging = false) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write_2d_async(" << width << ", " << height << ")" << std::endl; #endif + // Make sure ctx owns the buffer + GGML_ASSERT(dst->ctx == ctx); + // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." 
<< std::endl; GGML_ASSERT(false); } // Check if src is pinned memory - vk_buffer * buf = nullptr; + vk_buffer buf = nullptr; size_t buf_offset; - ggml_vk_host_get(src, buf, buf_offset); + ggml_vk_host_get(ctx, src, buf, buf_offset); if (buf != nullptr) { // Memory is pinned, use as staging buffer @@ -1550,8 +1710,8 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size } } - ggml_vk_sync_buffers(ctx); - ctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices); + ggml_vk_sync_buffers(subctx); + subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices); return; } #ifdef GGML_VULKAN_DEBUG @@ -1559,14 +1719,14 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size #endif // Staging buffer required - vk_buffer * staging = &vk_staging; - size_t staging_offset = vk_staging_offset; + vk_buffer staging = ctx->staging; + size_t staging_offset = ctx->staging_offset; const size_t copy_size = width*height; - if (vk_staging.size < vk_staging_offset + copy_size) { + if (ctx->staging == nullptr || ctx->staging->size < ctx->staging_offset + copy_size) { if (sync_staging) { - ensure_sync_staging_buffer(copy_size); + ggml_vk_ensure_sync_staging_buffer(ctx, copy_size); - staging = &vk_sync_staging; + staging = ctx->sync_staging; staging_offset = 0; } else { GGML_ASSERT(false); @@ -1578,26 +1738,26 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size offset, copy_size}; - ggml_vk_sync_buffers(ctx); - vkCmdCopyBuffer(ctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy); + ggml_vk_sync_buffers(subctx); + vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy); if (width == spitch) { - deferred_memcpy((uint8_t *)staging->ptr + staging_offset, src, width * height, &ctx->in_memcpys); + deferred_memcpy((uint8_t *)staging->ptr + staging_offset, src, width * height, &subctx->in_memcpys); } else { for (size_t i = 0; i < height; i++) { - deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i * width, (const uint8_t *) src + i * spitch, width, &ctx->in_memcpys); + deferred_memcpy((uint8_t *)staging->ptr + staging_offset + i * width, (const uint8_t *) src + i * spitch, width, &subctx->in_memcpys); } } } -static void ggml_vk_buffer_write_async(vk_context * ctx, vk_buffer* dst, size_t offset, const void * src, size_t size, bool sync_staging = false) { +static void ggml_vk_buffer_write_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const void * src, size_t size, bool sync_staging = false) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl; #endif - return ggml_vk_buffer_write_2d_async(ctx, dst, offset, src, size, size, 1, sync_staging); + return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, size, size, 1, sync_staging); } -static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) { +static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write_2d(" << width << ", " << height << ")" << std::endl; #endif @@ -1609,39 +1769,42 @@ static void ggml_vk_buffer_write_2d(vk_buffer* dst, size_t offset, const void * memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width); } } else { - vk_context * ctx = 
ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(ctx); - ggml_vk_buffer_write_2d_async(ctx, dst, offset, src, spitch, width, height, true); - ggml_vk_ctx_end(ctx); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, subctx); + ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, spitch, width, height, true); + ggml_vk_ctx_end(subctx); - for (auto& cpy : ctx->in_memcpys) { + for (auto& cpy : subctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); } } -static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src, size_t size) { +static void ggml_vk_buffer_write(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, const void * src, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_write(" << size << ")" << std::endl; #endif - ggml_vk_buffer_write_2d(dst, offset, src, 0, size, 1); + ggml_vk_buffer_write_2d(ctx, dst, offset, src, 0, size, 1); } -static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { +static void ggml_vk_buffer_read_2d_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t spitch, size_t dpitch, size_t width, size_t height, bool sync_staging = false) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_read_2d_async(offset=" << offset << ", width=" << width << ", height=" << height << ")" << std::endl; #endif GGML_ASSERT(width > 0); GGML_ASSERT(height > 0); - GGML_ASSERT(src->size > 0); + GGML_ASSERT(src != nullptr); + // Make sure ctx owns the buffer + GGML_ASSERT(src->ctx == ctx); + // Check if dst is pinned memory - vk_buffer * buf = nullptr; + vk_buffer buf = nullptr; size_t buf_offset; - ggml_vk_host_get(dst, buf, buf_offset); + ggml_vk_host_get(ctx, dst, buf, buf_offset); std::vector slices(1); if (width == spitch && width == dpitch) { @@ -1660,8 +1823,8 @@ static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_ if (buf != nullptr) { // Memory is pinned, use as staging buffer - ggml_vk_sync_buffers(ctx); - ctx->s->buffer.copyBuffer(src->buffer, buf->buffer, slices); + ggml_vk_sync_buffers(subctx); + subctx->s->buffer.copyBuffer(src->buffer, buf->buffer, slices); return; } @@ -1670,30 +1833,30 @@ static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_ #endif // Fall back to staging buffer - vk_buffer * staging = &vk_staging; + vk_buffer staging = ctx->staging; const size_t copy_size = dpitch * height; - if (vk_staging.size < vk_staging_offset + copy_size) { + if (ctx->staging == nullptr || ctx->staging->size < ctx->staging_offset + copy_size) { if (sync_staging) { // Create temporary larger buffer - ensure_sync_staging_buffer(copy_size); + ggml_vk_ensure_sync_staging_buffer(ctx, copy_size); - staging = &vk_sync_staging; + staging = ctx->sync_staging; } else { GGML_ASSERT(false); } } - ggml_vk_sync_buffers(ctx); - ctx->s->buffer.copyBuffer(src->buffer, staging->buffer, slices); + 
ggml_vk_sync_buffers(subctx); + subctx->s->buffer.copyBuffer(src->buffer, staging->buffer, slices); - deferred_memcpy(dst, staging->ptr, copy_size, &ctx->out_memcpys); + deferred_memcpy(dst, staging->ptr, copy_size, &subctx->out_memcpys); } -static void ggml_vk_buffer_read_async(vk_context * ctx, vk_buffer* src, size_t offset, void * dst, size_t size, bool sync_staging = false) { - return ggml_vk_buffer_read_2d_async(ctx, src, offset, dst, size, size, size, 1, sync_staging); +static void ggml_vk_buffer_read_async(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, void * dst, size_t size, bool sync_staging = false) { + return ggml_vk_buffer_read_2d_async(ctx, subctx, src, offset, dst, size, size, size, 1, sync_staging); } -static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_t size) { +static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, size_t offset, void * dst, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_read(" << offset << ", " << size << ")" << std::endl; #endif @@ -1702,61 +1865,88 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_ memcpy(dst, (uint8_t *) src->ptr + offset, size); } else { - vk_context * ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(ctx); - ggml_vk_buffer_read_async(ctx, src, offset, dst, size, true); - ggml_vk_ctx_end(ctx); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, subctx); + ggml_vk_buffer_read_async(ctx, subctx, src, offset, dst, size, true); + ggml_vk_ctx_end(subctx); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_buffer_read waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); - for (auto& cpy : ctx->out_memcpys) { + for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } } } -static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer * dst, size_t dst_offset, vk_buffer * src, size_t src_offset, size_t size) { +static void ggml_vk_buffer_copy_async(vk_context * ctx, vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_copy_async(" << size << ")" << std::endl; #endif + // Make sure both buffers are on same ctx + GGML_ASSERT(src->ctx == dst->ctx); + VkBufferCopy bc{ src_offset, dst_offset, size }; vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc); } -static void ggml_vk_buffer_copy(vk_buffer * dst, size_t dst_offset, vk_buffer * src, size_t src_offset, size_t size) { +static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { + if (src->ctx == dst->ctx) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_buffer_copy(" << size << ")" << std::endl; + std::cerr << "ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")" << std::endl; #endif - VkBufferCopy bc{ src_offset, dst_offset, size }; + // Copy within the device + ggml_backend_vk_context * ctx = src->ctx; - vk_context * ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(ctx); - vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc); - ggml_vk_buffer_copy_async(ctx, dst, 
dst_offset, src, src_offset, size); - ggml_vk_ctx_end(ctx); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); - vk_device.device.resetFences({ vk_fence }); + VkBufferCopy bc{ src_offset, dst_offset, size }; + + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, subctx); + ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size); + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); + } else { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl; +#endif + // Copy device to device + ggml_backend_vk_context * src_ctx = src->ctx; + ggml_backend_vk_context * dst_ctx = dst->ctx; + + ggml_vk_ensure_sync_staging_buffer(src_ctx, size); + ggml_vk_ensure_sync_staging_buffer(dst_ctx, size); + + // Copy to src staging buffer + ggml_vk_buffer_copy(src_ctx->sync_staging, 0, src, src_offset, size); + // memcpy to dst staging buffer + memcpy(dst_ctx->sync_staging->ptr, src_ctx->sync_staging->ptr, size); + // Copy to dst buffer + ggml_vk_buffer_copy(dst, dst_offset, dst_ctx->sync_staging, 0, size); + } } -static void ggml_vk_buffer_memset(vk_buffer* dst, size_t offset, uint32_t c, size_t size) { +static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, size_t offset, uint32_t c, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")" << std::endl; #endif - vk_context * ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(ctx); - ctx->s->buffer.fillBuffer(dst->buffer, offset, size, c); - ggml_vk_ctx_end(ctx); + // Make sure ctx owns the buffer + GGML_ASSERT(dst->ctx == ctx); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "vk_memset waitForFences"); - vk_device.device.resetFences({ vk_fence }); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, subctx); + subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c); + ggml_vk_ctx_end(subctx); + + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_memset waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); } -static void ggml_vk_h2d_tensor_2d(vk_context * ctx, vk_buffer * dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) { +static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_h2d_tensor_2d(dst=" << dst << ", offset=" << offset << ", src=" << src << ", i3=" << i3 << ", i2=" << i2 << ", i1=" << i1 << ")" << std::endl; #endif @@ -1773,20 +1963,20 @@ static void ggml_vk_h2d_tensor_2d(vk_context * ctx, vk_buffer * dst, size_t offs const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3); if (nb0 == ts && nb1 == row_length) { - return ggml_vk_buffer_write_async(ctx, dst, offset, x, i1*nb1); + return ggml_vk_buffer_write_async(ctx, subctx, dst, offset, x, i1*nb1); } if (nb0 == ts && (i1 == ne1 || 
!ggml_is_permuted(src))) { - return ggml_vk_buffer_write_2d_async(ctx, dst, offset, x, nb1, row_length, i1); + return ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, x, nb1, row_length, i1); } GGML_ASSERT(i3 == 0); GGML_ASSERT(i2 == 0); GGML_ASSERT(i1 == (uint64_t) ggml_nrows(src)); - return ggml_vk_buffer_write_nc_async(ctx, dst, offset, src); + return ggml_vk_buffer_write_nc_async(ctx, subctx, dst, offset, src); } -static void ggml_vk_d2h_tensor_2d(vk_context * ctx, vk_buffer * src, size_t offset, const ggml_tensor * dst) { +static void ggml_vk_d2h_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& src, size_t offset, const ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_d2h_tensor_2d()" << std::endl; #endif @@ -1804,10 +1994,10 @@ static void ggml_vk_d2h_tensor_2d(vk_context * ctx, vk_buffer * src, size_t offs const size_t row_length = ts*ne0/bs; if (ggml_is_contiguous(dst)) { - return ggml_vk_buffer_read_async(ctx, src, offset, dst->data, ne1*nb1*ne2*ne3); + return ggml_vk_buffer_read_async(ctx, subctx, src, offset, dst->data, ne1*nb1*ne2*ne3); } if (nb0 == ts) { - return ggml_vk_buffer_read_2d_async(ctx, src, offset, dst->data, nb1, nb1, row_length, ne1*ne2*ne3); + return ggml_vk_buffer_read_2d_async(ctx, subctx, src, offset, dst->data, nb1, nb1, row_length, ne1*ne2*ne3); } GGML_ASSERT(false); } @@ -1829,89 +2019,89 @@ static uint32_t ggml_vk_guess_split_k(int m, int n, int k) { return 1; } -static uint32_t ggml_vk_guess_matmul_pipeline_align(int m, int n) { +static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, int m, int n) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl; #endif if (m <= 32 || n <= 32) { - return vk_pipeline_matmul_f32_aligned_s.align; + return ctx->pipeline_matmul_f32_aligned_s.align; } - if (vk_device.subgroup_size == 64 || m <= 64 || n <= 64) { - return vk_pipeline_matmul_f32_aligned_m.align; + if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { + return ctx->pipeline_matmul_f32_aligned_m.align; } - return vk_pipeline_matmul_f32_aligned_l.align; + return ctx->pipeline_matmul_f32_aligned_l.align; } -static vk_pipeline* ggml_vk_guess_matmul_pipeline(bool bit16_x, bool bit16_y, int m, int n, bool aligned) { +static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")"; #endif if (bit16_x && bit16_y) { - if (vk_device.vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f16_aligned_s : &vk_pipeline_matmul_f16_s; + return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s; } - if (vk_device.subgroup_size == 64 || m <= 64 || n <= 64) { + if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f16_aligned_m : &vk_pipeline_matmul_f16_m; + return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m; } #ifdef GGML_VULKAN_DEBUG std::cerr << " L" << std::endl; #endif - return aligned ? 
&vk_pipeline_matmul_f16_aligned_l : &vk_pipeline_matmul_f16_l; + return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l; } if (bit16_x && !bit16_y) { - if (vk_device.vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f16_f32_aligned_s : &vk_pipeline_matmul_f16_f32_s; + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s; } - if (vk_device.subgroup_size == 64 || m <= 64 || n <= 64) { + if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f16_f32_aligned_m : &vk_pipeline_matmul_f16_f32_m; + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m; } #ifdef GGML_VULKAN_DEBUG std::cerr << " L" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f16_f32_aligned_l : &vk_pipeline_matmul_f16_f32_l; + return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_l : &ctx->pipeline_matmul_f16_f32_l; } if (!bit16_x && bit16_y) { GGML_ASSERT(false); } - if (vk_device.vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f32_aligned_s : &vk_pipeline_matmul_f32_s; + return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s; } - if (vk_device.subgroup_size == 64 || m <= 64 || n <= 64) { + if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f32_aligned_m : &vk_pipeline_matmul_f32_m; + return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m; } #ifdef GGML_VULKAN_DEBUG std::cerr << " L" << std::endl; #endif - return aligned ? &vk_pipeline_matmul_f32_aligned_l : &vk_pipeline_matmul_f32_l; + return aligned ? 
&ctx->pipeline_matmul_f32_aligned_l : &ctx->pipeline_matmul_f32_l; } -static void ggml_vk_matmul(vk_context * ctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d) { +static void ggml_vk_matmul(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_matmul(a: (" << a.buffer.buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer.buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer.buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer.buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl; + std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl; #endif - ggml_vk_sync_buffers(ctx); + ggml_vk_sync_buffers(subctx); if (split_k == 1) { const std::array pc = { m, n, k, stride_a, stride_b, stride_d, k, ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d }; - ggml_vk_dispatch_pipeline(ctx, pipeline, { a, b, d }, pc.size() * sizeof(uint32_t), pc.data(), { m, n, batch }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc.size() * sizeof(uint32_t), pc.data(), { m, n, batch }); return; } @@ -1919,10 +2109,10 @@ static void ggml_vk_matmul(vk_context * ctx, vk_pipeline& pipeline, vk_subbuffer const std::array pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d }; // Make sure enough workgroups get assigned for split k to work - ggml_vk_dispatch_pipeline(ctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, 
pipeline.wg_denoms[0]) * pipeline.wg_denoms[0]) * split_k, n, batch }); - ggml_vk_sync_buffers(ctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline.wg_denoms[0]) * pipeline.wg_denoms[0]) * split_k, n, batch }); + ggml_vk_sync_buffers(subctx); const std::array pc2 = { (uint32_t)(m * n * batch), split_k }; - ggml_vk_dispatch_pipeline(ctx, vk_pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 }); } static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { @@ -1932,32 +2122,32 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -static vk_pipeline * ggml_vk_get_cpy_pipeline(ggml_type from, ggml_type to) { +static vk_pipeline * ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) { if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) { - return &vk_pipeline_cpy_f32_f32; + return &ctx->pipeline_cpy_f32_f32; } if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) { - return &vk_pipeline_cpy_f32_f16; + return &ctx->pipeline_cpy_f32_f16; } if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) { - return &vk_pipeline_cpy_f16_f16; + return &ctx->pipeline_cpy_f16_f16; } std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl; GGML_ASSERT(false); } -static void ggml_vk_cpy_to_contiguous(vk_context * ctx, vk_pipeline * pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type, bool aligned=true) { +static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline * pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type, bool aligned=true) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; - std::cerr << "buffer in size=" << in.buffer.size << ", buffer out size=" << out.buffer.size << ")" << std::endl; + std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl; #endif const int tensor_type_size = ggml_type_size(tensor->type); const int dst_type_size = ggml_type_size(buffer_type); const uint32_t ne = tensor->ne[0] * tensor->ne[1] * tensor->ne[2]; - const uint32_t nb2 = aligned ? ggml_vk_align_size(dst_type_size * tensor->ne[0] * tensor->ne[1], vk_device.properties.limits.minStorageBufferOffsetAlignment) / dst_type_size : tensor->ne[0] * tensor->ne[1]; + const uint32_t nb2 = aligned ? 
ggml_vk_align_size(dst_type_size * tensor->ne[0] * tensor->ne[1], ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size : tensor->ne[0] * tensor->ne[1]; const vk_op_cpy_push_constants pc = { (uint32_t)ne, @@ -1965,11 +2155,11 @@ static void ggml_vk_cpy_to_contiguous(vk_context * ctx, vk_pipeline * pipeline, (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], 1 , (uint32_t)tensor->ne[0] , nb2, 0, }; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { in, out }, sizeof(vk_op_cpy_push_constants), &pc, { ne, 1, 1 }); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { in, out }, sizeof(vk_op_cpy_push_constants), &pc, { ne, 1, 1 }); } -static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -1998,17 +2188,17 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer * d_Qx = nullptr; + vk_buffer d_Qx; size_t qx_buf_offset = 0; - vk_buffer * d_Qy = nullptr; + vk_buffer d_Qy; size_t qy_buf_offset = 0; bool src0_uma = false; bool src1_uma = false; - if (vk_device.uma) { - ggml_vk_host_get(src0->data, d_Qx, qx_buf_offset); - ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset); + if (ctx->device.lock()->uma) { + ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset); + ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); src0_uma = d_Qx != nullptr; src1_uma = d_Qy != nullptr; } @@ -2031,12 +2221,12 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co const int y_ne = ne11 * ne10; const int d_ne = ne11 * ne01; - const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ne01, ne11)); + const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, ne01, ne11)); const bool aligned = ne10 == kpad; const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10); - vk_pipeline * pipeline = ggml_vk_guess_matmul_pipeline(true, !f16_f32_kernel, ne01, ne11, aligned); + vk_pipeline * pipeline = ggml_vk_guess_matmul_pipeline(ctx, true, !f16_f32_kernel, ne01, ne11, aligned); const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); @@ -2044,30 +2234,30 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co const uint64_t y_sz = f16_f32_kernel ? 
sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne; const uint64_t d_sz = sizeof(float) * d_ne; - vk_buffer* d_D = &extra->buffer_gpu; + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset; GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03); - vk_buffer* d_X; + vk_buffer d_X; uint64_t x_buf_offset = 0; - vk_buffer* d_Y; + vk_buffer d_Y; uint64_t y_buf_offset = 0; if (load_x) { - d_Qx = &vk_prealloc_qx; + d_Qx = ctx->prealloc_qx; } else if (!src0_uma) { - d_Qx = &extra_src0->buffer_gpu; + d_Qx = extra_src0->buffer_gpu.lock(); qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); } if (load_y) { - d_Qy = &vk_prealloc_qy; + d_Qy = ctx->prealloc_qy; } else if (!src1_uma) { - d_Qy = &extra_src1->buffer_gpu; + d_Qy = extra_src1->buffer_gpu.lock(); qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { - d_X = &vk_prealloc_x; + d_X = ctx->prealloc_x; GGML_ASSERT(d_X->size >= x_sz * ne02 * ne03); } else { d_X = d_Qx; @@ -2075,7 +2265,7 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co GGML_ASSERT(qx_sz == x_sz); // NOLINT } if (qy_needs_dequant) { - d_Y = &vk_prealloc_y; + d_Y = ctx->prealloc_y; GGML_ASSERT(d_Y->size >= y_sz * ne02 * ne03); } else { d_Y = d_Qy; @@ -2087,49 +2277,49 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co vk_pipeline * to_fp16_vk_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(src0->type, GGML_TYPE_F16); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16); } else { - to_fp16_vk_0 = ggml_vk_get_to_fp16(src0->type); + to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(src1->type, GGML_TYPE_F16); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16); } else { - to_fp16_vk_1 = ggml_vk_get_to_fp16(src1->type); + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT // Allocate descriptor sets - ggml_vk_pipeline_allocate_descriptor_sets(*pipeline, ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, ne12 * ne13); if (qx_needs_dequant) { - ggml_vk_pipeline_allocate_descriptor_sets(*to_fp16_vk_0, x_non_contig ? 1 : ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_0, x_non_contig ? 1 : ne12 * ne13); } if (qy_needs_dequant) { - ggml_vk_pipeline_allocate_descriptor_sets(*to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_1, y_non_contig ? 
1 : ne12 * ne13); } if (split_k > 1) { - ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_matmul_split_k_reduce, ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_matmul_split_k_reduce, ne12 * ne13); } if (x_non_contig) { - ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_0, src0, { *d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { *d_X, 0, VK_WHOLE_SIZE }, dst->type, false); + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, dst->type, false); } else if (load_x || qx_needs_dequant) { if (load_x) { // copy data to device - ggml_vk_h2d_tensor_2d(ctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0)); - vk_staging_offset = qx_sz * ne02 * ne03; + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0)); + ctx->staging_offset = qx_sz * ne02 * ne03; } if (qx_needs_dequant) { const std::vector pc = { (int)ne01, (int)ne10, (int)ne10, (int)ne10 }; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *to_fp16_vk_0, { { *d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { *d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } } if (y_non_contig) { - ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_1, src1, { *d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { *d_Y, 0, VK_WHOLE_SIZE }, dst->type); + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, dst->type); } else if (load_y) { - ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); } uint32_t stride_batch_x = ne00*ne01; @@ -2144,16 +2334,16 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co } // compute - ggml_vk_matmul(ctx, *pipeline, { *d_X, x_buf_offset, x_sz * ne02 * ne03 }, { *d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { *d_D, d_buf_offset, d_sz * ne12 * ne13 }, { vk_prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT + ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host float * d = (float *) ((char *) dst->data); - ggml_vk_buffer_read_async(ctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13); + ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13); } } -static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << 
src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -2184,17 +2374,17 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0 ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer * d_Qx = nullptr; + vk_buffer d_Qx; size_t qx_buf_offset = 0; - vk_buffer * d_Qy = nullptr; + vk_buffer d_Qy; size_t qy_buf_offset = 0; bool src0_uma = false; bool src1_uma = false; - if (vk_device.uma) { - ggml_vk_host_get(src0->data, d_Qx, qx_buf_offset); - ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset); + if (ctx->device.lock()->uma) { + ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset); + ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); src0_uma = d_Qx != nullptr; src1_uma = d_Qy != nullptr; } @@ -2214,42 +2404,42 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0 const uint64_t y_ne = ne11 * ne10; const uint64_t d_ne = ne11 * ne01; - const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), vk_device.properties.limits.minStorageBufferOffsetAlignment); + const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); - const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, vk_device.properties.limits.minStorageBufferOffsetAlignment) : qx_sz; + const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; const uint64_t y_sz = f16_f32_kernel ? 
sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne; const uint64_t d_sz = sizeof(float) * d_ne; - vk_buffer* d_D = &extra->buffer_gpu; + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset; GGML_ASSERT(d_D != nullptr); - vk_buffer* d_X; + vk_buffer d_X; uint64_t x_buf_offset = 0; - vk_buffer* d_Y; + vk_buffer d_Y; uint64_t y_buf_offset = 0; if (load_x) { - d_Qx = &vk_prealloc_qx; + d_Qx = ctx->prealloc_qx; } else if(!src1_uma) { - d_Qx = &extra_src0->buffer_gpu; + d_Qx = extra_src0->buffer_gpu.lock(); qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); } if (load_y) { - d_Qy = &vk_prealloc_qy; + d_Qy = ctx->prealloc_qy; } else if(!src1_uma) { - d_Qy = &extra_src1->buffer_gpu; + d_Qy = extra_src1->buffer_gpu.lock(); qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { - d_X = &vk_prealloc_x; + d_X = ctx->prealloc_x; } else { d_X = d_Qx; x_buf_offset = qx_buf_offset; GGML_ASSERT(qx_sz == x_sz); } if (qy_needs_dequant) { - d_Y = &vk_prealloc_y; + d_Y = ctx->prealloc_y; } else { d_Y = d_Qy; y_buf_offset = qy_buf_offset; @@ -2259,39 +2449,39 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0 vk_pipeline * to_fp16_vk_0 = nullptr; vk_pipeline* to_fp16_vk_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(src0->type, src0->type); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(src1->type, src1->type); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type); } else { - to_fp16_vk_1 = ggml_vk_get_to_fp16(src1->type); + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } - vk_pipeline* dmmv = ggml_vk_get_dequantize_mul_mat_vec(src0->type); + vk_pipeline* dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type); GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT GGML_ASSERT(dmmv != nullptr); // Allocate descriptor sets if (qx_needs_dequant) { - ggml_vk_pipeline_allocate_descriptor_sets(*to_fp16_vk_0, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_0, 1); } if (qy_needs_dequant) { - ggml_vk_pipeline_allocate_descriptor_sets(*to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_1, y_non_contig ? 
1 : ne12 * ne13); } - ggml_vk_pipeline_allocate_descriptor_sets(*dmmv, ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, *dmmv, ne12 * ne13); if (x_non_contig) { - GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, vk_device.properties.limits.minStorageBufferOffsetAlignment)); - ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_0, src0, { *d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { *d_X, 0, VK_WHOLE_SIZE }, src0->type); + GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment)); + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, src0->type); } else if (load_x) { // copy data to device - ggml_vk_h2d_tensor_2d(ctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0)); + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0)); } if (y_non_contig) { GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); - ggml_vk_cpy_to_contiguous(ctx, to_fp16_vk_1, src1, { *d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { *d_Y, 0, VK_WHOLE_SIZE }, src1->type); + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, src1->type); } else if (load_y) { - ggml_vk_h2d_tensor_2d(ctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); } for (uint64_t i13 = 0; i13 < ne13; i13++) { @@ -2306,34 +2496,34 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0 const uint64_t y_offset = y_buf_offset + y_sz * it_idx1; const uint64_t d_offset = d_buf_offset + d_sz * it_idx1; - const uint64_t y_buffer_offset = (y_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + const uint64_t y_buffer_offset = (y_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; const uint64_t y_shader_offset = y_offset - y_buffer_offset; - const uint64_t d_buffer_offset = (d_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + const uint64_t d_buffer_offset = (d_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; const uint64_t d_shader_offset = d_offset - d_buffer_offset; if (!y_non_contig && qy_needs_dequant) { const std::vector pc = { (int)ne11, (int)ne10, (int)ne10, (int)ne10 }; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *to_fp16_vk_1, { { *d_Qy, qy_offset, qy_sz }, { *d_Y, y_offset, y_sz } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)y_ne, 1, 1}); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *to_fp16_vk_1, { { d_Qy, qy_offset, qy_sz }, { d_Y, y_offset, y_sz } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)y_ne, 1, 1}); } // compute const std::array pc = { (int)ne00, (int)(y_shader_offset / ggml_type_size(src1->type)), (int)(d_shader_offset / ggml_type_size(dst->type))}; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *dmmv, { { *d_X, x_offset, x_sz }, { *d_Y, y_buffer_offset, y_sz + y_shader_offset }, { *d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1}); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, 
*dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1}); if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - ggml_vk_sync_buffers(ctx); - ggml_vk_buffer_read_async(ctx, d_D, d_offset, d, sizeof(float) * d_ne); + ggml_vk_sync_buffers(subctx); + ggml_vk_buffer_read_async(ctx, subctx, d_D, d_offset, d, sizeof(float) * d_ne); } } } } -static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -2362,13 +2552,13 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer * d_Qy = nullptr; + vk_buffer d_Qy; size_t qy_buf_offset = 0; bool src1_uma = false; - if (vk_device.uma) { - ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset); + if (ctx->device.lock()->uma) { + ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); src1_uma = d_Qy != nullptr; } @@ -2378,51 +2568,51 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor const uint64_t y_ne = ne10 * ne11 * ne12; const uint64_t d_ne = ne01 * ne11 * ne12; - const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), vk_device.properties.limits.minStorageBufferOffsetAlignment); + const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); const uint64_t d_sz = sizeof(float) * d_ne; - vk_buffer* d_D = &extra->buffer_gpu; + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset; GGML_ASSERT(d_D != nullptr); - vk_buffer* d_Qx = &extra_src0->buffer_gpu; + vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); const uint64_t qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); if (load_y) { - d_Qy = &vk_prealloc_qy; + d_Qy = ctx->prealloc_qy; } else if (!src1_uma) { - d_Qy = &extra_src1->buffer_gpu; + d_Qy = extra_src1->buffer_gpu.lock(); qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qx != nullptr); } // Allocate descriptor sets - ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_mul_mat_vec_p021_f16_f32, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, 1); - const 
uint64_t qy_buffer_offset = (qy_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset; - const uint64_t d_buffer_offset = (d_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + const uint64_t d_buffer_offset = (d_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset; if (load_y) { - ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1)); + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1)); } // compute const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, vk_pipeline_mul_mat_vec_p021_f16_f32, { { *d_Qx, qx_buf_offset, qx_sz }, { *d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { *d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host float * d = (float *) dst->data; - ggml_vk_sync_buffers(ctx); - ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset, d, sizeof(float) * d_ne); + ggml_vk_sync_buffers(subctx); + ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne); } } -static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -2454,13 +2644,13 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor * ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; - vk_buffer * d_Qy = nullptr; + vk_buffer d_Qy = 
nullptr; size_t qy_buf_offset = 0; bool src1_uma = false; - if (vk_device.uma) { - ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset); + if (ctx->device.lock()->uma) { + ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); src1_uma = d_Qy != nullptr; } @@ -2475,43 +2665,43 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor * const uint64_t qy_sz = ggml_nbytes(src1); const uint64_t d_sz = sizeof(float) * d_ne; - vk_buffer* d_D = &extra->buffer_gpu; + vk_buffer d_D = extra->buffer_gpu.lock(); const uint64_t d_buf_offset = extra->offset; GGML_ASSERT(d_D != nullptr); - vk_buffer* d_Qx = &extra_src0->buffer_gpu; + vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); const uint64_t qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); if (load_y) { - d_Qy = &vk_prealloc_qy; + d_Qy = ctx->prealloc_qy; } else { - d_Qy = &extra_src1->buffer_gpu; + d_Qy = extra_src1->buffer_gpu.lock(); qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qx != nullptr); } // Allocate descriptor sets - ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_mul_mat_vec_nc_f16_f32, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, 1); - const uint64_t qy_buffer_offset = (qy_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset; - const uint64_t d_buffer_offset = (d_buf_offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + const uint64_t d_buffer_offset = (d_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset; if (load_y) { - ggml_vk_h2d_tensor_2d(ctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1)); + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1)); } // compute const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, vk_pipeline_mul_mat_vec_nc_f16_f32, { { *d_Qx, qx_buf_offset, qx_sz }, { *d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { *d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host float * d = (float *) dst->data; - ggml_vk_sync_buffers(ctx); - ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset, d, sizeof(float) * d_ne); + ggml_vk_sync_buffers(subctx); + ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne); } } @@ -2528,22 +2718,22 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr 
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU); } -static void ggml_vk_mul_mat(vk_context * ctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { +static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl; #endif if (src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { - ggml_vk_mul_mat_vec_p021_f16_f32(ctx, src0, src1, dst); + ggml_vk_mul_mat_vec_p021_f16_f32(ctx, subctx, src0, src1, dst); } else if (src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { - ggml_vk_mul_mat_vec_nc_f16_f32(ctx, src0, src1, dst); + ggml_vk_mul_mat_vec_nc_f16_f32(ctx, subctx, src0, src1, dst); } else if (src1->ne[1] == 1 && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) { - ggml_vk_mul_mat_vec_q_f16(ctx, src0, src1, dst); + ggml_vk_mul_mat_vec_q_f16(ctx, subctx, src0, src1, dst); } else { - ggml_vk_mul_mat_q_f16(ctx, src0, src1, dst); + ggml_vk_mul_mat_q_f16(ctx, subctx, src0, src1, dst); } } -static void ggml_vk_op_repeat(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { // guaranteed to be an integer due to the check in ggml_can_repeat const uint64_t ne0 = dst->ne[0]; const uint64_t ne1 = dst->ne[1]; @@ -2579,9 +2769,9 @@ static void ggml_vk_op_repeat(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - const vk_buffer* src_buf = &extra_src0->buffer_gpu; + const vk_buffer src_buf = extra_src0->buffer_gpu.lock(); const uint64_t src_offset = extra_src0->offset; - vk_buffer* dst_buf = &extra->buffer_gpu; + vk_buffer dst_buf = extra->buffer_gpu.lock(); const uint64_t dst_offset = extra->offset; std::vector copies; @@ -2606,78 +2796,79 @@ static void ggml_vk_op_repeat(vk_context * ctx, const ggml_tensor * src0, const } } - ggml_vk_sync_buffers(ctx); - ctx->s->buffer.copyBuffer(src_buf->buffer, dst_buf->buffer, copies); + ggml_vk_sync_buffers(subctx); + subctx->s->buffer.copyBuffer(src_buf->buffer, dst_buf->buffer, copies); - (void) src1; + GGML_UNUSED(ctx); + GGML_UNUSED(src1); } -static vk_pipeline* ggml_vk_op_get_pipeline(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) { +static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) { switch (op) { case GGML_OP_ADD: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_add_f32; + return &ctx->pipeline_add_f32; } return nullptr; case GGML_OP_GET_ROWS: GGML_ASSERT(src1->type == GGML_TYPE_I32); if (dst->type == GGML_TYPE_F16) { - return &vk_pipeline_get_rows[src0->type]; + return &ctx->pipeline_get_rows[src0->type]; } if (dst->type == GGML_TYPE_F32) { - return &vk_pipeline_get_rows_f32[src0->type]; + return &ctx->pipeline_get_rows_f32[src0->type]; } return nullptr; case GGML_OP_MUL: if (src0->type == GGML_TYPE_F32 && 
src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_mul_f32; + return &ctx->pipeline_mul_f32; } return nullptr; case GGML_OP_SCALE: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_scale_f32; + return &ctx->pipeline_scale_f32; } return nullptr; case GGML_OP_SQR: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_sqr_f32; + return &ctx->pipeline_sqr_f32; } return nullptr; case GGML_OP_CLAMP: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_clamp_f32; + return &ctx->pipeline_clamp_f32; } return nullptr; case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: - return ggml_vk_get_cpy_pipeline(src0->type, dst->type); + return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type); case GGML_OP_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_norm_f32; + return &ctx->pipeline_norm_f32; } return nullptr; case GGML_OP_RMS_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_rms_norm_f32; + return &ctx->pipeline_rms_norm_f32; } return nullptr; case GGML_OP_UNARY: switch (ggml_get_unary_op(dst)) { case GGML_UNARY_OP_SILU: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_silu_f32; + return &ctx->pipeline_silu_f32; } break; case GGML_UNARY_OP_GELU: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_gelu_f32; + return &ctx->pipeline_gelu_f32; } break; case GGML_UNARY_OP_RELU: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_relu_f32; + return &ctx->pipeline_relu_f32; } break; default: @@ -2686,12 +2877,12 @@ static vk_pipeline* ggml_vk_op_get_pipeline(const ggml_tensor * src0, const ggml return nullptr; case GGML_OP_DIAG_MASK_INF: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_diag_mask_inf_f32; + return &ctx->pipeline_diag_mask_inf_f32; } return nullptr; case GGML_OP_SOFT_MAX: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_soft_max_f32; + return &ctx->pipeline_soft_max_f32; } return nullptr; case GGML_OP_ROPE: @@ -2706,17 +2897,17 @@ static vk_pipeline* ggml_vk_op_get_pipeline(const ggml_tensor * src0, const ggml if (is_neox) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_rope_neox_f32; + return &ctx->pipeline_rope_neox_f32; } if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - return &vk_pipeline_rope_neox_f16; + return &ctx->pipeline_rope_neox_f16; } } else { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &vk_pipeline_rope_f32; + return &ctx->pipeline_rope_f32; } if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - return &vk_pipeline_rope_f16; + return &ctx->pipeline_rope_f16; } } return nullptr; @@ -2735,13 +2926,8 @@ static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) { } } -#ifdef GGML_VULKAN_CHECK_RESULTS -static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name); -static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor); -#endif - template -static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) { +static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, 
ggml_op op, const PC&& pc) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { @@ -2768,7 +2954,7 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm const uint64_t nb2 = dst->nb[2]; const uint64_t nb3 = dst->nb[3]; - vk_pipeline * pipeline = ggml_vk_op_get_pipeline(src0, src1, dst, op); + vk_pipeline * pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op); ggml_vk_func_t op_func; if (pipeline == nullptr) { @@ -2782,7 +2968,7 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm GGML_ASSERT(false); } - op_func(ctx, src0, src1, dst); + op_func(ctx, subctx, src0, src1, dst); return; } @@ -2790,19 +2976,19 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - vk_buffer * d_X = nullptr; + vk_buffer d_X = nullptr; size_t x_buf_offset = 0; - vk_buffer * d_Y = nullptr; + vk_buffer d_Y = nullptr; size_t y_buf_offset = 0; bool src0_uma = false; bool src1_uma = false; - if (vk_device.uma) { - ggml_vk_host_get(src0->data, d_X, x_buf_offset); + if (ctx->device.lock()->uma) { + ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset); src0_uma = d_X != nullptr; if (use_src1) { - ggml_vk_host_get(src1->data, d_Y, y_buf_offset); + ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset); src1_uma = d_Y != nullptr; } } @@ -2810,30 +2996,31 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma; const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma; - uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, vk_device.properties.limits.minStorageBufferOffsetAlignment); - uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, vk_device.properties.limits.minStorageBufferOffsetAlignment) : 0; + uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment); + uint64_t y_sz = use_src1 ? 
ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0; uint64_t d_sz = ggml_type_size(dst->type) * ne0; + vk_buffer d_D = extra->buffer_gpu.lock(); + // Workaround for tiny tensor inputs on ROPE - if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > extra_src1->buffer_gpu.size) { + if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) { y_sz = VK_WHOLE_SIZE; } - vk_buffer* d_D = &extra->buffer_gpu; GGML_ASSERT(d_D != nullptr); - uint64_t d_buf_offset = (extra->offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment; + uint64_t d_buf_offset = (extra->offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT if (transfer_src0) { - d_X = &vk_prealloc_qx; + d_X = ctx->prealloc_qx; } else if(!src0_uma) { - d_X = &extra_src0->buffer_gpu; + d_X = extra_src0->buffer_gpu.lock(); x_buf_offset = extra_src0->offset; GGML_ASSERT(d_X != nullptr); } if (transfer_src1) { - d_Y = &vk_prealloc_qy; + d_Y = ctx->prealloc_qy; } else if (use_src1 && !src1_uma) { - d_Y = &extra_src1->buffer_gpu; + d_Y = extra_src1->buffer_gpu.lock(); y_buf_offset = extra_src1->offset; GGML_ASSERT(d_Y != nullptr); } @@ -2856,16 +3043,16 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm // copy src0 to device if (transfer_src0) { - ggml_vk_h2d_tensor_2d(ctx, d_X, 0, src0, 0, 0, ggml_nrows(src0)); - vk_staging_offset = x_sz * ne02 * ne03; + ggml_vk_h2d_tensor_2d(ctx, subctx, d_X, 0, src0, 0, 0, ggml_nrows(src0)); + ctx->staging_offset = x_sz * ne02 * ne03; } if (transfer_src1) { - ggml_vk_h2d_tensor_2d(ctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1)); + ggml_vk_h2d_tensor_2d(ctx, subctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1)); } // Single call if dimension 2 is contiguous if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) { - ggml_vk_pipeline_allocate_descriptor_sets(*pipeline, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, 1); switch (dst->op) { case GGML_OP_NORM: @@ -2896,24 +3083,24 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm if (!use_src1 && op == GGML_OP_SOFT_MAX) { // Empty src1 is possible on soft_max, but the shader needs a buffer - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { { *d_X, x_buf_offset, x_sz }, { vk_prealloc_y, 0, vk_prealloc_y.size }, { *d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src1) { - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { { *d_X, x_buf_offset, x_sz }, { *d_Y, y_buf_offset, y_sz }, { *d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else { - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { { *d_X, x_buf_offset, x_sz }, { *d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + 
ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) { - ggml_vk_d2h_tensor_2d(ctx, d_D, 0, dst); + ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst); } else if(dst->backend == GGML_BACKEND_CPU) { // copy dst to host float * d = (float *) dst->data; - ggml_vk_buffer_read_async(ctx, d_D, 0, d, d_sz); + ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz); } } else { - ggml_vk_pipeline_allocate_descriptor_sets(*pipeline, ne02 * ne03); + ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, ne02 * ne03); switch (dst->op) { case GGML_OP_NORM: @@ -2940,60 +3127,60 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm if (!use_src1 && op == GGML_OP_SOFT_MAX) { // Empty src1 is possible on soft_max, but the shader needs a buffer - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { { *d_X, x_buf_offset, x_sz }, { vk_prealloc_y, 0, vk_prealloc_y.size }, { *d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src1) { - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { { *d_X, x_buf_offset + x_offset, x_sz }, { *d_Y, y_buf_offset + y_offset, y_sz }, { *d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } else { - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, *pipeline, { { *d_X, x_buf_offset + x_offset, x_sz }, { *d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host - ggml_vk_buffer_read_async(ctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz); + ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz); } } } } } -static void ggml_vk_repeat(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); +static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); } -static void ggml_vk_get_rows(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); +static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, 
src1, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); } -static void ggml_vk_add(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_ADD, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); +static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_ADD, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); } -static void ggml_vk_mul(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_MUL, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); +static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_MUL, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); } -static void ggml_vk_scale(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_SCALE, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_SCALE, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }); } -static void ggml_vk_sqr(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_SQR, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); +static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_SQR, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); } -static void ggml_vk_clamp(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_CLAMP, { (uint32_t)ggml_nelements(src0), 0, op_params[0], op_params[1] }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_CLAMP, { (uint32_t)ggml_nelements(src0), 0, op_params[0], op_params[1] }); } -static void ggml_vk_cpy(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; const int src0_type_size = ggml_type_size(src0->type); const int dst_type_size = ggml_type_size(dst->type); - const uint32_t d_offset = (extra->offset % vk_device.properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_CPY, { + const uint32_t d_offset = (extra->offset % ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_CPY, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->nb[0] / src0_type_size, 
(uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, @@ -3001,30 +3188,30 @@ static void ggml_vk_cpy(vk_context * ctx, const ggml_tensor * src0, ggml_tensor }); } -static void ggml_vk_norm(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }); +static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }); } -static void ggml_vk_rms_norm(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); } -static void ggml_vk_unary(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); +static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }); } -static void ggml_vk_diag_mask_inf(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { int32_t * op_params = (int32_t *)dst->op_params; - ggml_vk_op_f32(ctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }); } -static void ggml_vk_soft_max(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_SOFT_MAX, { (uint32_t)src0->ne[0], (uint32_t)(src1 != nullptr ? ggml_nrows(src1) : 0), op_params[0], 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_SOFT_MAX, { (uint32_t)src0->ne[0], (uint32_t)(src1 != nullptr ? 
ggml_nrows(src1) : 0), op_params[0], 0.0f }); } -static void ggml_vk_rope(vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; // const int n_ctx = ((int32_t *) dst->op_params)[3]; @@ -3047,19 +3234,19 @@ static void ggml_vk_rope(vk_context * ctx, const ggml_tensor * src0, const ggml_ if (is_neox) { const float theta_scale = powf(freq_base, -2.0f/n_dims); const float inv_ndims = -1.0f / n_dims; - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims }); + ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims }); } else { - ggml_vk_op_f32(ctx, src0, src1, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f }); } } -static void ggml_vk_nop(vk_context * ctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { // If backend is CPU, data from src0 has to be copied off the device if (dst->backend == GGML_BACKEND_CPU) { ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - vk_buffer * d_D = &extra_src0->buffer_gpu; - ggml_vk_sync_buffers(ctx); - ggml_vk_buffer_read_async(ctx, d_D, 0, dst->data, d_D->size); + vk_buffer d_D = extra_src0->buffer_gpu.lock(); + ggml_vk_sync_buffers(subctx); + ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, dst->data, d_D->size); } } @@ -3096,7 +3283,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0 } template -static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) { +static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t n, size_t k, size_t batch, size_t num_it, int split_k, int shader_size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_test_matmul(" << m << ", " << n << ", " << k << ", " << batch << ", " << num_it << ", " << split_k << ", " << shader_size << ")" << std::endl; #endif @@ -3108,39 +3295,39 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size std::string shname; if (shader_size == 0) { if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f32_aligned_s; + p = &ctx->pipeline_matmul_f32_aligned_s; shname = "F32_ALIGNED_S"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_f32_aligned_s; + p = &ctx->pipeline_matmul_f16_f32_aligned_s; shname = "F16_F32_ALIGNED_S"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_aligned_s; + p = &ctx->pipeline_matmul_f16_aligned_s; shname = "F16_ALIGNED_S"; } else { GGML_ASSERT(false); } } else if (shader_size == 1) { if 
(std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f32_aligned_m; + p = &ctx->pipeline_matmul_f32_aligned_m; shname = "F32_ALIGNED_M"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_f32_aligned_m; + p = &ctx->pipeline_matmul_f16_f32_aligned_m; shname = "F16_F32_ALIGNED_M"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_aligned_m; + p = &ctx->pipeline_matmul_f16_aligned_m; shname = "F16_ALIGNED_M"; } else { GGML_ASSERT(false); } } else if (shader_size == 2) { if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f32_aligned_l; + p = &ctx->pipeline_matmul_f32_aligned_l; shname = "F32_ALIGNED_L"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_f32_aligned_l; + p = &ctx->pipeline_matmul_f16_f32_aligned_l; shname = "F16_F32_ALIGNED_L"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_aligned_l; + p = &ctx->pipeline_matmul_f16_aligned_l; shname = "F16_ALIGNED_L"; } else { GGML_ASSERT(false); @@ -3154,56 +3341,56 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size if (k != kpad) { if (shader_size == 0) { if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f32_s; + p = &ctx->pipeline_matmul_f32_s; shname = "F32_S"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_f32_s; + p = &ctx->pipeline_matmul_f16_f32_s; shname = "F16_F32_S"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_s; + p = &ctx->pipeline_matmul_f16_s; shname = "F16_S"; } } else if (shader_size == 1) { if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f32_m; + p = &ctx->pipeline_matmul_f32_m; shname = "F32_M"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_f32_m; + p = &ctx->pipeline_matmul_f16_f32_m; shname = "F16_F32_M"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_m; + p = &ctx->pipeline_matmul_f16_m; shname = "F16_M"; } } else if (shader_size == 2) { if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f32_l; + p = &ctx->pipeline_matmul_f32_l; shname = "F32_L"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_f32_l; + p = &ctx->pipeline_matmul_f16_f32_l; shname = "F16_F32_L"; } else if (std::is_same() && std::is_same()) { - p = &vk_pipeline_matmul_f16_l; + p = &ctx->pipeline_matmul_f16_l; shname = "F16_L"; } } } - ggml_vk_pipeline_allocate_descriptor_sets(*p, num_it); + ggml_pipeline_allocate_descriptor_sets(ctx, *p, num_it); if (split_k > 1) { - ggml_vk_pipeline_allocate_descriptor_sets(vk_pipeline_matmul_split_k_reduce, num_it); + ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_matmul_split_k_reduce, num_it); - if (vk_prealloc_split_k.size < sizeof(float) * d_ne * split_k) { + if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { // Resize buffer - if (vk_prealloc_split_k.size > 0) { - ggml_vk_destroy_buffer(vk_prealloc_split_k); + if (ctx->prealloc_split_k != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_split_k); } - vk_prealloc_split_k = ggml_vk_create_buffer_check(sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal); + ctx->prealloc_split_k = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal); } } - vk_buffer d_X = ggml_vk_create_buffer_check(sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); - vk_buffer d_Y = 
ggml_vk_create_buffer_check(sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); - vk_buffer d_D = ggml_vk_create_buffer_check(sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer d_X = ggml_vk_create_buffer_check(ctx, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer d_Y = ggml_vk_create_buffer_check(ctx, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer d_D = ggml_vk_create_buffer_check(ctx, sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal); X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne); Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne); @@ -3228,26 +3415,26 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size } } - ggml_vk_buffer_write(&d_X, 0, x, sizeof(X_TYPE) * k * m * batch); - ggml_vk_buffer_write(&d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); + ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch); + ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); - vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); for (size_t i = 0; i < num_it; i++) { - ggml_vk_ctx_begin(ctx); - ggml_vk_matmul(ctx, *p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(vk_prealloc_split_k), m, n, k, k, k, m, split_k, batch, batch, batch, 1, 1, k*m, k*n, m*n); - ggml_vk_ctx_end(ctx); + ggml_vk_ctx_begin(ctx, subctx); + ggml_vk_matmul(ctx, subctx, *p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, split_k, batch, batch, batch, 1, 1, k*m, k*n, m*n); + ggml_vk_ctx_end(subctx); } auto begin = std::chrono::high_resolution_clock::now(); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); auto end = std::chrono::high_resolution_clock::now(); double time = std::chrono::duration_cast(end-begin).count() / 1000.0; // copy dst to host - ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne); + ggml_vk_buffer_read(ctx, d_D, 0, d, sizeof(float) * d_ne); float * d_chk = (float *) malloc(sizeof(float) * d_ne); @@ -3285,14 +3472,14 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size src1_ggml->data = y; tensor_ggml->data = d_chk; - vk_disable = true; + ctx->disable = true; ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); ggml_build_forward_expand(cgraph, tensor_ggml); ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1); - vk_disable = false; + ctx->disable = false; ggml_free(ggml_ctx); @@ -3325,7 +3512,7 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size if (split_k > 1) { float * split_k_buf = (float *) malloc(sizeof(float) * d_ne * split_k); - ggml_vk_buffer_read(&vk_prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); + ggml_vk_buffer_read(ctx, ctx->prealloc_split_k, 0, split_k_buf, sizeof(float) * d_ne * split_k); std::cerr << "d_buf0: " << std::endl << std::endl; ggml_vk_print_matrix_area(split_k_buf, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); @@ -3345,15 +3532,15 @@ static 
void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size free(d_chk); - ggml_vk_queue_cleanup(vk_device.transfer_queue); - ggml_vk_queue_cleanup(vk_device.compute_queue); + ggml_vk_queue_cleanup(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_queue_cleanup(ctx, ctx->device.lock()->compute_queue); ggml_vk_destroy_buffer(d_X); ggml_vk_destroy_buffer(d_Y); ggml_vk_destroy_buffer(d_D); - ggml_vk_pipeline_cleanup(*p); - ggml_vk_pipeline_cleanup(vk_pipeline_matmul_split_k_reduce); + ggml_pipeline_cleanup(*p); + ggml_pipeline_cleanup(ctx->pipeline_matmul_split_k_reduce); free(x); free(y); @@ -3392,7 +3579,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1 } } -static void ggml_vk_test_h2d_nc(size_t ne0, size_t ne1, size_t ne2, size_t ne3) { +static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_t ne1, size_t ne2, size_t ne3) { const size_t ne = ne0 * ne1 * ne2 * ne3; ggml_init_params iparams = { @@ -3406,7 +3593,7 @@ static void ggml_vk_test_h2d_nc(size_t ne0, size_t ne1, size_t ne2, size_t ne3) ggml_tensor * tensor = ggml_new_tensor_4d(ggml_ctx, GGML_TYPE_F32, ne0, ne2, ne1, ne3); // NOLINT ggml_tensor * result_tensor = ggml_new_tensor_4d(ggml_ctx, GGML_TYPE_F32, ne0, ne1, ne2, ne3); - float * data = (float *) ggml_vk_host_malloc(ggml_nbytes(tensor)); + float * data = (float *) ggml_vk_host_malloc(ctx, ggml_nbytes(tensor)); tensor->data = data; float * result_data = (float *) malloc(ggml_nbytes(tensor)); @@ -3426,19 +3613,19 @@ static void ggml_vk_test_h2d_nc(size_t ne0, size_t ne1, size_t ne2, size_t ne3) data[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; } - vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue); - ggml_vk_ctx_begin(ctx); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); + ggml_vk_ctx_begin(ctx, subctx); - vk_buffer buffer = ggml_vk_create_buffer_check(ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer buffer = ggml_vk_create_buffer_check(ctx, ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal); - ggml_vk_h2d_tensor_2d(ctx, &buffer, 0, tensor, 0, 0, ggml_nrows(tensor)); + ggml_vk_h2d_tensor_2d(ctx, subctx, buffer, 0, tensor, 0, 0, ggml_nrows(tensor)); - ggml_vk_ctx_end(ctx); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_h2d_nc waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); - ggml_vk_buffer_read(&buffer, 0, result_data, ggml_nbytes(tensor)); + ggml_vk_buffer_read(ctx, buffer, 0, result_data, ggml_nbytes(tensor)); double avg_err = 0.0; int first_err_i0 = -1; @@ -3483,22 +3670,22 @@ static void ggml_vk_test_h2d_nc(size_t ne0, size_t ne1, size_t ne2, size_t ne3) ggml_vk_destroy_buffer(buffer); - ggml_vk_host_free(data); + ggml_vk_host_free(ctx, data); free(result_data); } -static void ggml_vk_test_transfer(size_t ne, bool pinned) { +static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool pinned) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl; #endif // Check transfers are correct - vk_buffer buffer = ggml_vk_create_buffer_check(sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer buffer = 
ggml_vk_create_buffer_check(ctx, sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal); float * x; float * y; if (pinned) { - x = (float *) ggml_vk_host_malloc(sizeof(float) * ne); - y = (float *) ggml_vk_host_malloc(sizeof(float) * ne); + x = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne); + y = (float *) ggml_vk_host_malloc(ctx, sizeof(float) * ne); } else { x = (float *) malloc(sizeof(float) * ne); y = (float *) malloc(sizeof(float) * ne); @@ -3508,42 +3695,42 @@ static void ggml_vk_test_transfer(size_t ne, bool pinned) { x[i] = rand() / (float)RAND_MAX; } - vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue); - ggml_vk_ctx_begin(ctx); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); + ggml_vk_ctx_begin(ctx, subctx); auto begin = std::chrono::high_resolution_clock::now(); - ggml_vk_buffer_write_async(ctx, &buffer, 0, x, sizeof(float) * ne); + ggml_vk_buffer_write_async(ctx, subctx, buffer, 0, x, sizeof(float) * ne); - for (auto& cpy : ctx->in_memcpys) { + for (auto& cpy : subctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ctx->in_memcpys.clear(); + subctx->in_memcpys.clear(); - ggml_vk_ctx_end(ctx); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); auto end = std::chrono::high_resolution_clock::now(); double ms_to_gpu = std::chrono::duration_cast(end-begin).count() / 1000.0; - ggml_vk_ctx_begin(ctx); + ggml_vk_ctx_begin(ctx, subctx); begin = std::chrono::high_resolution_clock::now(); - ggml_vk_buffer_read_async(ctx, &buffer, 0, y, sizeof(float) * ne); + ggml_vk_buffer_read_async(ctx, subctx, buffer, 0, y, sizeof(float) * ne); - ggml_vk_ctx_end(ctx); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); - for (auto& cpy : ctx->out_memcpys) { + for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ctx->out_memcpys.clear(); + subctx->out_memcpys.clear(); end = std::chrono::high_resolution_clock::now(); @@ -3561,15 +3748,15 @@ static void ggml_vk_test_transfer(size_t ne, bool pinned) { ggml_vk_destroy_buffer(buffer); if (pinned) { - ggml_vk_host_free(x); - ggml_vk_host_free(y); + ggml_vk_host_free(ctx, x); + ggml_vk_host_free(ctx, y); } else { free(x); free(y); } } -static void ggml_vk_test_dequant(size_t ne, ggml_type quant) { +static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_test_dequant(" << ne << ")" << std::endl; #endif @@ -3578,8 +3765,8 @@ static void ggml_vk_test_dequant(size_t ne, ggml_type quant) { const size_t qx_sz = ne * ggml_type_size(quant)/ggml_blck_size(quant); float * x = (float *) malloc(x_sz); void * qx = malloc(qx_sz); - vk_buffer qx_buf = ggml_vk_create_buffer_check(qx_sz, 
vk::MemoryPropertyFlagBits::eDeviceLocal); - vk_buffer x_buf = ggml_vk_create_buffer_check(x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer qx_buf = ggml_vk_create_buffer_check(ctx, qx_sz, vk::MemoryPropertyFlagBits::eDeviceLocal); + vk_buffer x_buf = ggml_vk_create_buffer_check(ctx, x_sz_f16, vk::MemoryPropertyFlagBits::eDeviceLocal); ggml_fp16_t * x_chk = (ggml_fp16_t *) malloc(x_sz_f16); for (size_t i = 0; i < ne; i++) { @@ -3588,7 +3775,7 @@ static void ggml_vk_test_dequant(size_t ne, ggml_type quant) { std::vector hist_cur(1 << 4, 0); - vk_pipeline& p = vk_pipeline_dequant[quant]; + vk_pipeline& p = ctx->pipeline_dequant[quant]; switch(quant) { case GGML_TYPE_Q4_0: @@ -3625,27 +3812,26 @@ static void ggml_vk_test_dequant(size_t ne, ggml_type quant) { GGML_ASSERT(false); } - ggml_vk_pipeline_allocate_descriptor_sets(p, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, p, 1); - ggml_vk_buffer_write(&qx_buf, 0, qx, qx_sz); + ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz); - vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue); - ggml_vk_ctx_begin(ctx); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); + ggml_vk_ctx_begin(ctx, subctx); const std::vector pc = { 1, (int)ne, (int)ne, (int)ne }; - ggml_vk_sync_buffers(ctx); - ggml_vk_dispatch_pipeline(ctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); - ggml_vk_ctx_end(ctx); + ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); + ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); - ggml_vk_submit(ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_submit(subctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); auto end = std::chrono::high_resolution_clock::now(); double ms_dequant = std::chrono::duration_cast(end-begin).count() / 1000.0; - ggml_vk_buffer_read(&x_buf, 0, x_chk, x_sz_f16); + ggml_vk_buffer_read(ctx, x_buf, 0, x_chk, x_sz_f16); double avg_err = 0.0; for (size_t i = 0; i < ne; i++) { @@ -3687,15 +3873,15 @@ static ggml_tensor * ggml_vk_find_last_use(const ggml_tensor * node, ggml_cgraph return nullptr; } -void ggml_vk_preallocate_buffers_graph(ggml_tensor * node){ +static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){ #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl; + std::cerr << "ggml_ctx->preallocate_buffers_graph(" << node << ")" << std::endl; #endif const bool any_on_device = node->backend == GGML_BACKEND_GPU || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU)); - if (vk_disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) { + if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) { return; } @@ -3735,16 +3921,16 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node){ const uint32_t y_ne = ne10 * ne11; const uint32_t d_ne = ne20 * ne21; - const uint64_t qx_sz = use_src0 ? 
ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), vk_device.properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; - const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), vk_device.properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; - const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, vk_device.properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; - const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, vk_device.properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; - uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, vk_device.properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23; + const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; + const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; + const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; + const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; + uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23; const uint64_t split_k_size = split_k > 1 ? 
d_sz * 4 : 0; - if (extra->buffer_gpu.size == 0) { + if (extra->buffer_gpu.expired()) { // Workaround for CPU backend BLAS matmul calls - extra->buffer_gpu = ggml_vk_create_buffer_temp(d_sz); + extra->buffer_gpu = ggml_vk_create_buffer_temp(ctx, d_sz); } switch (node->op) { @@ -3779,23 +3965,23 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node){ } break; case GGML_OP_MUL_MAT: - if (vk_prealloc_size_qx < qx_sz) { - vk_prealloc_size_qx = qx_sz; + if (ctx->prealloc_size_qx < qx_sz) { + ctx->prealloc_size_qx = qx_sz; } - if (vk_prealloc_size_qy < qy_sz) { - vk_prealloc_size_qy = qy_sz; + if (ctx->prealloc_size_qy < qy_sz) { + ctx->prealloc_size_qy = qy_sz; } - if (vk_prealloc_size_x < x_sz) { - vk_prealloc_size_x = x_sz; + if (ctx->prealloc_size_x < x_sz) { + ctx->prealloc_size_x = x_sz; } - if (vk_prealloc_size_y < y_sz) { - vk_prealloc_size_y = y_sz; + if (ctx->prealloc_size_y < y_sz) { + ctx->prealloc_size_y = y_sz; } - if (vk_prealloc_size_split_k < split_k_size) { - vk_prealloc_size_split_k = split_k_size; + if (ctx->prealloc_size_split_k < split_k_size) { + ctx->prealloc_size_split_k = split_k_size; } - if (vk_staging_size < x_sz + y_sz) { - vk_staging_size = x_sz + y_sz; + if (ctx->staging_size < x_sz + y_sz) { + ctx->staging_size = x_sz + y_sz; } break; default: @@ -3803,29 +3989,29 @@ void ggml_vk_preallocate_buffers_graph(ggml_tensor * node){ } } -void ggml_vk_preallocate_buffers() { - if (vk_disable) { +static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { + if (ctx->disable) { return; } #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_preallocate_buffers()" << std::endl; - std::cerr << "qx_size: " << vk_prealloc_size_qx << " qy_size: " << vk_prealloc_size_qy << " x_size: " << vk_prealloc_size_x << " y_size: " << vk_prealloc_size_y << " split_k_size: " << vk_prealloc_size_split_k << std::endl; + std::cerr << "ggml_ctx->preallocate_buffers()" << std::endl; + std::cerr << "qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << std::endl; #endif #if defined(GGML_VULKAN_RUN_TESTS) - vk_staging = ggml_vk_create_buffer_check(100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); - ggml_vk_test_transfer(8192 * 1000, false); - ggml_vk_test_transfer(8192 * 1000, true); + ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + ggml_vk_test_transfer(ctx, 8192 * 1000, false); + ggml_vk_test_transfer(ctx, 8192 * 1000, true); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q4_0); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q4_1); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q5_0); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q5_1); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q8_0); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q2_K); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q3_K); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q4_K); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q5_K); - ggml_vk_test_dequant(2560 * 7680, GGML_TYPE_Q6_K); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q4_0); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q4_1); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q5_0); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q5_1); 
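
The hunks above consistently replace raw `vk_buffer*` pointers with handles obtained via `extra->buffer_gpu.lock()`, and `ggml_vk_preallocate_buffers_graph` now tests `extra->buffer_gpu.expired()` instead of a zero size. That is consistent with `vk_buffer` becoming a `std::shared_ptr` handle while tensor extras keep only a `std::weak_ptr`. Below is a minimal, self-contained sketch of that ownership pattern only; the type names are placeholders, not the actual ggml-vulkan definitions, which also carry Vulkan memory handles inside the buffer struct.

```cpp
// Illustration of the shared_ptr/weak_ptr buffer ownership pattern used in the diff.
// "buffer_handle" stands in for vk_buffer, "buffer_ref" for extra->buffer_gpu.
#include <cstdint>
#include <cstdio>
#include <memory>

struct buffer_struct {
    uint64_t size = 0;
    // device memory handles would live here in the real backend
};

using buffer_handle = std::shared_ptr<buffer_struct>;  // analogous to vk_buffer
using buffer_ref    = std::weak_ptr<buffer_struct>;    // analogous to extra->buffer_gpu

int main() {
    buffer_handle owned = std::make_shared<buffer_struct>();
    owned->size = 4096;

    buffer_ref ref = owned;  // tensor extra keeps a non-owning reference

    // Pattern used throughout the hunks: lock() before use, then check the result.
    if (buffer_handle buf = ref.lock()) {
        std::printf("buffer alive, size = %llu\n", (unsigned long long) buf->size);
    }

    owned.reset();  // backend frees the buffer

    // expired() is the check ggml_vk_preallocate_buffers_graph performs before
    // falling back to a temporary buffer.
    if (ref.expired()) {
        std::printf("buffer was freed; a temporary buffer would be created here\n");
    }
    return 0;
}
```

Locking before every use is what keeps the repeated `GGML_ASSERT(d_D != nullptr)` style checks meaningful: a buffer that was freed elsewhere yields an empty handle instead of a dangling pointer.
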
+ ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q8_0); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q2_K); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q3_K); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q4_K); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q5_K); + ggml_vk_test_dequant(ctx, 2560 * 7680, GGML_TYPE_Q6_K); const std::vector vals { 8, 8, 8, @@ -3852,76 +4038,76 @@ void ggml_vk_preallocate_buffers() { }; const size_t num_it = 1; for (size_t i = 0; i < vals.size(); i += 3) { - ggml_vk_test_matmul(vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); - ggml_vk_test_matmul(vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); - ggml_vk_test_matmul(vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2); - ggml_vk_test_matmul(vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); - ggml_vk_test_matmul(vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); - ggml_vk_test_matmul(vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); std::cerr << std::endl; } GGML_ASSERT(false); #endif - if (vk_prealloc_size_qx > 0 && vk_prealloc_qx.size < vk_prealloc_size_qx) { + if (ctx->prealloc_qx == nullptr || (ctx->prealloc_size_qx > 0 && ctx->prealloc_qx->size < ctx->prealloc_size_qx)) { // Resize buffer - if (vk_prealloc_qx.size > 0) { - ggml_vk_destroy_buffer(vk_prealloc_qx); + if (ctx->prealloc_qx != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_qx); } - vk_prealloc_qx = ggml_vk_create_buffer_device(vk_prealloc_size_qx); + ctx->prealloc_qx = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qx); } - if (vk_prealloc_size_qy > 0 && vk_prealloc_qy.size < vk_prealloc_size_qy) { + if (ctx->prealloc_qy == nullptr || (ctx->prealloc_size_qy > 0 && ctx->prealloc_qy->size < ctx->prealloc_size_qy)) { // Resize buffer - if (vk_prealloc_qy.size > 0) { - ggml_vk_destroy_buffer(vk_prealloc_qy); + if (ctx->prealloc_qy != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_qy); } - vk_prealloc_qy = ggml_vk_create_buffer_device(vk_prealloc_size_qy); + ctx->prealloc_qy = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qy); } - if (vk_prealloc_size_x > 0 && vk_prealloc_x.size < vk_prealloc_size_x) { + if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) { // Resize buffer - if (vk_prealloc_x.size > 0) { - ggml_vk_destroy_buffer(vk_prealloc_x); + if (ctx->prealloc_x != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_x); } - vk_prealloc_x = ggml_vk_create_buffer_device(vk_prealloc_size_x); + ctx->prealloc_x = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_x); } - if (vk_prealloc_size_y > 0 && vk_prealloc_y.size < vk_prealloc_size_y) { + if (ctx->prealloc_y == nullptr || (ctx->prealloc_size_y > 0 && ctx->prealloc_y->size < ctx->prealloc_size_y)) { // Resize buffer - if (vk_prealloc_y.size > 0) { - ggml_vk_destroy_buffer(vk_prealloc_y); + if (ctx->prealloc_y != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_y); } - vk_prealloc_y = ggml_vk_create_buffer_device(vk_prealloc_size_y); + ctx->prealloc_y = ggml_vk_create_buffer_device(ctx, 
ctx->prealloc_size_y); } - if (vk_prealloc_size_split_k > 0 && vk_prealloc_split_k.size < vk_prealloc_size_split_k) { + if (ctx->prealloc_split_k == nullptr || (ctx->prealloc_size_split_k > 0 && ctx->prealloc_split_k->size < ctx->prealloc_size_split_k)) { // Resize buffer - if (vk_prealloc_split_k.size > 0) { - ggml_vk_destroy_buffer(vk_prealloc_split_k); + if (ctx->prealloc_split_k != nullptr) { + ggml_vk_destroy_buffer(ctx->prealloc_split_k); } - vk_prealloc_split_k = ggml_vk_create_buffer_device(vk_prealloc_size_split_k); + ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_split_k); } - if (vk_staging_size > 0 && vk_staging.size < vk_staging_size) { + if (ctx->staging == nullptr || (ctx->staging_size > 0 && ctx->staging->size < ctx->staging_size)) { // Resize buffer - if (vk_staging.size > 0) { - ggml_vk_destroy_buffer(vk_staging); + if (ctx->staging != nullptr) { + ggml_vk_destroy_buffer(ctx->staging); } - vk_staging = ggml_vk_create_buffer_check(vk_staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); + ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached); } } -void ggml_vk_build_graph(ggml_tensor * node, bool last_node){ +static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){ const bool any_on_device = node->backend == GGML_BACKEND_GPU || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU); - if (vk_disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) { + if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) { return; } #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")" << std::endl; #endif - vk_semaphore_idx = 0; - vk_staging_offset = 0; + ctx->semaphore_idx = 0; + ctx->staging_offset = 0; const ggml_tensor * src0 = node->src[0]; const ggml_tensor * src1 = node->src[1]; @@ -3969,44 +4155,44 @@ void ggml_vk_build_graph(ggml_tensor * node, bool last_node){ return; } - if (vk_ctx == nullptr) { - vk_ctx = ggml_vk_create_context(vk_device.compute_queue); - ggml_vk_ctx_begin(vk_ctx); + if (ctx->compute_ctx == nullptr) { + ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); + ggml_vk_ctx_begin(ctx, ctx->compute_ctx); } switch (node->op) { case GGML_OP_REPEAT: - ggml_vk_repeat(vk_ctx, src0, src1, node); + ggml_vk_repeat(ctx, ctx->compute_ctx, src0, src1, node); break; case GGML_OP_GET_ROWS: - ggml_vk_get_rows(vk_ctx, src0, src1, node); + ggml_vk_get_rows(ctx, ctx->compute_ctx, src0, src1, node); break; case GGML_OP_ADD: - ggml_vk_add(vk_ctx, src0, src1, node); + ggml_vk_add(ctx, ctx->compute_ctx, src0, src1, node); break; case GGML_OP_MUL: - ggml_vk_mul(vk_ctx, src0, src1, node); + ggml_vk_mul(ctx, ctx->compute_ctx, src0, src1, node); break; case GGML_OP_SCALE: - ggml_vk_scale(vk_ctx, src0, node); + ggml_vk_scale(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_SQR: - ggml_vk_sqr(vk_ctx, src0, node); + ggml_vk_sqr(ctx, 
ctx->compute_ctx, src0, node); break; case GGML_OP_CLAMP: - ggml_vk_clamp(vk_ctx, src0, node); + ggml_vk_clamp(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: - ggml_vk_cpy(vk_ctx, src0, node); + ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_RESHAPE: @@ -4014,15 +4200,15 @@ void ggml_vk_build_graph(ggml_tensor * node, bool last_node){ case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: case GGML_OP_NONE: - ggml_vk_nop(vk_ctx, src0, node); + ggml_vk_nop(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_NORM: - ggml_vk_norm(vk_ctx, src0, node); + ggml_vk_norm(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_RMS_NORM: - ggml_vk_rms_norm(vk_ctx, src0, node); + ggml_vk_rms_norm(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_UNARY: @@ -4030,26 +4216,26 @@ void ggml_vk_build_graph(ggml_tensor * node, bool last_node){ case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_RELU: - ggml_vk_unary(vk_ctx, src0, node); + ggml_vk_unary(ctx, ctx->compute_ctx, src0, node); break; default: return; } break; case GGML_OP_DIAG_MASK_INF: - ggml_vk_diag_mask_inf(vk_ctx, src0, node); + ggml_vk_diag_mask_inf(ctx, ctx->compute_ctx, src0, node); break; case GGML_OP_SOFT_MAX: - ggml_vk_soft_max(vk_ctx, src0, src1, node); + ggml_vk_soft_max(ctx, ctx->compute_ctx, src0, src1, node); break; case GGML_OP_ROPE: - ggml_vk_rope(vk_ctx, src0, src1, node); + ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node); break; case GGML_OP_MUL_MAT: - ggml_vk_mul_mat(vk_ctx, src0, src1, node); + ggml_vk_mul_mat(ctx, ctx->compute_ctx, src0, src1, node); break; default: @@ -4057,7 +4243,7 @@ void ggml_vk_build_graph(ggml_tensor * node, bool last_node){ } extra->ready = true; - extra->ctx_idx = vk_ctx->idx; + extra->ctx_idx = ctx->compute_ctx->idx; #ifdef GGML_VULKAN_CHECK_RESULTS // Force context reset on each node so that each tensor ends up in its own context @@ -4066,18 +4252,18 @@ void ggml_vk_build_graph(ggml_tensor * node, bool last_node){ #endif if (node->backend == GGML_BACKEND_CPU || last_node) { - ggml_vk_ctx_end(vk_ctx); - vk_ctx->exit_tensor = node; - vk_ctx = nullptr; + ggml_vk_ctx_end(ctx->compute_ctx); + ctx->compute_ctx->exit_tensor = node; + ctx->compute_ctx = nullptr; } } -bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor){ +static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){ const bool any_on_device = tensor->backend == GGML_BACKEND_GPU || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); - if (vk_disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) { + if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) { return false; } @@ -4145,33 +4331,33 @@ bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor) #endif #ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_check_results_0(params, tensor); + ggml_vk_check_results_0(ctx, params, tensor); #endif GGML_ASSERT(extra->ready); - vk_context& ctx = vk_gc.contexts[extra->ctx_idx]; + vk_context& subctx = ctx->gc.contexts[extra->ctx_idx]; // Only run if ctx hasn't been submitted yet - if (!ctx.seqs.empty()) { + if (!subctx.seqs.empty()) { // Do staging buffer copies - for (auto& cpy : ctx.in_memcpys) { + for (auto& cpy : subctx.in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - 
ggml_vk_submit(&ctx, vk_fence); + ggml_vk_submit(&subctx, ctx->fence); } - if (tensor == ctx.exit_tensor) { - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - vk_device.device.resetFences({ vk_fence }); + if (tensor == subctx.exit_tensor) { + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); // Do staging buffer copies - for (auto& cpy : ctx.out_memcpys) { + for (auto& cpy : subctx.out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ctx.in_memcpys.clear(); - ctx.out_memcpys.clear(); + subctx.in_memcpys.clear(); + subctx.out_memcpys.clear(); } extra->ready = false; @@ -4179,90 +4365,204 @@ bool ggml_vk_compute_forward(ggml_compute_params * params, ggml_tensor * tensor) return true; } -void ggml_vk_graph_cleanup() { - if (vk_disable) { +// Clean up after graph processing is done +static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { + if (ctx->disable) { return; } #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_graph_cleanup()" << std::endl; #endif - for (auto& buffer : vk_gc.temp_buffers) { - ggml_vk_pool_free(buffer); + for (auto& buffer : ctx->gc.temp_buffers) { + ggml_vk_pool_free(ctx, buffer); } - vk_gc.temp_buffers.clear(); + ctx->gc.temp_buffers.clear(); - for (auto * pipeline : vk_gc.pipelines) { - ggml_vk_pipeline_cleanup(*pipeline); - } - vk_gc.pipelines.clear(); - - ggml_vk_queue_cleanup(vk_device.compute_queue); - ggml_vk_queue_cleanup(vk_device.transfer_queue); - - for (size_t i = 0; i < vk_gc.semaphores.size(); i++) { - vk_device.device.destroySemaphore({ vk_gc.semaphores[i].s }); - } - vk_gc.semaphores.clear(); - - for (size_t i = 0; i < vk_gc.tl_semaphores.size(); i++) { - vk_device.device.destroySemaphore({ vk_gc.tl_semaphores[i].s }); - } - vk_gc.tl_semaphores.clear(); - - vk_event_idx = 0; - - for (auto& event : vk_gc.events) { - vk_device.device.resetEvent(event); + for (auto * pipeline : ctx->gc.pipelines) { + ggml_pipeline_cleanup(*pipeline); } - vk_staging_offset = 0; + ggml_vk_queue_cleanup(ctx, ctx->device.lock()->compute_queue); + ggml_vk_queue_cleanup(ctx, ctx->device.lock()->transfer_queue); - vk_ctx = nullptr; - vk_gc.contexts.clear(); + for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) { + ctx->device.lock()->device.destroySemaphore({ ctx->gc.semaphores[i].s }); + } + ctx->gc.semaphores.clear(); + + for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) { + ctx->device.lock()->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s }); + } + ctx->gc.tl_semaphores.clear(); + ctx->semaphore_idx = 0; + + ctx->event_idx = 0; + + for (auto& event : ctx->gc.events) { + ctx->device.lock()->device.resetEvent(event); + } + + ctx->staging_offset = 0; + + ctx->compute_ctx = nullptr; + ctx->transfer_ctx = nullptr; + ctx->gc.contexts.clear(); } -static void ggml_vk_cleanup() { +// Clean up on backend free +static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_cleanup()" << std::endl; + std::cerr << "ggml_vk_cleanup(" << ctx->idx << ")" << std::endl; #endif - ggml_vk_destroy_buffer(vk_prealloc_x); - ggml_vk_destroy_buffer(vk_prealloc_y); - ggml_vk_destroy_buffer(vk_prealloc_split_k); - ggml_vk_destroy_buffer(vk_staging); - ggml_vk_destroy_buffer(vk_sync_staging); + ggml_vk_graph_cleanup(ctx); - vk_prealloc_size_x = 0; - vk_prealloc_size_y = 0; - vk_prealloc_size_split_k = 0; - vk_staging_size = 0; + 
ggml_vk_destroy_buffer(ctx->prealloc_qx); + ggml_vk_destroy_buffer(ctx->prealloc_qy); + ggml_vk_destroy_buffer(ctx->prealloc_x); + ggml_vk_destroy_buffer(ctx->prealloc_y); + ggml_vk_destroy_buffer(ctx->prealloc_split_k); + ggml_vk_destroy_buffer(ctx->staging); + ggml_vk_destroy_buffer(ctx->sync_staging); - for (auto& event : vk_gc.events) { - vk_device.device.destroyEvent(event); + for (auto& buffer : ctx->buffer_pool) { + ggml_vk_destroy_buffer(buffer); } - vk_gc.events.clear(); + + ctx->prealloc_size_qx = 0; + ctx->prealloc_size_qy = 0; + ctx->prealloc_size_x = 0; + ctx->prealloc_size_y = 0; + ctx->prealloc_size_split_k = 0; + ctx->staging_size = 0; + + for (auto& event : ctx->gc.events) { + ctx->device.lock()->device.destroyEvent(event); + } + ctx->gc.events.clear(); + + for (auto* pipeline : ctx->gc.pipelines) { + ggml_vk_destroy_pipeline(ctx, pipeline); + } + ctx->gc.pipelines.clear(); + + ctx->device.lock()->device.destroyFence(ctx->fence); + + ctx->device.lock()->device.destroyCommandPool(ctx->device.lock()->compute_queue.pool); + if (!ctx->device.lock()->single_queue) { + ctx->device.lock()->device.destroyCommandPool(ctx->device.lock()->transfer_queue.pool); + } +} + +GGML_CALL int ggml_vk_get_device_count() { + ggml_vk_instance_init(); + + return vk_instance.device_indices.size(); +} + +GGML_CALL void ggml_vk_get_device_description(int device, char * description, size_t description_size) { + ggml_vk_instance_init(); + + std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices(); + + vk::PhysicalDeviceProperties props; + devices[device].getProperties(&props); + + snprintf(description, description_size, "%s", props.deviceName.data()); +} + +// CPU assist interface + +void ggml_vk_init_cpu_assist() { + ggml_vk_instance_init(); + + std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl; + + for (size_t i = 0; i < ggml_vk_get_device_count(); i++) { + ggml_vk_print_gpu_info(i); + } + // Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
+ ggml_backend_vk_init(0); +} + +void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) { + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + if (!ctx->initialized) { + return; + } + + ggml_vk_preallocate_buffers_graph(ctx, node); +} + +void ggml_vk_preallocate_buffers_cpu_assist() { + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + if (!ctx->initialized) { + return; + } + + ggml_vk_preallocate_buffers(ctx); +} + +void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) { + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + if (!ctx->initialized) { + return; + } + + ggml_vk_build_graph(ctx, node, last_node); +} + +bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){ + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + if (!ctx->initialized) { + return false; + } + + return ggml_vk_compute_forward(ctx, params, tensor); +} + +void ggml_vk_graph_cleanup_cpu_assist() { + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + if (!ctx->initialized) { + return; + } + + ggml_vk_graph_cleanup(ctx); +} + +void ggml_vk_free_cpu_assist() { + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + if (!ctx->initialized || vk_instance.backends[0] == nullptr) { + return; + } + + ggml_backend_vk_free(vk_instance.backends[0]); } // backend interface #define UNUSED GGML_UNUSED -struct ggml_backend_vk_context { - std::string name; -}; - // device backend static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT struct ggml_backend_vk_buffer_context { + ggml_backend_vk_context * ctx; vk_buffer dev_buffer; ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; size_t temp_tensor_extra_index = 0; std::string name; - ggml_backend_vk_buffer_context(vk_buffer dev_buffer) : + ggml_backend_vk_buffer_context(ggml_backend_vk_context * ctx, vk_buffer&& dev_buffer, std::string& name) : + ctx(ctx), dev_buffer(dev_buffer), - name(GGML_VK_NAME) { + name(name) { } ~ggml_backend_vk_buffer_context() { @@ -4294,6 +4594,9 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) { } GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_backend_vk_buffer_free_buffer()" << std::endl; +#endif ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_vk_destroy_buffer(ctx->dev_buffer); delete ctx; @@ -4313,6 +4616,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra(); if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra; extra->buffer_gpu = extra_view->buffer_gpu; extra->offset = extra_view->offset + tensor->view_offs; @@ -4331,11 +4635,13 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu #endif GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - ggml_vk_buffer_write(&extra->buffer_gpu, extra->offset + offset, data, size); + vk_buffer buf = extra->buffer_gpu.lock(); - UNUSED(buffer); + ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size); } GGML_CALL static void 
ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -4344,31 +4650,35 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu #endif GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset + offset, data, size); + vk_buffer buf = extra->buffer_gpu.lock(); - UNUSED(buffer); + ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size); } GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { if (ggml_backend_buffer_is_vk(src->buffer)) { + ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - ggml_vk_buffer_copy(&src_extra->buffer_gpu, src_extra->offset, &dst_extra->buffer_gpu, dst_extra->offset, ggml_nbytes(src)); + vk_buffer src_buf = src_extra->buffer_gpu.lock(); + vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); + + ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src)); return true; } return false; - - UNUSED(buffer); } GGML_CALL static void ggml_backend_vk_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; - ggml_vk_buffer_memset(&ctx->dev_buffer, 0, value, buffer->size); + ggml_vk_buffer_memset(ctx->ctx, ctx->dev_buffer, 0, value, buffer->size); } static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { @@ -4386,6 +4696,7 @@ static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = { // vk buffer type struct ggml_backend_vk_buffer_type_context { std::string name; + ggml_backend_vk_context * ctx; }; GGML_CALL static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft) { @@ -4398,25 +4709,22 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer( #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl; #endif - vk_buffer dev_buffer = ggml_vk_create_buffer_device(size); + ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; + vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size); - ggml_backend_vk_buffer_context * ctx = new ggml_backend_vk_buffer_context(dev_buffer); + ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name); - return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, ctx, size); - - UNUSED(buft); + return ggml_backend_buffer_init(buft, ggml_backend_vk_buffer_interface, bufctx, size); } GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return vk_device.properties.limits.minStorageBufferOffsetAlignment; - - UNUSED(buft); + ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; + return ctx->ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; } GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - return 
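Note that in the buffer callbacks above the tensor extras no longer hold the device buffer directly: `extra->buffer_gpu` is a weak reference that gets `lock()`-ed for the duration of each read, write or copy. The snippet below is a minimal standalone sketch of that ownership pattern, assuming `vk_buffer` is a `std::shared_ptr`-style handle (its definition is not part of this excerpt); the `device_buffer` and `tensor_extra` names are illustrative only.

```cpp
// Sketch of the weak-handle pattern used by the tensor extras above.
// Assumes the buffer is owned via std::shared_ptr elsewhere (e.g. by the
// backend buffer context); all names here are illustrative, not the patch's.
#include <cstdio>
#include <memory>
#include <vector>

struct device_buffer {
    size_t size;
    // ... Vulkan handles would live here ...
};

struct tensor_extra {
    std::weak_ptr<device_buffer> buffer;  // non-owning: must not extend the buffer's lifetime
    size_t offset = 0;
};

static void write_tensor(const tensor_extra & extra, const void * data, size_t size) {
    (void) data;
    // Take temporary ownership only for the duration of the operation.
    if (std::shared_ptr<device_buffer> buf = extra.buffer.lock()) {
        printf("writing %zu bytes at offset %zu into a %zu-byte buffer\n", size, extra.offset, buf->size);
        // ... record/perform the actual copy here ...
    } else {
        printf("buffer was already freed, skipping write\n");
    }
}

int main() {
    auto buf = std::make_shared<device_buffer>(device_buffer{1024});
    tensor_extra extra{buf, 128};

    std::vector<char> data(256);
    write_tensor(extra, data.data(), data.size());

    buf.reset();                                    // the owner frees the buffer
    write_tensor(extra, data.data(), data.size());  // the extra no longer resolves
    return 0;
}
```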
vk_device.max_memory_allocation_size; - - UNUSED(buft); + ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; + return ctx->ctx->device.lock()->max_memory_allocation_size; } GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { @@ -4426,9 +4734,14 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_ } GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_vk(backend); + if (!ggml_backend_is_vk(backend)) { + return false; + } - UNUSED(buft); + ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + + return buft_ctx->ctx->idx == ctx->idx; } static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { @@ -4441,20 +4754,16 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { /* .is_host = */ NULL, }; -GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type() { - static ggml_backend_buffer_type ggml_backend_vk_buffer_type; +GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t idx) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_backend_vk_buffer_type(" << idx << ")" << std::endl; +#endif - static bool ggml_backend_vk_buffer_type_initialized = false; + GGML_ASSERT(idx < vk_instance.device_indices.size()); - if (!ggml_backend_vk_buffer_type_initialized) { - ggml_backend_vk_buffer_type = { - /* .iface = */ ggml_backend_vk_buffer_type_interface, - /* .context = */ new ggml_backend_vk_buffer_type_context{GGML_VK_NAME}, - }; - ggml_backend_vk_buffer_type_initialized = true; - } + ggml_backend_vk_init(idx); - return &ggml_backend_vk_buffer_type; + return &vk_instance.buffer_types[idx]; } // host buffer type @@ -4472,13 +4781,19 @@ GGML_CALL static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buff } GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_vk_host_free(buffer->context); +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_backend_vk_host_buffer_free_buffer()" << std::endl; +#endif + ggml_vk_host_free(&vk_instance.contexts[0], buffer->context); } GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl; +#endif void * ptr = nullptr; try { - ptr = ggml_vk_host_malloc(size); + ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size); } catch (vk::SystemError& e) { std::cerr << "ggml_vulkan: Failed to allocate pinned memory." 
<< std::endl; std::cerr << "ggml_vulkan: " << e.what() << std::endl; @@ -4495,7 +4810,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu } GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return vk_device.properties.limits.minMemoryMapAlignment; + return vk_instance.contexts[0].device.lock()->properties.limits.minMemoryMapAlignment; UNUSED(buft); } @@ -4514,127 +4829,150 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { /* .context = */ nullptr, }; + if (!vk_instance.contexts[0].initialized) { + // Fall back to CPU + return ggml_backend_cpu_buffer_type(); + } + return &ggml_backend_vk_buffer_type_host; } // backend GGML_CALL static const char * ggml_backend_vk_name(ggml_backend_t backend) { - ggml_backend_vk_context * vk_ctx = (ggml_backend_vk_context *)backend->context; + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - return vk_ctx->name.c_str(); + return ctx->name.c_str(); } GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) { - ggml_backend_vk_context * vk_ctx = (ggml_backend_vk_context *)backend->context; + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_backend_vk_free(" << ctx->name << ")" << std::endl; +#endif - delete vk_ctx; + size_t idx = ctx->idx; + + ggml_vk_cleanup(ctx); + + // Release device + vk_instance.devices[ctx->idx].reset(); + ctx->initialized = false; + + vk_instance.initialized[idx] = false; + vk_instance.backends[idx] = nullptr; + memset(&vk_instance.buffer_types[idx], 0, sizeof(ggml_backend_buffer_type)); delete backend; } GGML_CALL static ggml_backend_buffer_type_t ggml_backend_vk_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_vk_buffer_type(); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - UNUSED(backend); + GGML_ASSERT(ctx->initialized); + + return ggml_backend_vk_buffer_type(ctx->idx); } GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl; #endif - GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - if (vk_transfer_ctx == nullptr) { + if (ctx->transfer_ctx == nullptr) { // Initialize new transfer context - vk_transfer_ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(vk_transfer_ctx); + ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, ctx->transfer_ctx); } - ggml_vk_buffer_write_async(vk_transfer_ctx, &extra->buffer_gpu, extra->offset + offset, data, size); + vk_buffer buf = extra->buffer_gpu.lock(); - UNUSED(backend); + ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size); } GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const 
ggml_tensor * tensor, void * data, size_t offset, size_t size) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl; #endif - GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - if (vk_transfer_ctx == nullptr) { + if (ctx->transfer_ctx == nullptr) { // Initialize new transfer context - vk_transfer_ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(vk_transfer_ctx); + ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, ctx->transfer_ctx); } - ggml_vk_buffer_read_async(vk_transfer_ctx, &extra->buffer_gpu, extra->offset + offset, data, size); + vk_buffer buf = extra->buffer_gpu.lock(); - UNUSED(backend); + ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size); } GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl; #endif - if ((dst->buffer->buft == ggml_backend_vk_buffer_type() || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) { + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + if ((dst->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) { ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - if (vk_transfer_ctx == nullptr) { + if (ctx->transfer_ctx == nullptr) { // Initialize new transfer context - vk_transfer_ctx = ggml_vk_create_context(vk_device.transfer_queue); - ggml_vk_ctx_begin(vk_transfer_ctx); + ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_ctx_begin(ctx, ctx->transfer_ctx); } - ggml_vk_buffer_copy_async(vk_transfer_ctx, &src_extra->buffer_gpu, src_extra->offset, &dst_extra->buffer_gpu, dst_extra->offset, ggml_nbytes(src)); + vk_buffer src_buf = src_extra->buffer_gpu.lock(); + vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); + + ggml_vk_buffer_copy_async(ctx->transfer_ctx, src_buf, src_extra->offset, dst_buf, dst_extra->offset, ggml_nbytes(src)); return true; } return false; - - UNUSED(backend); } GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_synchronize()" << std::endl; #endif - if(vk_transfer_ctx == nullptr) { + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + if(ctx->transfer_ctx == nullptr) { return; } - ggml_vk_ctx_end(vk_transfer_ctx); + ggml_vk_ctx_end(ctx->transfer_ctx); - for (auto& cpy : vk_transfer_ctx->in_memcpys) { + for (auto& cpy : ctx->transfer_ctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ggml_vk_submit(vk_transfer_ctx, vk_fence); - VK_CHECK(vk_device.device.waitForFences({ vk_fence }, true, UINT64_MAX), 
"ggml_backend_vk_synchronize waitForFences"); - vk_device.device.resetFences({ vk_fence }); + ggml_vk_submit(ctx->transfer_ctx, ctx->fence); + VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences"); + ctx->device.lock()->device.resetFences({ ctx->fence }); - for (auto& cpy : vk_transfer_ctx->out_memcpys) { + for (auto& cpy : ctx->transfer_ctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - vk_transfer_ctx = nullptr; - - UNUSED(backend); + ctx->transfer_ctx = nullptr; } GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - // ggml_backend_vk_context * vk_ctx = (ggml_backend_vk_context *)backend->context; + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]); + ggml_vk_preallocate_buffers_graph(ctx, cgraph->nodes[i]); } - ggml_vk_preallocate_buffers(); + ggml_vk_preallocate_buffers(ctx); int last_node = cgraph->n_nodes - 1; @@ -4644,7 +4982,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml } for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(cgraph->nodes[i], i == last_node); + ggml_vk_build_graph(ctx,cgraph->nodes[i], i == last_node); } ggml_compute_params params = {}; @@ -4657,19 +4995,19 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml continue; } - bool ok = ggml_vk_compute_forward(¶ms, node); + bool ok = ggml_vk_compute_forward(ctx, ¶ms, node); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } #ifdef GGML_VULKAN_CHECK_RESULTS else { - ggml_vk_check_results_1(¶ms, node); + ggml_vk_check_results_1(ctx, ¶ms, node); } #endif GGML_ASSERT(ok); } - ggml_vk_graph_cleanup(); + ggml_vk_graph_cleanup(ctx); return true; @@ -4734,7 +5072,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const } return false; } break; - // case GGML_OP_DUP: + case GGML_OP_DUP: // case GGML_OP_REPEAT: // { // ggml_type src0_type = op->src[0]->type; @@ -4786,18 +5124,30 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .supports_op = */ ggml_backend_vk_supports_op, }; -GGML_CALL ggml_backend_t ggml_backend_vk_init() { - ggml_vk_init(); // TODO: remove from ggml.c +GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) { + if (vk_instance.initialized[idx]) { + return vk_instance.backends[idx]; + } +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_backend_vk_init(" << idx << ")" << std::endl; +#endif - ggml_backend_vk_context * ctx = new ggml_backend_vk_context { - /* .name = */ GGML_VK_NAME, + ggml_backend_vk_context * ctx = &vk_instance.contexts[idx]; + ggml_vk_init(ctx, idx); + ctx->name = GGML_VK_NAME + std::to_string(idx); + vk_instance.buffer_types[idx] = { + /* .iface = */ ggml_backend_vk_buffer_type_interface, + /* .context = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx }, }; + vk_instance.initialized[idx] = true; ggml_backend_t vk_backend = new ggml_backend { /* .interface = */ ggml_backend_vk_interface, - /* .context = */ ctx + /* .context = */ &vk_instance.contexts[ctx->idx], }; + vk_instance.backends[idx] = vk_backend; + return vk_backend; } @@ -4805,20 +5155,47 @@ GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend) { return backend && backend->iface.get_name == ggml_backend_vk_name; } +GGML_CALL int ggml_backend_vk_get_device_count() { + return 
ggml_vk_get_device_count(); +} + +GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) { + ggml_vk_get_device_description(device, description, description_size); +} + +GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { + GGML_ASSERT(device < vk_instance.device_indices.size()); + + vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]]; + + vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties(); + + for (const vk::MemoryHeap& heap : memprops.memoryHeaps) { + if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) { + *total = heap.size; + *free = heap.size; + break; + } + } +} + // backend registry GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, void * user_data) { - ggml_backend_t vk_backend = ggml_backend_vk_init(); + ggml_backend_t vk_backend = ggml_backend_vk_init((int) (intptr_t) user_data); return vk_backend; UNUSED(params); - UNUSED(user_data); } extern "C" GGML_CALL int ggml_backend_vk_reg_devices(); GGML_CALL int ggml_backend_vk_reg_devices() { - ggml_backend_register(GGML_VK_NAME, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(), nullptr); - return 1; + for (auto idx : vk_instance.device_indices) { + char name[128]; + snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, idx); + ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(idx), (void *) (intptr_t) idx); + } + return vk_instance.device_indices.size(); } // checks @@ -4874,7 +5251,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d } } -static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) { +static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) { void * tensor_data = tensor->data; if (tensor->backend == GGML_BACKEND_GPU) { @@ -4883,7 +5260,7 @@ static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size); + ggml_vk_buffer_read(ctx, extra->buffer_gpu, extra->offset, tensor_data, tensor_size); } std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl; @@ -4944,7 +5321,7 @@ void * comp_result; size_t comp_size; size_t comp_nb[GGML_MAX_DIMS]; size_t check_counter = 0; -static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) { +static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) { if (params->ith != 0) { return; } @@ -4966,7 +5343,7 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * /*.no_alloc =*/ false, }; - struct ggml_context * ctx = ggml_init(iparams); + struct ggml_context * ggml_ctx = ggml_init(iparams); struct ggml_tensor * src0_clone = nullptr; struct ggml_tensor * src1_clone = nullptr; @@ -4979,7 +5356,7 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * void * src1_buffer; if (src0 != nullptr) { - src0_clone = ggml_dup_tensor(ctx, src0); + src0_clone = ggml_dup_tensor(ggml_ctx, src0); src0_size = ggml_nbytes(src0); @@ -4995,7 +5372,7 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * for (int i3 = 0; i3 < src0->ne[3]; i3++) { for (int i2 = 0; i2 < 
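The new `ggml_backend_vk_get_device_memory` above walks the physical device's memory heaps and reports the first device-local heap for both `free` and `total`; core Vulkan has no portable query for currently free memory (that would need an extension such as `VK_EXT_memory_budget`), so the heap size stands in for both values. Below is a standalone Vulkan-Hpp sketch of the same heap walk; it enumerates every physical device rather than going through `vk_instance.device_indices`, so it is an illustration, not the patch's code.

```cpp
// Standalone illustration of the device-local heap query used by
// ggml_backend_vk_get_device_memory (assumes Vulkan-Hpp with exceptions enabled).
#include <vulkan/vulkan.hpp>
#include <cstdio>

int main() {
    vk::ApplicationInfo app_info("heap-query", 1, nullptr, 0, VK_API_VERSION_1_1);
    vk::InstanceCreateInfo instance_info({}, &app_info);
    vk::Instance instance = vk::createInstance(instance_info);

    std::vector<vk::PhysicalDevice> devices = instance.enumeratePhysicalDevices();
    for (size_t i = 0; i < devices.size(); i++) {
        vk::PhysicalDeviceProperties props = devices[i].getProperties();
        vk::PhysicalDeviceMemoryProperties memprops = devices[i].getMemoryProperties();

        for (const vk::MemoryHeap & heap : memprops.memoryHeaps) {
            if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
                // Core Vulkan only exposes the heap size, not current usage,
                // which is why the patch returns it for both *total and *free.
                printf("device %zu (%s): device-local heap %llu MiB\n",
                       i, props.deviceName.data(),
                       (unsigned long long)(heap.size / (1024 * 1024)));
                break;
            }
        }
    }

    instance.destroy();
    return 0;
}
```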
src0->ne[2]; i2++) { const int idx = i3*src0->ne[2] + i2; - ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]); + ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]); } } @@ -5005,10 +5382,10 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * src0_clone->nb[i] = src0_clone->nb[i - 1]*src0_clone->ne[i - 1]; } } else { - if (offset + src0_size >= extra->buffer_gpu.size) { - src0_size = extra->buffer_gpu.size - offset; + if (offset + src0_size >= extra->buffer_gpu->size) { + src0_size = extra->buffer_gpu->size - offset; } - ggml_vk_buffer_read(&extra->buffer_gpu, offset, src0_clone->data, src0_size); + ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset, src0_clone->data, src0_size); memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { @@ -5016,13 +5393,13 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(src0, "src0"); + ggml_vk_print_tensor(ctx, src0, "src0"); } ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone); } if (src1 != nullptr) { - src1_clone = ggml_dup_tensor(ctx, src1); + src1_clone = ggml_dup_tensor(ggml_ctx, src1); src1_size = ggml_nbytes(src1); @@ -5038,7 +5415,7 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * for (int i3 = 0; i3 < src1->ne[3]; i3++) { for (int i2 = 0; i2 < src1->ne[2]; i2++) { const int idx = i3*src1->ne[2] + i2; - ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]); + ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]); } } @@ -5048,10 +5425,10 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * src1_clone->nb[i] = src1_clone->nb[i - 1]*src1_clone->ne[i - 1]; } } else { - if (offset + src1_size >= extra->buffer_gpu.size) { - src1_size = extra->buffer_gpu.size - offset; + if (offset + src1_size >= extra->buffer_gpu->size) { + src1_size = extra->buffer_gpu->size - offset; } - ggml_vk_buffer_read(&extra->buffer_gpu, offset, src1_clone->data, src1_size); + ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset, src1_clone->data, src1_size); memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { @@ -5059,7 +5436,7 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(src1, "src1"); + ggml_vk_print_tensor(ctx, src1, "src1"); std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl; std::cerr << "src1_clone=" << tensor << " src1_clone->backend: " << src1_clone->backend << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl; if (src1->src[0] != nullptr) { @@ -5082,51 +5459,51 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, 
ggml_tensor * } if (tensor->op == GGML_OP_MUL_MAT) { - tensor_clone = ggml_mul_mat(ctx, src0_clone, src1_clone); + tensor_clone = ggml_mul_mat(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_MUL) { - tensor_clone = ggml_mul(ctx, src0_clone, src1_clone); + tensor_clone = ggml_mul(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_SCALE) { - tensor_clone = ggml_scale(ctx, src0_clone, ((float *)tensor->op_params)[0]); + tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]); } else if (tensor->op == GGML_OP_SQR) { - tensor_clone = ggml_sqr(ctx, src0_clone); + tensor_clone = ggml_sqr(ggml_ctx, src0_clone); } else if (tensor->op == GGML_OP_CLAMP) { - tensor_clone = ggml_clamp(ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]); + tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]); } else if (tensor->op == GGML_OP_ADD) { - tensor_clone = ggml_add(ctx, src0_clone, src1_clone); + tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_NORM) { - tensor_clone = ggml_norm(ctx, src0_clone, *(float *)tensor->op_params); + tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_RMS_NORM) { - tensor_clone = ggml_rms_norm(ctx, src0_clone, *(float *)tensor->op_params); + tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_SOFT_MAX) { if (src1 != nullptr) { - tensor_clone = ggml_soft_max_ext(ctx, src0_clone, src1_clone, *(float *)tensor->op_params); + tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, *(float *)tensor->op_params); } else { - tensor_clone = ggml_soft_max(ctx, src0_clone); + tensor_clone = ggml_soft_max(ggml_ctx, src0_clone); } } else if (tensor->op == GGML_OP_DIAG_MASK_INF) { - tensor_clone = ggml_diag_mask_inf(ctx, src0_clone, *(float *)tensor->op_params); + tensor_clone = ggml_diag_mask_inf(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_ROPE) { const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; - const int n_ctx = ((int32_t *) tensor->op_params)[3]; - const int n_orig_ctx = ((int32_t *) tensor->op_params)[4]; + const int n_ggml_ctx = ((int32_t *) tensor->op_params)[3]; + const int n_orig_ggml_ctx = ((int32_t *) tensor->op_params)[4]; float freq_base = ((float *) tensor->op_params)[5]; float freq_scale = ((float *) tensor->op_params)[6]; float ext_factor = ((float *) tensor->op_params)[7]; float attn_factor = ((float *) tensor->op_params)[8]; float beta_fast = ((float *) tensor->op_params)[9]; float beta_slow = ((float *) tensor->op_params)[10]; - tensor_clone = ggml_rope_custom(ctx, src0_clone, src1_clone, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + tensor_clone = ggml_rope_custom(ggml_ctx, src0_clone, src1_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); } else if (tensor->op == GGML_OP_UNARY) { switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_SILU: - tensor_clone = ggml_silu(ctx, src0_clone); + tensor_clone = ggml_silu(ggml_ctx, src0_clone); break; case GGML_UNARY_OP_GELU: - tensor_clone = ggml_gelu(ctx, src0_clone); + tensor_clone = ggml_gelu(ggml_ctx, src0_clone); break; case GGML_UNARY_OP_RELU: - tensor_clone = ggml_relu(ctx, 
src0_clone); + tensor_clone = ggml_relu(ggml_ctx, src0_clone); break; default: std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; @@ -5134,40 +5511,40 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * } } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { if (src1 == nullptr) { - tensor_clone = ggml_dup(ctx, src0_clone); + tensor_clone = ggml_dup(ggml_ctx, src0_clone); tensor_clone->type = tensor->type; } else { - tensor_clone = ggml_cpy(ctx, src0_clone, src1_clone); + tensor_clone = ggml_cpy(ggml_ctx, src0_clone, src1_clone); } } else if (tensor->op == GGML_OP_CONT) { - tensor_clone = ggml_cont_4d(ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + tensor_clone = ggml_cont_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); } else if (tensor->op == GGML_OP_RESHAPE) { - tensor_clone = ggml_reshape_4d(ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + tensor_clone = ggml_reshape_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); } else if (tensor->op == GGML_OP_VIEW) { - tensor_clone = ggml_view_4d(ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]); + tensor_clone = ggml_view_4d(ggml_ctx, src0_clone, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]); } else if (tensor->op == GGML_OP_PERMUTE) { int32_t * params = (int32_t *)tensor->op_params; - tensor_clone = ggml_permute(ctx, src0_clone, params[0], params[1], params[2], params[3]); + tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]); } else if (tensor->op == GGML_OP_TRANSPOSE) { - tensor_clone = ggml_transpose(ctx, src0_clone); + tensor_clone = ggml_transpose(ggml_ctx, src0_clone); } else { std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; GGML_ASSERT(false); } // Disable vulkan here to avoid the hooks in ggml.c - vk_disable = true; + ctx->disable = true; - ggml_cgraph * cgraph = ggml_new_graph(ctx); + ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); ggml_build_forward_expand(cgraph, tensor_clone); - ggml_graph_compute_with_ctx(ctx, cgraph, 8); + ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8); - vk_disable = false; + ctx->disable = false; ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone); if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(tensor_clone, "tensor_clone"); + ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone"); } comp_size = ggml_nbytes(tensor_clone); @@ -5183,10 +5560,10 @@ static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * free(src1_buffer); } - ggml_free(ctx); + ggml_free(ggml_ctx); } -void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor) { +static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) { if (params->ith != 0) { return; } @@ -5208,11 +5585,11 @@ void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor) ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - if (extra->offset + tensor_size >= extra->buffer_gpu.size) { - tensor_size = extra->buffer_gpu.size - (extra->offset); + if (extra->offset + tensor_size >= 
extra->buffer_gpu->size) { + tensor_size = extra->buffer_gpu->size - (extra->offset); } - ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size); + ggml_vk_buffer_read(ctx, extra->buffer_gpu, extra->offset, tensor_data, tensor_size); } float first_error_result = -1.0f; @@ -5339,4 +5716,10 @@ void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor) free(tensor_data); } } + +void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor) { + ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; + + ggml_vk_check_results_0(ctx, params, tensor); +} #endif diff --git a/ggml-vulkan.h b/ggml-vulkan.h index eb8a148e2..9645126b4 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -8,24 +8,29 @@ extern "C" { #endif #define GGML_VK_NAME "Vulkan" +#define GGML_VK_MAX_DEVICES 16 -GGML_API void ggml_vk_init(void); +GGML_API void ggml_vk_init_cpu_assist(void); -GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node); -GGML_API void ggml_vk_preallocate_buffers(void); -GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node); -GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); +GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node); +GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void); +GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node); +GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor); #ifdef GGML_VULKAN_CHECK_RESULTS -void ggml_vk_check_results_1(struct ggml_compute_params * params, struct ggml_tensor * tensor); +void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor); #endif -GGML_API void ggml_vk_graph_cleanup(void); +GGML_API void ggml_vk_graph_cleanup_cpu_assist(void); +GGML_API void ggml_vk_free_cpu_assist(void); // backend API -GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(void); +GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num); GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend); +GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void); +GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); +GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); -GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(void); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); diff --git a/ggml.c b/ggml.c index b9ec0c981..f783a6fd3 100644 --- a/ggml.c +++ b/ggml.c @@ -2343,7 +2343,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { #elif defined(GGML_USE_CLBLAST) ggml_cl_init(); #elif defined(GGML_USE_VULKAN) - ggml_vk_init(); + ggml_vk_init_cpu_assist(); #elif defined(GGML_USE_SYCL) ggml_init_sycl(); #endif @@ -14850,10 +14850,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); #elif defined(GGML_USE_VULKAN) - const bool skip_cpu = 
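With the header changes above, `ggml-vulkan.h` now exposes a per-device API: `ggml_backend_vk_get_device_count`, `ggml_backend_vk_get_device_description`, `ggml_backend_vk_get_device_memory`, `ggml_backend_vk_init(dev_num)` and `ggml_backend_vk_buffer_type(dev_num)`. A hypothetical usage sketch (not part of the patch, error handling omitted) that enumerates the devices and creates one backend per device:

```cpp
// Hypothetical standalone usage of the per-device API added to ggml-vulkan.h.
#include "ggml-backend.h"
#include "ggml-vulkan.h"
#include <cstdio>

int main() {
    const int n_devices = ggml_backend_vk_get_device_count();
    for (int i = 0; i < n_devices; i++) {
        char description[256];
        ggml_backend_vk_get_device_description(i, description, sizeof(description));

        size_t free_mem = 0, total_mem = 0;
        ggml_backend_vk_get_device_memory(i, &free_mem, &total_mem);

        printf("Vulkan%d: %s, %zu MiB device-local memory\n", i, description, total_mem / (1024 * 1024));

        // One backend instance per device; repeated calls with the same index
        // return the cached backend.
        ggml_backend_t backend = ggml_backend_vk_init(i);
        // ... allocate buffers with ggml_backend_vk_buffer_type(i) and run graphs ...
        ggml_backend_free(backend);
    }
    return 0;
}
```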
ggml_vk_compute_forward(params, tensor); + const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor); #ifdef GGML_VULKAN_CHECK_RESULTS if (skip_cpu) { - ggml_vk_check_results_1(params, tensor); + ggml_vk_check_results_1_cpu_assist(params, tensor); } #endif if (skip_cpu) { @@ -17269,12 +17269,12 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { #ifdef GGML_USE_VULKAN for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]); + ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]); } - ggml_vk_preallocate_buffers(); + ggml_vk_preallocate_buffers_cpu_assist(); for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1); + ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1); } #endif @@ -17330,7 +17330,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { } #ifdef GGML_USE_VULKAN - ggml_vk_graph_cleanup(); + ggml_vk_graph_cleanup_cpu_assist(); #endif // performance stats (graph) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index ed8e26f83..1cfd41c0b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -104,6 +104,7 @@ class MODEL_ARCH(IntEnum): CODESHELL = auto() ORION = auto() INTERNLM2 = auto() + MINICPM = auto() class MODEL_TENSOR(IntEnum): @@ -156,6 +157,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.ORION: "orion", MODEL_ARCH.INTERNLM2: "internlm2", + MODEL_ARCH.MINICPM: "minicpm", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -464,6 +466,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.MINICPM: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], # TODO } diff --git a/llama.cpp b/llama.cpp index 65e399adc..c45ae1d50 100644 --- a/llama.cpp +++ b/llama.cpp @@ -205,6 +205,7 @@ enum llm_arch { LLM_ARCH_CODESHELL, LLM_ARCH_ORION, LLM_ARCH_INTERNLM2, + LLM_ARCH_MINICPM, LLM_ARCH_UNKNOWN, }; @@ -228,6 +229,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_ORION, "orion" }, { LLM_ARCH_INTERNLM2, "internlm2" }, + { LLM_ARCH_MINICPM, "minicpm" }, }; enum llm_kv { @@ -690,6 +692,29 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_MINICPM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, 
+ { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, + { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1330,7 +1355,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { #elif defined(GGML_USE_CUBLAS) buft = ggml_backend_cuda_buffer_type(gpu); #elif defined(GGML_USE_VULKAN) - buft = ggml_backend_vk_buffer_type(); + buft = ggml_backend_vk_buffer_type(gpu); #elif defined(GGML_USE_SYCL) buft = ggml_backend_sycl_buffer_type(gpu); #elif defined(GGML_USE_CLBLAST) @@ -1367,6 +1392,33 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g GGML_UNUSED(tensor_split); } +static size_t llama_get_device_count() { +#if defined(GGML_USE_CUBLAS) + return ggml_backend_cuda_get_device_count(); +#elif defined(GGML_USE_VULKAN) + return ggml_backend_vk_get_device_count(); +#else + return 1; +#endif +} + +static size_t llama_get_device_memory(int device) { +#if defined(GGML_USE_CUBLAS) + size_t total; + size_t free; + ggml_backend_cuda_get_device_memory(device, &total, &free); + return free; +#elif defined(GGML_USE_VULKAN) + size_t total; + size_t free; + ggml_backend_vk_get_device_memory(device, &total, &free); + return free; +#else + return 1; + GGML_UNUSED(device); +#endif +} + // // globals // @@ -1390,6 +1442,7 @@ enum e_model { MODEL_UNKNOWN, MODEL_0_5B, MODEL_1B, + MODEL_2B, MODEL_3B, MODEL_4B, MODEL_7B, @@ -1737,6 +1790,10 @@ struct llama_context { ggml_backend_free(backend); } +#ifdef GGML_USE_VULKAN + ggml_vk_free_cpu_assist(); +#endif + ggml_backend_buffer_free(buf_input); ggml_free(ctx_input); } @@ -2748,6 +2805,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { static const char * llama_model_type_name(e_model type) { switch (type) { case MODEL_1B: return "1B"; + case MODEL_2B: return "2B"; case MODEL_3B: return "3B"; case MODEL_7B: return "7B"; case MODEL_8B: return "8B"; @@ -2887,6 +2945,13 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_MINICPM: + { + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_2B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_FALCON: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -3402,22 +3467,18 @@ static bool llm_load_tensors( model.buft_layer[i] = llama_default_buffer_type_cpu(true); } -#ifdef GGML_USE_CUBLAS if (split_mode == LLAMA_SPLIT_LAYER) { // calculate the split points - int device_count = ggml_backend_cuda_get_device_count(); + int device_count = llama_get_device_count(); bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); - float splits[GGML_CUDA_MAX_DEVICES]; + std::vector<float> splits(device_count); if (all_zero) { // default split, by free memory for (int i = 0; i < device_count; ++i) { - size_t total; - size_t free; - ggml_backend_cuda_get_device_memory(i, &total, &free); - splits[i] = free; + splits[i] = llama_get_device_memory(i); } } else { - std::copy(tensor_split, tensor_split + device_count, splits); + std::copy(tensor_split, tensor_split + device_count, splits.begin()); } // sum and normalize the splits to get the split points @@ -3433,19 +3494,17 @@ static bool llm_load_tensors( // assign the repeating layers to the devices according to the splits int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); for (int64_t i = i_gpu_start; i < n_layer; ++i) { - int layer_gpu = std::upper_bound(splits, splits + device_count, float(i 
- i_gpu_start)/act_gpu_layers) - splits; + int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin(); model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu); } // assign the output layer if (n_gpu_layers > n_layer) { - int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits; + int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin(); model.buft_output = llama_default_buffer_type_offload(layer_gpu); } else { model.buft_output = llama_default_buffer_type_cpu(true); } - } else -#endif - { + } else { ggml_backend_buffer_type_t split_buft; if (split_mode == LLAMA_SPLIT_ROW) { split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); @@ -3524,13 +3583,16 @@ static bool llm_load_tensors( switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: + case LLM_ARCH_MINICPM: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + if (model.arch != LLM_ARCH_MINICPM){ + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } } for (int i = 0; i < n_layer; ++i) { @@ -6781,6 +6843,153 @@ struct llm_build_context { return gf; } + // ref: https://arxiv.org/abs/2203.03466 + // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738 + // based on the original build_llama() function + struct ggml_cgraph * build_minicpm() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + const int64_t n_embd = hparams.n_embd; + //TODO: if the model varies, these parameters need to be read from the model + const int64_t n_embd_base = 256; + const float scale_embd = 12.0f; + const float scale_depth = 1.4f; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + // scale the input embeddings + inpL = ggml_scale(ctx0, inpL, scale_embd); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { 
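The `LLAMA_SPLIT_LAYER` path shown above now works for any backend that reports a device count and per-device memory: the per-device weights (free memory by default, or the user-supplied `tensor_split`) become cumulative, normalized split points, and each offloaded layer is mapped to a device with `std::upper_bound`. The normalization step itself falls between the two hunks shown, so the sketch below only illustrates the idea under that assumption, with made-up memory sizes.

```cpp
// Sketch of the layer-to-device mapping implied by the split logic above.
// Illustration only, not the patch's code.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    // Per-device weights, e.g. free VRAM in GiB as reported per device (made-up values).
    std::vector<float> splits = { 8.0f, 16.0f, 8.0f };

    // Turn the weights into cumulative split points normalized to 1.0
    // (assumed to be what happens between the two hunks above).
    for (size_t i = 1; i < splits.size(); i++) {
        splits[i] += splits[i - 1];
    }
    const float total = splits.back();
    for (float & s : splits) {
        s /= total;
    }
    // splits is now { 0.25, 0.75, 1.0 }

    // Map each offloaded layer to the first device whose split point
    // exceeds the fraction of layers handled so far.
    const int act_gpu_layers = 32;
    for (int i = 0; i < act_gpu_layers; i++) {
        const int layer_gpu = std::upper_bound(splits.begin(), splits.end(), float(i) / act_gpu_layers) - splits.begin();
        printf("layer %2d -> device %d\n", i, layer_gpu);
    }
    return 0;
}
```

With these weights, devices 0, 1 and 2 receive 8, 16 and 8 of the 32 offloaded layers, matching their share of the total.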
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + // scale_res - scale the hidden states for residual connection + const float scale_res = scale_depth/sqrtf(float(n_layer)); + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled", -1); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + // scale the hidden states for residual connection + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled_ffn", -1); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head scaling + const float scale_lmhead = float(n_embd_base)/float(n_embd); + cur = ggml_scale(ctx0, cur, scale_lmhead); + cb(cur, "lmhead_scaling", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph( @@ -6943,6 +7152,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_internlm2(); } break; + case LLM_ARCH_MINICPM: + { + result = llm.build_minicpm(); + } break; default: GGML_ASSERT(false); } @@ -10295,6 +10508,8 @@ size_t llama_max_devices(void) { return GGML_CUDA_MAX_DEVICES; #elif defined(GGML_USE_SYCL) return GGML_SYCL_MAX_DEVICES; +#elif defined(GGML_USE_VULKAN) + return GGML_VK_MAX_DEVICES; #else return 1; #endif @@ -10502,13 +10717,15 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_VULKAN) if (model->n_gpu_layers > 0) { - ggml_backend_t backend = ggml_backend_vk_init(); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__); - llama_free(ctx); - return nullptr; + for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) { + ggml_backend_t backend = 
ggml_backend_vk_init(device); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); } - ctx->backends.push_back(backend); } #elif defined(GGML_USE_SYCL) if (model->n_gpu_layers > 0) {