Merge branch 'ggerganov:master' into master

Jianlin Shi 2025-01-17 15:57:22 -07:00 committed by GitHub
commit 059325d9b4
160 changed files with 10578 additions and 7369 deletions

.gitignore

@@ -18,6 +18,7 @@
 *.metallib
 *.o
 *.so
+*.swp
 *.tmp
 # IDE / OS

CONTRIBUTING.md

@@ -20,14 +20,104 @@
 - Avoid adding third-party dependencies, extra files, extra headers, etc.
 - Always consider cross-compatibility with other operating systems and architectures
 - Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
-- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
+- Vertical alignment makes things more readable and easier to batch edit
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
-- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
+- Use sized integer types such as `int32_t` in the public API, e.g. `size_t` may also be appropriate for allocation sizes or byte offsets
+- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo`
+- In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary
+```cpp
+// OK
+llama_context * ctx;
+const llama_rope_type rope_type;
+// not OK
+struct llama_context * ctx;
+const enum llama_rope_type rope_type;
+```
+_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
+- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
+- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
 - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
 - Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
 ![matmul](media/matmul.png)
+# Naming guidelines
+- Use `snake_case` for function, variable and type names
+- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
+```cpp
+// not OK
+int small_number;
+int big_number;
+// OK
+int number_small;
+int number_big;
+```
+- Enum values are always in upper case and prefixed with the enum name
+```cpp
+enum llama_vocab_type {
+LLAMA_VOCAB_TYPE_NONE = 0,
+LLAMA_VOCAB_TYPE_SPM = 1,
+LLAMA_VOCAB_TYPE_BPE = 2,
+LLAMA_VOCAB_TYPE_WPM = 3,
+LLAMA_VOCAB_TYPE_UGM = 4,
+LLAMA_VOCAB_TYPE_RWKV = 5,
+};
+```
+- The general naming pattern is `<class>_<method>`, with `<method>` being `<action>_<noun>`
+```cpp
+llama_model_init(); // class: "llama_model", method: "init"
+llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove"
+llama_sampler_get_seed(); // class: "llama_sampler", method: "get_seed"
+llama_set_embeddings(); // class: "llama_context", method: "set_embeddings"
+llama_n_threads(); // class: "llama_context", method: "n_threads"
+llama_adapter_lora_free(); // class: "llama_adapter_lora", method: "free"
+```
+- The `get` `<action>` can be omitted
+- The `<noun>` can be omitted if not necessary
+- The `_context` suffix of the `<class>` is optional. Use it to disambiguate symbols when needed
+- Use `init`/`free` for constructor/destructor `<action>`
+- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else
+```cpp
+typedef struct llama_context * llama_context_t;
+enum llama_pooling_type llama_pooling_type(const llama_context_t ctx);
+```
+_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_
+- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension
+- Python filenames are all lowercase with underscores
+- _(TODO: abbreviations usage)_
+# Preprocessor directives
+- _(TODO: add guidelines with examples and apply them to the codebase)_
+```cpp
+#ifdef FOO
+#endif // FOO
+```
+# Documentation
+- Documentation is a community effort
+- When you need to look into the source code to figure out how to use an API consider adding a short summary to the header file for future reference
+- When you notice incorrect or outdated documentation, please update it
 # Resources
 The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
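As an editorial aside on the `ggml_mul_mat` convention referenced in the guidelines above: the following minimal sketch is illustrative only and not part of this commit; the sizes `k`, `m`, `n` are made up. It shows how the result shape follows from the two inputs under the "dimension 0 is columns" convention.

```cpp
#include "ggml.h"

int main(void) {
    // small scratch context; the size is arbitrary for this sketch
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    const int64_t k = 4, m = 3, n = 2;

    // dimension 0 is columns, dimension 1 is rows:
    // a is an m x k matrix (A), b is an n x k matrix (B)
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, m);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, n);

    // C = ggml_mul_mat(ctx, A, B) computes C^T = A B^T, i.e. C = B A^T,
    // so the result has m columns and n rows
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);
    GGML_ASSERT(c->ne[0] == m && c->ne[1] == n);

    ggml_free(ctx);
    return 0;
}
```

Both inputs share dimension 0 (here `k`), which is the contraction axis.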

README.md

@@ -99,6 +99,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
 - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
+- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 #### Multimodal
@@ -203,6 +204,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
 - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
 - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
+- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
 </details>
@@ -244,6 +246,8 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
 - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
 - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf <user>/<model>[:quant]`
 After downloading a model, use the CLI tools to run it locally - see below.
 `llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
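To illustrate the `-hf` shorthand added in the hunk above (illustrative only, not part of this commit; the repository name is borrowed from examples elsewhere in this change), a typical invocation might look like:

```bash
# downloads (and caches) the GGUF from Hugging Face, then runs it;
# the quant tag after ':' is optional and defaults to Q4_K_M
llama-cli -hf bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
```

The downloaded file is stored under a local path derived from the repo and file name, so subsequent runs should skip the download.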
@@ -262,21 +266,12 @@ To learn more about model quantization, [read this documentation](examples/quant
 #### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
 - <details open>
-<summary>Run simple text completion</summary>
-```bash
-llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128
-# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
-```
-</details>
-- <details>
 <summary>Run in conversation mode</summary>
+Models with a built-in chat template will automatically activate conversation mode. If this doesn't occur, you can manually enable it by adding `-cnv` and specifying a suitable chat template with `--chat-template NAME`
 ```bash
-llama-cli -m model.gguf -p "You are a helpful assistant" -cnv
+llama-cli -m model.gguf
 # > hi, who are you?
 # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
@@ -288,17 +283,28 @@ To learn more about model quantization, [read this documentation](examples/quant
 </details>
 - <details>
-<summary>Run with custom chat template</summary>
+<summary>Run in conversation mode with custom chat template</summary>
 ```bash
-# use the "chatml" template
+# use the "chatml" template (use -h to see the list of supported templates)
-llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
+llama-cli -m model.gguf -cnv --chat-template chatml
 # use a custom template
-llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
+llama-cli -m model.gguf -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
 ```
-[Supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+</details>
+- <details>
+<summary>Run simple text completion</summary>
+To disable conversation mode explicitly, use `-no-cnv`
+```bash
+llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
+# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
+```
 </details>

ci/run.sh

@@ -326,17 +326,17 @@ function gg_run_open_llama_7b_v2 {
 ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
 ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
-(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
 (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -460,17 +460,17 @@ function gg_run_pythia_1_4b {
 ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
 ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
-(time ./bin/llama-cli --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/llama-cli --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/llama-cli --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/llama-cli --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/llama-cli --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/llama-cli --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/llama-cli --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/llama-cli --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/llama-cli --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/llama-cli --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/llama-cli --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
 (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -591,17 +591,17 @@ function gg_run_pythia_2_8b {
 ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
 ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
-(time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-(time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-(time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-(time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-(time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-(time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-(time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-(time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-(time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-(time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-(time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
 (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

common/arg.cpp

@@ -130,17 +130,26 @@ std::string common_arg::to_string() {
 static void common_params_handle_model_default(
 std::string & model,
-std::string & model_url,
+const std::string & model_url,
 std::string & hf_repo,
-std::string & hf_file) {
+std::string & hf_file,
+const std::string & hf_token) {
 if (!hf_repo.empty()) {
 // short-hand to avoid specifying --hf-file -> default it to --model
 if (hf_file.empty()) {
 if (model.empty()) {
-throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
+auto auto_detected = common_get_hf_file(hf_repo, hf_token);
+if (auto_detected.first.empty() || auto_detected.second.empty()) {
+exit(1); // built without CURL, error message already printed
 }
+hf_repo = auto_detected.first;
+hf_file = auto_detected.second;
+} else {
 hf_file = model;
-} else if (model.empty()) {
+}
+}
+// make sure model path is present (for caching purposes)
+if (model.empty()) {
 // this is to avoid different repo having same file name, or same file name in different subdirs
 std::string filename = hf_repo + "_" + hf_file;
 // to make sure we don't have any slashes in the filename
@@ -290,8 +299,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 }
 // TODO: refactor model params in a common struct
-common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file);
+common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token);
-common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);
+common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token);
 if (params.escape) {
 string_process_escapes(params.prompt);
@@ -367,6 +376,30 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
 return devices;
 }
+static void add_rpc_devices(std::string servers) {
+auto rpc_servers = string_split<std::string>(servers, ',');
+if (rpc_servers.empty()) {
+throw std::invalid_argument("no RPC servers specified");
+}
+ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+if (!rpc_reg) {
+throw std::invalid_argument("failed to find RPC backend");
+}
+typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+if (!ggml_backend_rpc_add_device_fn) {
+throw std::invalid_argument("failed to find RPC device add function");
+}
+for (const auto & server : rpc_servers) {
+ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+if (dev) {
+ggml_backend_device_register(dev);
+} else {
+throw std::invalid_argument("failed to register RPC device");
+}
+}
+}
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
 auto ctx_arg = common_params_parser_init(params, ex, print_usage);
 const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -768,15 +801,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
 add_opt(common_arg(
 {"-cnv", "--conversation"},
-string_format(
 "run in conversation mode:\n"
 "- does not print special tokens and suffix/prefix\n"
 "- interactive mode is also enabled\n"
-"(default: %s)",
+"(default: auto enabled if chat template is available)",
-params.conversation ? "true" : "false"
-),
 [](common_params & params) {
-params.conversation = true;
+params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
+}
+).set_examples({LLAMA_EXAMPLE_MAIN}));
+add_opt(common_arg(
+{"-no-cnv", "--no-conversation"},
+"force disable conversation mode (default: false)",
+[](common_params & params) {
+params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
 }
 ).set_examples({LLAMA_EXAMPLE_MAIN}));
 add_opt(common_arg(
@@ -1372,7 +1409,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 {"--rpc"}, "SERVERS",
 "comma separated list of RPC servers",
 [](common_params & params, const std::string & value) {
-params.rpc_servers = value;
+add_rpc_devices(value);
+GGML_UNUSED(params);
 }
 ).set_env("LLAMA_ARG_RPC"));
 }
@@ -1583,21 +1621,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 }
 ).set_env("LLAMA_ARG_MODEL_URL"));
 add_opt(common_arg(
-{"-hfr", "--hf-repo"}, "REPO",
+{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
-"Hugging Face model repository (default: unused)",
+"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
+"example: unsloth/phi-4-GGUF:q4_k_m\n"
+"(default: unused)",
 [](common_params & params, const std::string & value) {
 params.hf_repo = value;
 }
 ).set_env("LLAMA_ARG_HF_REPO"));
 add_opt(common_arg(
 {"-hff", "--hf-file"}, "FILE",
-"Hugging Face model file (default: unused)",
+"Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
 [](common_params & params, const std::string & value) {
 params.hf_file = value;
 }
 ).set_env("LLAMA_ARG_HF_FILE"));
 add_opt(common_arg(
-{"-hfrv", "--hf-repo-v"}, "REPO",
+{"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
 "Hugging Face model repository for the vocoder model (default: unused)",
 [](common_params & params, const std::string & value) {
 params.vocoder.hf_repo = value;

common/common.cpp

@@ -73,6 +73,22 @@
 #include <sys/syslimits.h>
 #endif
 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+//
+// CURL utils
+//
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+struct curl_slist * ptr = nullptr;
+~curl_slist_ptr() {
+if (ptr) {
+curl_slist_free_all(ptr);
+}
+}
+};
 #endif // LLAMA_USE_CURL
 using json = nlohmann::ordered_json;
@@ -857,21 +873,23 @@ struct common_init_result common_init_from_params(common_params & params) {
 return iparams;
 }
+const llama_vocab * vocab = llama_model_get_vocab(model);
 if (params.reranking) {
 bool ok = true;
-if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
+if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
+LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
 ok = false;
 }
-if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
 ok = false;
 }
-if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
+if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
-LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
+LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
 ok = false;
 }
@@ -884,7 +902,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 auto cparams = common_context_params_to_llama(params);
-llama_context * lctx = llama_new_context_with_model(model, cparams);
+llama_context * lctx = llama_init_from_model(model, cparams);
 if (lctx == NULL) {
 LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
 llama_model_free(model);
@@ -898,7 +916,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 if (!params.control_vectors.empty()) {
 if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
 const auto cvec = common_control_vector_load(params.control_vectors);
 if (cvec.n_embd == -1) {
@@ -908,7 +926,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 return iparams;
 }
-int err = llama_control_vector_apply(lctx,
+int err = llama_apply_adapter_cvec(
+lctx,
 cvec.data.data(),
 cvec.data.size(),
 cvec.n_embd,
@@ -924,8 +943,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 // load and optionally apply lora adapters
 for (auto & la : params.lora_adapters) {
-llama_lora_adapter_ptr lora;
+llama_adapter_lora_ptr lora;
-lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
 if (lora == nullptr) {
 LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
 llama_free(lctx);
@@ -938,17 +957,17 @@ struct common_init_result common_init_from_params(common_params & params) {
 }
 if (!params.lora_init_without_apply) {
-common_lora_adapters_apply(lctx, params.lora_adapters);
+common_set_adapter_lora(lctx, params.lora_adapters);
 }
-if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
 params.sampling.ignore_eos = false;
 }
 if (params.sampling.ignore_eos) {
-for (llama_token i = 0; i < llama_n_vocab(model); i++) {
+for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-if (llama_token_is_eog(model, i)) {
+if (llama_vocab_is_eog(vocab, i)) {
 LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
 params.sampling.logit_bias.push_back({i, -INFINITY});
 }
@@ -969,8 +988,9 @@ struct common_init_result common_init_from_params(common_params & params) {
 LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 std::vector<llama_token> tmp;
-llama_token bos = llama_token_bos(model);
+llama_token bos = llama_vocab_bos(vocab);
-llama_token eos = llama_token_eos(model);
+llama_token eos = llama_vocab_eos(vocab);
 // some models (e.g. T5) don't have a BOS token
 if (bos != LLAMA_TOKEN_NULL) {
 tmp.push_back(bos);
@@ -1005,11 +1025,11 @@ struct common_init_result common_init_from_params(common_params & params) {
 return iparams;
 }
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
-llama_lora_adapter_clear(ctx);
+llama_clear_adapter_lora(ctx);
 for (auto & la : lora) {
 if (la.scale != 0.0f) {
-llama_lora_adapter_set(ctx, la.ptr, la.scale);
+llama_set_adapter_lora(ctx, la.ptr, la.scale);
 }
 }
 }
@@ -1023,7 +1043,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
 if (params.n_gpu_layers != -1) {
 mparams.n_gpu_layers = params.n_gpu_layers;
 }
-mparams.rpc_servers = params.rpc_servers.c_str();
 mparams.main_gpu = params.main_gpu;
 mparams.split_mode = params.split_mode;
 mparams.tensor_split = params.tensor_split;
@@ -1126,7 +1145,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
 // Initialize libcurl
-std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
+curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+curl_slist_ptr http_headers;
 if (!curl) {
 LOG_ERR("%s: error initializing libcurl\n", __func__);
 return false;
@@ -1140,11 +1160,9 @@ static bool common_download_file(const std::string & url, const std::string & pa
 // Check if hf-token or bearer-token was specified
 if (!hf_token.empty()) {
-std::string auth_header = "Authorization: Bearer ";
+std::string auth_header = "Authorization: Bearer " + hf_token;
-auth_header += hf_token.c_str();
+http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-struct curl_slist *http_headers = NULL;
+curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-http_headers = curl_slist_append(http_headers, auth_header.c_str());
-curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
 }
 #if defined(_WIN32)
@@ -1440,6 +1458,80 @@ struct llama_model * common_load_model_from_hf(
 return common_load_model_from_url(model_url, local_path, hf_token, params);
 }
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
+auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+std::string tag = parts.size() > 1 ? parts.back() : "latest";
+std::string hf_repo = parts[0];
+if (string_split<std::string>(hf_repo, '/').size() != 2) {
+throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+}
+// fetch model info from Hugging Face Hub API
+json model_info;
+curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
+curl_slist_ptr http_headers;
+std::string res_str;
+std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+return size * nmemb;
+};
+curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+#if defined(_WIN32)
+curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+if (!hf_token.empty()) {
+std::string auth_header = "Authorization: Bearer " + hf_token;
+http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+}
+// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+CURLcode res = curl_easy_perform(curl.get());
+if (res != CURLE_OK) {
+throw std::runtime_error("error: cannot make GET request to HF API");
+}
+long res_code;
+curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+if (res_code == 200) {
+model_info = json::parse(res_str);
+} else if (res_code == 401) {
+throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+} else {
+throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+}
+// check response
+if (!model_info.contains("ggufFile")) {
+throw std::runtime_error("error: model does not have ggufFile");
+}
+json & gguf_file = model_info.at("ggufFile");
+if (!gguf_file.contains("rfilename")) {
+throw std::runtime_error("error: ggufFile does not have rfilename");
+}
+return std::make_pair(hf_repo, gguf_file.at("rfilename"));
+}
 #else
 struct llama_model * common_load_model_from_url(
@@ -1461,6 +1553,11 @@ struct llama_model * common_load_model_from_hf(
 return nullptr;
 }
+std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
+LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+return std::make_pair("", "");
+}
 #endif // LLAMA_USE_CURL
 //
@@ -1559,21 +1656,23 @@ std::vector<llama_token> common_tokenize(
 const std::string & text,
 bool add_special,
 bool parse_special) {
-return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+const llama_model * model = llama_get_model(ctx);
+const llama_vocab * vocab = llama_model_get_vocab(model);
+return common_tokenize(vocab, text, add_special, parse_special);
 }
 std::vector<llama_token> common_tokenize(
-const struct llama_model * model,
+const struct llama_vocab * vocab,
 const std::string & text,
 bool add_special,
 bool parse_special) {
 // upper limit for the number of tokens
 int n_tokens = text.length() + 2 * add_special;
 std::vector<llama_token> result(n_tokens);
-n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
 if (n_tokens < 0) {
 result.resize(-n_tokens);
-int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
 GGML_ASSERT(check == -n_tokens);
 } else {
 result.resize(n_tokens);
@@ -1582,12 +1681,18 @@ std::vector<llama_token> common_tokenize(
 }
 std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+const llama_model * model = llama_get_model(ctx);
+const llama_vocab * vocab = llama_model_get_vocab(model);
+return common_token_to_piece(vocab, token, special);
+}
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
 std::string piece;
 piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
-const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
 if (n_chars < 0) {
 piece.resize(-n_chars);
-int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
 GGML_ASSERT(check == -n_chars);
 }
 else {
@@ -1597,13 +1702,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
 return piece;
 }
-std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+const llama_model * model = llama_get_model(ctx);
+const llama_vocab * vocab = llama_model_get_vocab(model);
+return common_detokenize(vocab, tokens, special);
+}
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
 std::string text;
 text.resize(std::max(text.capacity(), tokens.size()));
-int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
 if (n_chars < 0) {
 text.resize(-n_chars);
-n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
 GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
 }
@@ -1618,20 +1729,13 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 //
 std::string common_get_builtin_chat_template(const struct llama_model * model) {
-static const char * template_key = "tokenizer.chat_template";
+const char * ptr_tmpl = llama_model_chat_template(model);
-// call with NULL buffer to get the total size of the string
+return ptr_tmpl == nullptr ? "" : ptr_tmpl;
-int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
-if (res > 0) {
-std::vector<char> model_template(res + 1, 0);
-llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
-return std::string(model_template.data(), model_template.size() - 1);
-}
-return "";
 }
 bool common_chat_verify_template(const std::string & tmpl) {
 llama_chat_message chat[] = {{"user", "test"}};
-int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
+const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
 return res >= 0;
 }
@@ -1642,16 +1746,16 @@ std::string common_chat_apply_template(const struct llama_model * model,
 int alloc_size = 0;
 bool fallback = false; // indicate if we must fallback to default chatml
 std::vector<llama_chat_message> chat;
-for (auto & msg : msgs) {
+for (const auto & msg : msgs) {
 chat.push_back({msg.role.c_str(), msg.content.c_str()});
 alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
 }
-const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
 std::vector<char> buf(alloc_size);
 // run the first time to get the total output length
-int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 // error: chat template is not supported
 if (res < 0) {
@@ -1659,18 +1763,17 @@ std::string common_chat_apply_template(const struct llama_model * model,
 // if the custom "tmpl" is not supported, we throw an error
 // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
 throw std::runtime_error("this custom template is not supported");
-} else {
-// If the built-in template is not supported, we default to chatml
-res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-fallback = true;
 }
+// If the built-in template is not supported, we default to chatml
+res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+fallback = true;
 }
 // if it turns out that our buffer is too small, we resize it
 if ((size_t) res > buf.size()) {
 buf.resize(res);
 res = llama_chat_apply_template(
-fallback ? nullptr : model,
 fallback ? "chatml" : ptr_tmpl,
 chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 }

common/common.h

@@ -24,11 +24,11 @@
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
-struct common_lora_adapter_info {
+struct common_adapter_lora_info {
 std::string path;
 float scale;
-struct llama_lora_adapter * ptr;
+struct llama_adapter_lora * ptr;
 };
 using llama_tokens = std::vector<llama_token>;
@@ -103,6 +103,12 @@ enum dimre_method {
 DIMRE_METHOD_MEAN,
 };
+enum common_conversation_mode {
+COMMON_CONVERSATION_MODE_DISABLED = 0,
+COMMON_CONVERSATION_MODE_ENABLED = 1,
+COMMON_CONVERSATION_MODE_AUTO = 2,
+};
 // sampling parameters
 struct common_params_sampling {
 uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -240,14 +246,13 @@ struct common_params {
 std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
 std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
 std::string logits_file = ""; // file for saving *all* logits // NOLINT
-std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 std::vector<std::string> in_files; // all input files
 std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
 std::vector<llama_model_kv_override> kv_overrides;
-bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
-std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
 std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
@@ -275,7 +280,6 @@ struct common_params {
 bool special = false; // enable special token output
 bool interactive = false; // interactive mode
 bool interactive_first = false; // wait for user input immediately
-bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
 bool prompt_cache_all = false; // save user input and generations to prompt cache
 bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
@@ -301,6 +305,8 @@ struct common_params {
 ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
 ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 // multimodal models (see examples/llava)
 std::string mmproj = ""; // path to multimodal projector // NOLINT
 std::vector<std::string> image; // path to image file(s)
@@ -454,6 +460,11 @@ static bool string_starts_with(const std::string & str,
 return str.rfind(prefix, 0) == 0;
 }
+static bool string_ends_with(const std::string & str,
+const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
+return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+}
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
@ -481,7 +492,7 @@ struct common_init_result {
llama_model_ptr model; llama_model_ptr model;
llama_context_ptr context; llama_context_ptr context;
std::vector<llama_lora_adapter_ptr> lora; std::vector<llama_adapter_lora_ptr> lora;
}; };
struct common_init_result common_init_from_params(common_params & params); struct common_init_result common_init_from_params(common_params & params);
@ -501,9 +512,12 @@ struct llama_model * common_load_model_from_hf(
const std::string & local_path, const std::string & local_path,
const std::string & hf_token, const std::string & hf_token,
const struct llama_model_params & params); const struct llama_model_params & params);
std::pair<std::string, std::string> common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & hf_token);
// clear LoRA adapters from context, then apply new list of adapters // clear LoRA adapters from context, then apply new list of adapters
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora); void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
// //
// Batch utils // Batch utils
@ -541,7 +555,7 @@ std::vector<llama_token> common_tokenize(
bool parse_special = false); bool parse_special = false);
std::vector<llama_token> common_tokenize( std::vector<llama_token> common_tokenize(
const struct llama_model * model, const struct llama_vocab * vocab,
const std::string & text, const std::string & text,
bool add_special, bool add_special,
bool parse_special = false); bool parse_special = false);
@ -553,11 +567,21 @@ std::string common_token_to_piece(
llama_token token, llama_token token,
bool special = true); bool special = true);
std::string common_token_to_piece(
const struct llama_vocab * vocab,
llama_token token,
bool special = true);
// detokenizes a vector of tokens into a string // detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode` // should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens // optionally renders special/control tokens
std::string common_detokenize( std::string common_detokenize(
llama_context * ctx, const struct llama_context * ctx,
const std::vector<llama_token> & tokens,
bool special = true);
std::string common_detokenize(
const struct llama_vocab * vocab,
const std::vector<llama_token> & tokens, const std::vector<llama_token> & tokens,
bool special = true); bool special = true);
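The new `llama_vocab`-based overloads declared above can be used roughly as follows. This is a sketch that assumes a model has already been loaded and that `common.h`/`llama.h` are included:

```cpp
// Sketch: tokenize/detokenize through the vocab instead of the model.
const llama_vocab * vocab = llama_model_get_vocab(model);

// add_special = true lets the vocab decide whether to insert BOS/EOS
std::vector<llama_token> toks = common_tokenize(vocab, "Hello, world!", /*add_special=*/true);

// round-trip back to text, rendering special tokens as well
std::string text = common_detokenize(vocab, toks, /*special=*/true);
```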


@ -113,7 +113,10 @@ struct common_sampler {
void set_logits(struct llama_context * ctx, int idx) { void set_logits(struct llama_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx); const auto * logits = llama_get_logits_ith(ctx, idx);
-        const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const int n_vocab = llama_vocab_n_tokens(vocab);
cur.resize(n_vocab); cur.resize(n_vocab);
@ -142,13 +145,15 @@ std::string common_params_sampling::print() const {
} }
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) { struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
lparams.no_perf = params.no_perf; lparams.no_perf = params.no_perf;
auto * result = new common_sampler { auto * result = new common_sampler {
/* .params = */ params, /* .params = */ params,
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"), /* .grmr = */ llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"),
/* .chain = */ llama_sampler_chain_init(lparams), /* .chain = */ llama_sampler_chain_init(lparams),
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)), /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
/* .cur = */ {}, /* .cur = */ {},
@ -157,7 +162,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain, llama_sampler_chain_add(result->chain,
llama_sampler_init_logit_bias( llama_sampler_init_logit_bias(
llama_n_vocab(model), llama_vocab_n_tokens(vocab),
params.logit_bias.size(), params.logit_bias.size(),
params.logit_bias.data())); params.logit_bias.data()));
@ -172,7 +177,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
c_breakers.push_back(str.c_str()); c_breakers.push_back(str.c_str());
} }
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
} }
break; break;
case COMMON_SAMPLER_TYPE_TOP_K: case COMMON_SAMPLER_TYPE_TOP_K:
@ -194,7 +199,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
break; break;
case COMMON_SAMPLER_TYPE_INFILL: case COMMON_SAMPLER_TYPE_INFILL:
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
break; break;
case COMMON_SAMPLER_TYPE_PENALTIES: case COMMON_SAMPLER_TYPE_PENALTIES:
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
@ -206,7 +211,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
} else if (params.mirostat == 1) { } else if (params.mirostat == 1) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
} else if (params.mirostat == 2) { } else if (params.mirostat == 2) {
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
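A compressed sketch of the pattern used throughout this file: samplers that need the tokenizer now take the `llama_vocab` instead of the model. The grammar string and the particular sampler mix below are placeholders:

```cpp
// Sketch, assuming `model` is loaded; llama.h provides all calls used here.
const llama_vocab * vocab = llama_model_get_vocab(model);

llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
llama_sampler * chain = llama_sampler_chain_init(lparams);

// grammar/infill/dry samplers are now constructed from the vocab
llama_sampler_chain_add(chain, llama_sampler_init_grammar(vocab, "root ::= [a-z]+", "root"));
llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

// ... sample with llama_sampler_sample(chain, ctx, -1), then:
llama_sampler_free(chain);
```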


@ -79,10 +79,13 @@ bool common_speculative_are_compatible(
const struct llama_model * model_tgt = llama_get_model(ctx_tgt); const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
const struct llama_model * model_dft = llama_get_model(ctx_dft); const struct llama_model * model_dft = llama_get_model(ctx_dft);
-    const bool vocab_type_tgt = llama_vocab_type(model_tgt);
+    const struct llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
+    const struct llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
+    const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
     LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
-    const bool vocab_type_dft = llama_vocab_type(model_dft);
+    const bool vocab_type_dft = llama_vocab_type(vocab_dft);
     LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) { if (vocab_type_tgt != vocab_type_dft) {
@ -91,34 +94,34 @@ bool common_speculative_are_compatible(
return false; return false;
} }
if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) || if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) || llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
llama_token_bos(model_tgt) != llama_token_bos(model_dft) || llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
llama_token_eos(model_tgt) != llama_token_eos(model_dft)) { llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__); LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt)); LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft)); LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
return false; return false;
} }
{ {
const int n_vocab_tgt = llama_n_vocab(model_tgt); const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
const int n_vocab_dft = llama_n_vocab(model_dft); const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft); const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but " LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
"target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
__func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return false; return false;
} }
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
const char * token_text_tgt = llama_token_get_text(model_tgt, i); const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
const char * token_text_dft = llama_token_get_text(model_dft, i); const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) { if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
LOG_ERR("%s: draft model vocab must match target model to use speculation but " LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but "
"token %d content differs - target '%s', draft '%s'\n", __func__, i, "token %d content differs - target '%s', draft '%s'\n", __func__, i,
common_token_to_piece(ctx_tgt, i).c_str(), common_token_to_piece(ctx_tgt, i).c_str(),
common_token_to_piece(ctx_dft, i).c_str()); common_token_to_piece(ctx_dft, i).c_str());
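Stripped of logging, the compatibility check above reduces to comparing a few vocab properties; a sketch:

```cpp
// Sketch: minimal target/draft vocab compatibility test (the size and
// per-token text checks from the code above are omitted for brevity).
static bool spec_vocabs_compatible(const llama_model * model_tgt, const llama_model * model_dft) {
    const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
    const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);

    return llama_vocab_type(vocab_tgt)        == llama_vocab_type(vocab_dft)        &&
           llama_vocab_get_add_bos(vocab_tgt) == llama_vocab_get_add_bos(vocab_dft) &&
           llama_vocab_get_add_eos(vocab_tgt) == llama_vocab_get_add_eos(vocab_dft) &&
           llama_vocab_bos(vocab_tgt)         == llama_vocab_bos(vocab_dft)         &&
           llama_vocab_eos(vocab_tgt)         == llama_vocab_eos(vocab_dft);
}
```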


@ -326,6 +326,7 @@ class Model:
gguf.MODEL_TENSOR.TIME_MIX_W2, gguf.MODEL_TENSOR.TIME_MIX_W2,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
gguf.MODEL_TENSOR.TIME_MIX_LERP_FUSED,
gguf.MODEL_TENSOR.POSNET_NORM1, gguf.MODEL_TENSOR.POSNET_NORM1,
gguf.MODEL_TENSOR.POSNET_NORM2, gguf.MODEL_TENSOR.POSNET_NORM2,
) )
@ -477,6 +478,11 @@ class Model:
return modelcls return modelcls
return func return func
@classmethod
def print_registered_models(cls):
for name in sorted(cls._model_classes.keys()):
logger.error(f"- {name}")
@classmethod @classmethod
def from_model_architecture(cls, arch: str) -> type[Model]: def from_model_architecture(cls, arch: str) -> type[Model]:
try: try:
@ -2876,6 +2882,66 @@ class InternLM2Model(Model):
return [(self.map_tensor_name(name), data_torch)] return [(self.map_tensor_name(name), data_torch)]
@Model.register("InternLM3ForCausalLM")
class InternLM3Model(Model):
model_arch = gguf.MODEL_ARCH.LLAMA
def set_vocab(self):
tokens, scores, toktypes = self._create_vocab_sentencepiece()
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
if "add_prefix_space" in tokenizer_config_json:
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
if "added_tokens_decoder" in tokenizer_config_json:
for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
if token_data.get("special"):
token_id = int(token_id)
token = token_data["content"]
special_vocab._set_special_token(token, token_id)
# update eos token
if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
special_vocab.special_token_ids["eos"] = token_id
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "head_dim" in hparams:
rope_dim = hparams["head_dim"]
else:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
return [(self.map_tensor_name(name), data_torch)]
@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") @Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
class BertModel(Model): class BertModel(Model):
model_arch = gguf.MODEL_ARCH.BERT model_arch = gguf.MODEL_ARCH.BERT
@ -3316,6 +3382,8 @@ class Rwkv6Model(Model):
# required by llama.cpp, unused # required by llama.cpp, unused
self.gguf_writer.add_head_count(0) self.gguf_writer.add_head_count(0)
lerp_weights: dict[int, dict[str, Tensor]] = {}
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
new_name = self.map_tensor_name(name) new_name = self.map_tensor_name(name)
@ -3331,14 +3399,84 @@ class Rwkv6Model(Model):
if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name:
data_torch = data_torch.squeeze() data_torch = data_torch.squeeze()
try:
rescale_every_n_layers = self.hparams["rescale_every"] rescale_every_n_layers = self.hparams["rescale_every"]
if rescale_every_n_layers > 0: if rescale_every_n_layers > 0:
if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers)) data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
except KeyError:
pass
# concat time_mix_lerp weights to reduce some cpu overhead
# also reduces the number of tensors in the model
if bid is not None and "time_mix_lerp" in new_name and "time_mix_lerp_x" not in new_name:
try:
self.lerp_weights[bid][new_name] = data_torch
except KeyError:
self.lerp_weights[bid] = {new_name: data_torch}
if all(f"blk.{bid}.time_mix_lerp_{i}.weight" in self.lerp_weights[bid].keys() for i in ["w", "k", "v", "r", "g"]):
new_name = f"blk.{bid}.time_mix_lerp_fused.weight"
data = torch.stack([self.lerp_weights[bid][f"blk.{bid}.time_mix_lerp_{i}.weight"].unsqueeze(0) for i in ["w", "k", "v", "r", "g"]], dim=0).unsqueeze(1)
yield (new_name, data)
return
yield (new_name, data_torch) yield (new_name, data_torch)
@Model.register("RWKV6Qwen2ForCausalLM")
class RWKV6Qwen2Model(Rwkv6Model):
model_arch = gguf.MODEL_ARCH.RWKV6QWEN2
def set_vocab(self):
try:
self._set_vocab_sentencepiece()
except FileNotFoundError:
self._set_vocab_gpt2()
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
num_attention_heads = self.hparams["num_attention_heads"]
num_key_value_heads = self.hparams["num_key_value_heads"]
hidden_size = self.hparams["hidden_size"]
head_size = hidden_size // num_attention_heads
rms_norm_eps = self.hparams["rms_norm_eps"]
intermediate_size = self.hparams["intermediate_size"]
time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
# RWKV isn't context limited
self.gguf_writer.add_context_length(1048576)
self.gguf_writer.add_embedding_length(hidden_size)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_wkv_head_size(head_size)
self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
self.gguf_writer.add_feed_forward_length(intermediate_size)
self.gguf_writer.add_file_type(self.ftype)
# special parameters for time_mixing in RWKV6QWEN2
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_token_shift_count(1)
# RWKV6QWEN2 use grouped key/value like GQA
self.gguf_writer.add_head_count_kv(num_key_value_heads)
# required by llama.cpp, unused
self.gguf_writer.add_head_count(0)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
for new_name, data in super().modify_tensors(data_torch, name, bid):
if "time_mix_w1" in new_name or "time_mix_w2" in new_name:
data = data.view(5, -1, data.shape[-1])
# rwkv6qwen2 has a different order of rkvwg instead of the original wkvrg
# permute them here to avoid code changes
data = torch.stack([data[3], data[1], data[2], data[0], data[4]], dim=0).view(-1, data.shape[-1])
if "w2" in new_name:
data = data.view(5, -1, data.shape[-1])
yield (new_name, data)
continue
yield (new_name, data)
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
class MambaModel(Model): class MambaModel(Model):
model_arch = gguf.MODEL_ARCH.MAMBA model_arch = gguf.MODEL_ARCH.MAMBA
@ -4856,6 +4994,7 @@ def parse_args() -> argparse.Namespace:
parser.add_argument( parser.add_argument(
"model", type=Path, "model", type=Path,
help="directory containing model file", help="directory containing model file",
nargs="?",
) )
parser.add_argument( parser.add_argument(
"--use-temp-file", action="store_true", "--use-temp-file", action="store_true",
@ -4893,8 +5032,15 @@ def parse_args() -> argparse.Namespace:
"--metadata", type=Path, "--metadata", type=Path,
help="Specify the path for an authorship metadata override file" help="Specify the path for an authorship metadata override file"
) )
parser.add_argument(
"--print-supported-models", action="store_true",
help="Print the supported models"
)
return parser.parse_args() args = parser.parse_args()
if not args.print_supported_models and args.model is None:
parser.error("the following arguments are required: model")
return args
def split_str_to_n_bytes(split_str: str) -> int: def split_str_to_n_bytes(split_str: str) -> int:
@ -4918,6 +5064,11 @@ def split_str_to_n_bytes(split_str: str) -> int:
def main() -> None: def main() -> None:
args = parse_args() args = parse_args()
if args.print_supported_models:
logger.error("Supported models:")
Model.print_registered_models()
sys.exit(0)
if args.verbose: if args.verbose:
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
else: else:


@ -50,7 +50,7 @@ int main(int argc, char ** argv) {
// ensure enough sequences are available // ensure enough sequences are available
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end()); ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_init_from_model(model, ctx_params);
if (ctx == NULL) { if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
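The renamed lifecycle entry points used here pair up as follows; a sketch with a placeholder path:

```cpp
// Sketch: load a model, create a context, tear both down with the new names.
#include "llama.h"

int run_sketch(const char * path) {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(path, mparams);   // was llama_load_model_from_file
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_model(model, cparams);       // was llama_new_context_with_model
    if (ctx == NULL) {
        llama_model_free(model);                                       // was llama_free_model
        return 1;
    }

    // ... decode / sample ...

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}
```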


@ -23,12 +23,12 @@ defer {
} }
let model_params = llama_model_default_params() let model_params = llama_model_default_params()
guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else { guard let model = llama_model_load_from_file(modelPath.cString(using: .utf8), model_params) else {
print("Failed to load model") print("Failed to load model")
exit(1) exit(1)
} }
defer { defer {
llama_free_model(model) llama_model_free(model)
} }
var tokens = tokenize(text: prompt, add_bos: true) var tokens = tokenize(text: prompt, add_bos: true)
@ -141,7 +141,7 @@ while n_cur <= n_len {
let new_token_id = llama_sampler_sample(smpl, context, i_batch[i]) let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
// is it an end of stream? -> mark the stream as finished // is it an end of stream? -> mark the stream as finished
if llama_token_is_eog(model, new_token_id) || n_cur == n_len { if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len {
i_batch[i] = -1 i_batch[i] = -1
// print("") // print("")
if n_parallel > 1 { if n_parallel > 1 {


@ -48,10 +48,12 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
const llama_vocab * vocab = llama_model_get_vocab(model);
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> tokens_list; std::vector<llama_token> tokens_list;
tokens_list = common_tokenize(model, params.prompt, true); tokens_list = common_tokenize(vocab, params.prompt, true);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel; const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
@ -62,7 +64,7 @@ int main(int argc, char ** argv) {
ctx_params.n_ctx = n_kv_req; ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_predict, n_parallel); ctx_params.n_batch = std::max(n_predict, n_parallel);
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_init_from_model(model, ctx_params);
auto sparams = llama_sampler_chain_default_params(); auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false; sparams.no_perf = false;
@ -121,7 +123,7 @@ int main(int argc, char ** argv) {
llama_token decoder_start_token_id = llama_model_decoder_start_token(model); llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == LLAMA_TOKEN_NULL) { if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
decoder_start_token_id = llama_token_bos(model); decoder_start_token_id = llama_vocab_bos(vocab);
} }
common_batch_clear(batch); common_batch_clear(batch);
@ -174,7 +176,7 @@ int main(int argc, char ** argv) {
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]); const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
// is it an end of generation? -> mark the stream as finished // is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1; i_batch[i] = -1;
LOG("\n"); LOG("\n");
if (n_parallel > 1) { if (n_parallel > 1) {
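End-of-generation checks follow the same vocab-based pattern; a sketch of the loop tail, assuming `smpl`, `ctx` and `vocab` as set up above:

```cpp
// Sketch: stop condition with llama_vocab_is_eog (was llama_token_is_eog).
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, /*idx=*/-1);

if (llama_vocab_is_eog(vocab, new_token_id)) {
    // stream finished
} else {
    printf("%s", common_token_to_piece(ctx, new_token_id).c_str());
    fflush(stdout);
}
```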


@ -911,7 +911,7 @@ int main(int argc, char ** argv) {
load_vocab(params.fn_vocab_model, &config, &vocab); load_vocab(params.fn_vocab_model, &config, &vocab);
struct my_llama_model model; struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); model.hparams.n_vocab = config.vocab_size; //llama_vocab_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx; model.hparams.n_ctx = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd; model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim; model.hparams.n_ff = config.hidden_dim;


@ -273,7 +273,9 @@ struct tokenized_prompt {
size_t max_seq_len; size_t max_seq_len;
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) { tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
-        const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const bool add_bos = llama_vocab_get_add_bos(vocab);
tokens_pos = common_tokenize(ctx, pos, add_bos, true); tokens_pos = common_tokenize(ctx, pos, add_bos, true);
tokens_neg = common_tokenize(ctx, neg, add_bos, true); tokens_neg = common_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size()); max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@ -421,8 +423,8 @@ int main(int argc, char ** argv) {
llama_context * ctx = llama_init.context.get(); llama_context * ctx = llama_init.context.get();
// int n_ctx = llama_n_ctx(ctx); // int n_ctx = llama_n_ctx(ctx);
int n_layers = llama_n_layer(model); int n_layers = llama_model_n_layer(model);
int n_embd = llama_n_embd(model); int n_embd = llama_model_n_embd(model);
// get model hint param (a.k.a model arch name) // get model hint param (a.k.a model arch name)
char model_hint[128]; char model_hint[128];


@ -105,7 +105,9 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
-    const int n_ctx_train = llama_n_ctx_train(model);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
@ -148,7 +150,7 @@ int main(int argc, char ** argv) {
// check if the last token is SEP // check if the last token is SEP
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
for (auto & inp : inputs) { for (auto & inp : inputs) {
if (inp.empty() || inp.back() != llama_token_sep(model)) { if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
LOG_WRN("%s: last token in the prompt is not SEP\n", __func__); LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
} }
@ -181,7 +183,7 @@ int main(int argc, char ** argv) {
} }
// allocate output // allocate output
const int n_embd = llama_n_embd(model); const int n_embd = llama_model_n_embd(model);
std::vector<float> embeddings(n_embd_count * n_embd, 0); std::vector<float> embeddings(n_embd_count * n_embd, 0);
float * emb = embeddings.data(); float * emb = embeddings.data();


@ -127,7 +127,10 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
} }
static bool run(llama_context * ctx, const common_params & params) { static bool run(llama_context * ctx, const common_params & params) {
-    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos); std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);


@ -8,7 +8,6 @@
#include <map> #include <map>
#include <vector> #include <vector>
#include <string> #include <string>
#include <thread>
#include <fstream> #include <fstream>
static bool g_verbose = false; static bool g_verbose = false;
@ -130,7 +129,7 @@ struct lora_merge_ctx {
lora_merge_ctx( lora_merge_ctx(
std::string & base_fname, std::string & base_fname,
std::vector<common_lora_adapter_info> & lora_files, std::vector<common_adapter_lora_info> & lora_files,
std::string & outfile, std::string & outfile,
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
fout.exceptions(std::ofstream::failbit); // fail fast on write errors fout.exceptions(std::ofstream::failbit); // fail fast on write errors


@ -41,7 +41,7 @@ echo PASS
echo echo
# 2b. Test the sharded model is loading properly # 2b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32 $MAIN -no-cnv --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
echo PASS echo PASS
echo echo
@ -51,7 +51,7 @@ echo PASS
echo echo
# 3b. Test the merged model is loading properly # 3b. Test the merged model is loading properly
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32 $MAIN -no-cnv --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
echo PASS echo PASS
echo echo
@ -61,7 +61,7 @@ echo PASS
echo echo
# 4b. Test the sharded model is loading properly # 4b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32 $MAIN -no-cnv --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
echo PASS echo PASS
echo echo
@ -71,7 +71,7 @@ echo
#echo #echo
# 5b. Test the merged model is loading properly # 5b. Test the merged model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32 #$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#echo PASS #echo PASS
#echo #echo
@ -81,7 +81,7 @@ echo PASS
echo echo
# 6b. Test the sharded model is loading properly # 6b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32 $MAIN -no-cnv --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
echo PASS echo PASS
echo echo


@ -11,6 +11,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
std::vector<std::vector<float>> result; std::vector<std::vector<float>> result;
const llama_model * model = llama_get_model(ctx); const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
@ -19,16 +20,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
const std::string input_string = instruction + sentences[i]; const std::string input_string = instruction + sentences[i];
std::vector<llama_token> inputs = common_tokenize(model, input_string, true, false); std::vector<llama_token> inputs = common_tokenize(vocab, input_string, true, false);
const int32_t n_toks = inputs.size(); const int32_t n_toks = inputs.size();
// GritLM seems to have EOS = "" // GritLM seems to have EOS = ""
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18 // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
// inputs.push_back(llama_token_eos(model)); // inputs.push_back(llama_vocab_eos(vocab));
// we want to ignore instruction tokens for mean pooling // we want to ignore instruction tokens for mean pooling
const int32_t n_inst = common_tokenize(model, instruction, true, false).size(); const int32_t n_inst = common_tokenize(vocab, instruction, true, false).size();
#ifdef GRIT_DEBUG #ifdef GRIT_DEBUG
// debug tokens - should be matching as referenced in the GritLM sample // debug tokens - should be matching as referenced in the GritLM sample
@ -52,7 +53,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
llama_decode(ctx, batch); llama_decode(ctx, batch);
// get embedding dimensions // get embedding dimensions
uint64_t n_embd = llama_n_embd(model); uint64_t n_embd = llama_model_n_embd(model);
// allocate embedding output // allocate embedding output
std::vector<float> emb_unorm(n_embd, 0.0f); std::vector<float> emb_unorm(n_embd, 0.0f);
@ -97,7 +98,9 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
std::string result; std::string result;
const llama_model * model = llama_get_model(ctx); const llama_model * model = llama_get_model(ctx);
-    llama_token eos_token = llama_token_eos(model);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    llama_token eos_token = llama_vocab_eos(vocab);
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
llama_set_embeddings(ctx, false); llama_set_embeddings(ctx, false);
@ -105,7 +108,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
std::vector<llama_token> inputs = common_tokenize(model, prompt, false, true); std::vector<llama_token> inputs = common_tokenize(vocab, prompt, false, true);
int32_t i_current_token = 0; int32_t i_current_token = 0;
while (true) { while (true) {
@ -168,7 +171,7 @@ int main(int argc, char * argv[]) {
llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams); llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
// create generation context // create generation context
llama_context * ctx = llama_new_context_with_model(model, cparams); llama_context * ctx = llama_init_from_model(model, cparams);
auto sparams = llama_sampler_chain_default_params(); auto sparams = llama_sampler_chain_default_params();
@ -197,7 +200,7 @@ int main(int argc, char * argv[]) {
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction("")); const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction)); const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
const int n_embd = llama_n_embd(model); const int n_embd = llama_model_n_embd(model);
const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
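Embedding dimensions now come from the model-level getter; a short sketch of the similarity computation above, assuming two pooled embeddings `q` and `d` of that size:

```cpp
// Sketch: cosine similarity between two pooled embeddings.
const int n_embd = llama_model_n_embd(model);   // was llama_n_embd(model)

const float sim = common_embd_similarity_cos(q.data(), d.data(), n_embd);
printf("cosine similarity: %.3f\n", sim);
```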


@ -7,7 +7,6 @@
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <ctime> #include <ctime>
#include <sstream>
#include <thread> #include <thread>
#include <mutex> #include <mutex>
#include <vector> #include <vector>
@ -40,7 +39,7 @@ public:
void set_params(common_params params) { m_params = std::move(params); } void set_params(common_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix(int ncall = -1) const; void save_imatrix(int ncall = -1) const;
bool load_imatrix(const char * file_name); bool load_imatrix(const char * fname);
private: private:
std::unordered_map<std::string, Stats> m_stats; std::unordered_map<std::string, Stats> m_stats;
common_params m_params; common_params m_params;
@ -429,10 +428,13 @@ static void process_logits(
} }
static bool compute_imatrix(llama_context * ctx, const common_params & params) { static bool compute_imatrix(llama_context * ctx, const common_params & params) {
-    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
     const int n_ctx = llama_n_ctx(ctx);
-    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
+    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
auto tim1 = std::chrono::high_resolution_clock::now(); auto tim1 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenizing the input ..\n", __func__); LOG_INF("%s: tokenizing the input ..\n", __func__);
@ -468,7 +470,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const int n_chunk_max = tokens.size() / n_ctx; const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_vocab = llama_vocab_n_tokens(vocab);
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
int count = 0; int count = 0;
@ -508,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
// add BOS token for the first batch of each chunk // add BOS token for the first batch of each chunk
if (add_bos && j == 0) { if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); tokens[batch_start] = llama_vocab_bos(vocab);
} }
common_batch_clear(batch); common_batch_clear(batch);
@ -627,7 +629,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx_train = llama_model_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) { if (params.n_ctx > n_ctx_train) {
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx); __func__, n_ctx_train, params.n_ctx);
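The accessor chain that recurs throughout these updates, collected in one place (sketch; `ctx` is any initialized context):

```cpp
// Sketch: context -> model -> vocab, with the old names noted for comparison.
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);

const bool add_bos     = llama_vocab_get_add_bos(vocab);   // was llama_add_bos_token(model)
const int  n_vocab     = llama_vocab_n_tokens(vocab);      // was llama_n_vocab(model)
const int  n_ctx_train = llama_model_n_ctx_train(model);   // was llama_n_ctx_train(model)
```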


@ -139,7 +139,9 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
-    const int n_ctx_train = llama_n_ctx_train(model);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
LOG_DBG("n_ctx: %d\n", n_ctx); LOG_DBG("n_ctx: %d\n", n_ctx);
@ -152,28 +154,28 @@ int main(int argc, char ** argv) {
LOG_INF("\n"); LOG_INF("\n");
LOG_INF("%s\n", common_params_get_system_info(params).c_str()); LOG_INF("%s\n", common_params_get_system_info(params).c_str());
} }
const bool add_bos = llama_add_bos_token(model); const bool add_bos = llama_vocab_get_add_bos(vocab);
GGML_ASSERT(!llama_add_eos_token(model)); GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end; std::vector<llama_token> embd_end;
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false); std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false); std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
GGML_ASSERT(llama_token_fim_pre(model) >= 0); GGML_ASSERT(llama_vocab_fim_pre(vocab) >= 0);
GGML_ASSERT(llama_token_fim_suf(model) >= 0); GGML_ASSERT(llama_vocab_fim_suf(vocab) >= 0);
inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) { if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
} }
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
const llama_token middle_token = llama_token_fim_mid(model); const llama_token middle_token = llama_vocab_fim_mid(vocab);
if (middle_token >= 0) { if (middle_token >= 0) {
embd_inp.push_back(middle_token); embd_inp.push_back(middle_token);
} }
@ -185,7 +187,7 @@ int main(int argc, char ** argv) {
// Should not run without any tokens // Should not run without any tokens
if (embd_inp.empty()) { if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(model)); embd_inp.push_back(llama_vocab_bos(vocab));
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} }
@ -420,10 +422,10 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs; // if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) { if ((int) embd_inp.size() <= n_consumed) {
// deal with eot token in infill mode // deal with eot token in infill mode
if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ if ((common_sampler_last(smpl) == llama_vocab_eot(vocab) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) { if (is_interacting && !params.interactive_first) {
// print an eot token // print an eot token
LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
} }
LOG("\n"); LOG("\n");
console::set_display(console::user_input); console::set_display(console::user_input);
@ -463,13 +465,13 @@ int main(int argc, char ** argv) {
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false); std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false); std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model)); inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model)); inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) { if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
} }
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
@ -484,7 +486,7 @@ int main(int argc, char ** argv) {
is_interacting = false; is_interacting = false;
} }
// deal with end of generation tokens in interactive mode // deal with end of generation tokens in interactive mode
else if (llama_token_is_eog(model, common_sampler_last(smpl))) { else if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
LOG_DBG("found EOS token\n"); LOG_DBG("found EOS token\n");
if (params.interactive) { if (params.interactive) {
@ -500,7 +502,7 @@ int main(int argc, char ** argv) {
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
LOG_DBG("adding input prefix BOS token\n"); LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model)); embd_inp.push_back(llama_vocab_bos(vocab));
} }
std::string buffer; std::string buffer;
@ -563,7 +565,7 @@ int main(int argc, char ** argv) {
} }
// end of generation // end of generation
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) { if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !params.interactive) {
break; break;
} }
@ -575,7 +577,7 @@ int main(int argc, char ** argv) {
} }
} }
if (!params.interactive && n_remain <= 0) { if (!params.interactive && n_remain <= 0) {
LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str()); LOG("%s", common_token_to_piece(ctx, llama_vocab_eot(vocab)).c_str());
} }
LOG("\n"); LOG("\n");
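A condensed sketch of the infill prompt assembly above, using the vocab-level FIM tokens. It ignores the `spm_infill` ordering for brevity; `prefix` and `suffix` are placeholder strings and `ctx`/`vocab` come from the surrounding code:

```cpp
// Sketch: build <FIM_PRE>prefix<FIM_SUF>suffix<FIM_MID> for fill-in-the-middle.
std::vector<llama_token> inp_pfx = common_tokenize(ctx, prefix, /*add_special=*/false);
std::vector<llama_token> inp_sfx = common_tokenize(ctx, suffix, /*add_special=*/false);

inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab));
inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab));

std::vector<llama_token> embd_inp = inp_pfx;
if (llama_vocab_get_add_bos(vocab)) {
    embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
}
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());

const llama_token middle_token = llama_vocab_fim_mid(vocab);
if (middle_token >= 0) {
    embd_inp.push_back(middle_token);  // some models ship no explicit MID token
}
```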


@ -683,7 +683,7 @@ struct cmd_params_instance {
bool cpu_strict; bool cpu_strict;
int poll; int poll;
int n_gpu_layers; int n_gpu_layers;
std::string rpc_servers; std::string rpc_servers_str;
llama_split_mode split_mode; llama_split_mode split_mode;
int main_gpu; int main_gpu;
bool no_kv_offload; bool no_kv_offload;
@ -696,8 +696,37 @@ struct cmd_params_instance {
llama_model_params mparams = llama_model_default_params(); llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = n_gpu_layers; mparams.n_gpu_layers = n_gpu_layers;
+        if (!rpc_servers_str.empty()) {
+            auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
+            // add RPC devices
             if (!rpc_servers.empty()) {
-                mparams.rpc_servers = rpc_servers.c_str();
+                ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+                if (!rpc_reg) {
+                    fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
+                    exit(1);
+                }
+                typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+                ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+                if (!ggml_backend_rpc_add_device_fn) {
+                    fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
+                    exit(1);
+                }
+                static std::vector<ggml_backend_dev_t> devices;
+                devices.clear();
+                for (const std::string & server : rpc_servers) {
+                    ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+                    if (dev) {
+                        devices.push_back(dev);
+                    } else {
+                        fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+                        exit(1);
+                    }
+                }
+                devices.push_back(nullptr);
+                mparams.devices = devices.data();
+            }
+        }
     }
mparams.split_mode = split_mode; mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu; mparams.main_gpu = main_gpu;
@ -708,7 +737,7 @@ struct cmd_params_instance {
} }
bool equal_mparams(const cmd_params_instance & other) const { bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers && return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap && split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
tensor_split == other.tensor_split; tensor_split == other.tensor_split;
} }
@ -1401,7 +1430,8 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
llama_set_n_threads(ctx, n_threads, n_threads); llama_set_n_threads(ctx, n_threads, n_threads);
const llama_model * model = llama_get_model(ctx); const llama_model * model = llama_get_model(ctx);
-    const int32_t n_vocab = llama_n_vocab(model);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    const int32_t n_vocab = llama_vocab_n_tokens(vocab);
std::vector<llama_token> tokens(n_batch); std::vector<llama_token> tokens(n_batch);
@ -1409,7 +1439,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
while (n_processed < n_prompt) { while (n_processed < n_prompt) {
int n_tokens = std::min(n_prompt - n_processed, n_batch); int n_tokens = std::min(n_prompt - n_processed, n_batch);
tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; tokens[0] = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
for (int i = 1; i < n_tokens; i++) { for (int i = 1; i < n_tokens; i++) {
tokens[i] = std::rand() % n_vocab; tokens[i] = std::rand() % n_vocab;
} }
@ -1424,9 +1454,10 @@ static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
llama_set_n_threads(ctx, n_threads, n_threads); llama_set_n_threads(ctx, n_threads, n_threads);
const llama_model * model = llama_get_model(ctx); const llama_model * model = llama_get_model(ctx);
-    const int32_t n_vocab = llama_n_vocab(model);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    const int32_t n_vocab = llama_vocab_n_tokens(vocab);
-    llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
+    llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
for (int i = 0; i < n_gen; i++) { for (int i = 0; i < n_gen; i++) {
llama_decode(ctx, llama_batch_get_one(&token, 1)); llama_decode(ctx, llama_batch_get_one(&token, 1));
@ -1537,7 +1568,7 @@ int main(int argc, char ** argv) {
prev_inst = &inst; prev_inst = &inst;
} }
llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams()); llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
if (ctx == NULL) { if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
llama_model_free(lmodel); llama_model_free(lmodel);
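The RPC wiring above replaces the old comma-separated `mparams.rpc_servers` string with explicit device registration through the backend registry; a trimmed sketch with a placeholder endpoint:

```cpp
// Sketch: look up the RPC backend and register a single device by endpoint.
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
if (rpc_reg != nullptr) {
    typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
    auto add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
    if (add_device_fn != nullptr) {
        static std::vector<ggml_backend_dev_t> devices;
        ggml_backend_dev_t dev = add_device_fn("127.0.0.1:50052");  // placeholder endpoint
        if (dev != nullptr) {
            devices.push_back(dev);
            devices.push_back(nullptr);        // the device list is null-terminated
            mparams.devices = devices.data();  // assumes an mparams in scope, as above
        }
    }
}
```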


@ -87,7 +87,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi
auto path_to_model = env->GetStringUTFChars(filename, 0); auto path_to_model = env->GetStringUTFChars(filename, 0);
LOGi("Loading model from %s", path_to_model); LOGi("Loading model from %s", path_to_model);
auto model = llama_load_model_from_file(path_to_model, model_params); auto model = llama_model_load_from_file(path_to_model, model_params);
env->ReleaseStringUTFChars(filename, path_to_model); env->ReleaseStringUTFChars(filename, path_to_model);
if (!model) { if (!model) {
@ -102,7 +102,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi
extern "C" extern "C"
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) { Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
llama_free_model(reinterpret_cast<llama_model *>(model)); llama_model_free(reinterpret_cast<llama_model *>(model));
} }
extern "C" extern "C"
@ -347,6 +347,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
jlong context_pointer, jlong context_pointer,
jlong batch_pointer, jlong batch_pointer,
jstring jtext, jstring jtext,
jboolean format_chat,
jint n_len jint n_len
) { ) {
@ -356,7 +357,8 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
const auto context = reinterpret_cast<llama_context *>(context_pointer); const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer); const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
-    const auto tokens_list = common_tokenize(context, text, 1);
+    bool parse_special = (format_chat == JNI_TRUE);
+    const auto tokens_list = common_tokenize(context, text, true, parse_special);
auto n_ctx = llama_n_ctx(context); auto n_ctx = llama_n_ctx(context);
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
@ -368,7 +370,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
} }
for (auto id : tokens_list) { for (auto id : tokens_list) {
LOGi("%s", common_token_to_piece(context, id).c_str()); LOGi("token: `%s`-> %d ", common_token_to_piece(context, id).c_str(), id);
} }
common_batch_clear(*batch); common_batch_clear(*batch);
@ -405,6 +407,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer); const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer); const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
const auto model = llama_get_model(context); const auto model = llama_get_model(context);
const auto vocab = llama_model_get_vocab(model);
if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur); if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I"); if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
@ -414,7 +417,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
const auto new_token_id = llama_sampler_sample(sampler, context, -1); const auto new_token_id = llama_sampler_sample(sampler, context, -1);
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) {
return nullptr; return nullptr;
} }
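The JNI hunks above rename the model lifecycle calls (`llama_model_load_from_file`, `llama_model_free`) and route the end-of-generation check through the vocab. A hedged sketch of the same pattern in plain C++, without the JNI plumbing; `llama_model_default_params` is assumed from the public API and does not appear in these hunks:

```cpp
#include "llama.h"

// sketch: load/free via the llama_model_* names, EOG checked on the vocab
static llama_model * load_model(const char * path_to_model) {
    llama_model_params model_params = llama_model_default_params(); // assumed default-params helper
    return llama_model_load_from_file(path_to_model, model_params); // was llama_load_model_from_file
}

static void free_model(llama_model * model) {
    llama_model_free(model);                                        // was llama_free_model
}

static bool is_end_of_generation(llama_context * ctx, llama_token id) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);
    return llama_vocab_is_eog(vocab, id);                           // was llama_token_is_eog(model, id)
}
```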

@ -65,6 +65,7 @@ class LLamaAndroid {
context: Long, context: Long,
batch: Long, batch: Long,
text: String, text: String,
formatChat: Boolean,
nLen: Int nLen: Int
): Int ): Int
@ -115,10 +116,10 @@ class LLamaAndroid {
} }
} }
fun send(message: String): Flow<String> = flow { fun send(message: String, formatChat: Boolean = false): Flow<String> = flow {
when (val state = threadLocalState.get()) { when (val state = threadLocalState.get()) {
is State.Loaded -> { is State.Loaded -> {
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen)) val ncur = IntVar(completion_init(state.context, state.batch, message, formatChat, nlen))
while (ncur.value <= nlen) { while (ncur.value <= nlen) {
val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur) val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
if (str == null) { if (str == null) {

@ -52,8 +52,8 @@ actor LlamaContext {
deinit { deinit {
llama_sampler_free(sampling) llama_sampler_free(sampling)
llama_batch_free(batch) llama_batch_free(batch)
llama_model_free(model)
llama_free(context) llama_free(context)
llama_free_model(model)
llama_backend_free() llama_backend_free()
} }
@ -65,7 +65,7 @@ actor LlamaContext {
model_params.n_gpu_layers = 0 model_params.n_gpu_layers = 0
print("Running on simulator, force use n_gpu_layers = 0") print("Running on simulator, force use n_gpu_layers = 0")
#endif #endif
let model = llama_load_model_from_file(path, model_params) let model = llama_model_load_from_file(path, model_params)
guard let model else { guard let model else {
print("Could not load model at \(path)") print("Could not load model at \(path)")
throw LlamaError.couldNotInitializeContext throw LlamaError.couldNotInitializeContext
@ -151,7 +151,7 @@ actor LlamaContext {
new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1) new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
if llama_token_is_eog(model, new_token_id) || n_cur == n_len { if llama_vocab_is_eog(model, new_token_id) || n_cur == n_len {
print("\n") print("\n")
is_done = true is_done = true
let new_token_str = String(cString: temporary_invalid_cchars + [0]) let new_token_str = String(cString: temporary_invalid_cchars + [0])

@ -47,8 +47,12 @@ static const char * sample(struct common_sampler * smpl,
int * n_past) { int * n_past) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true); common_sampler_accept(smpl, id, true);
const llama_model * model = llama_get_model(ctx_llama);
const llama_vocab * vocab = llama_model_get_vocab(model);
static std::string ret; static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { if (llama_vocab_is_eog(vocab, id)) {
ret = "</s>"; ret = "</s>";
} else { } else {
ret = common_token_to_piece(ctx_llama, id); ret = common_token_to_piece(ctx_llama, id);
@ -239,11 +243,10 @@ static struct llava_context * llava_init_context(common_params * params, llama_m
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = common_context_params_to_llama(*params); llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
if (ctx_llama == NULL) { if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__); LOG_ERR("%s: failed to create the llama_context\n" , __func__);

@ -384,7 +384,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) { bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
// make sure that the correct mmproj was used, i.e., compare apples to apples // make sure that the correct mmproj was used, i.e., compare apples to apples
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip); auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) { if (n_image_embd != n_llama_embd) {
LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
@ -456,7 +456,7 @@ struct llava_embd_batch {
}; };
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) { bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama)); int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
for (int i = 0; i < image_embed->n_image_pos; i += n_batch) { for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
int n_eval = image_embed->n_image_pos - i; int n_eval = image_embed->n_image_pos - i;
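The llava hunks above query the embedding dimension with `llama_model_n_embd` and compare it against the CLIP projector output. A small sketch of that compatibility check; the `clip.h` header and `clip_n_mmproj_embd` are taken from the hunk itself and treated as assumptions here:

```cpp
#include "llama.h"
#include "clip.h" // assumed header providing clip_ctx / clip_n_mmproj_embd, as used above

// sketch: the mmproj output dimension must match the language model's embedding size
static bool embed_sizes_match(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
    const int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama)); // was llama_n_embd(...)
    const int n_image_embd = clip_n_mmproj_embd(ctx_clip);
    return n_image_embd == n_llama_embd;
}
```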

@ -54,7 +54,7 @@ static struct llava_context * llava_init_context(common_params * params, llama_m
ctx_params.n_ctx = params->n_ctx; ctx_params.n_ctx = params->n_ctx;
} }
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
if (ctx_llama == NULL) { if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__); LOG_ERR("%s: failed to create the llama_context\n" , __func__);
@ -167,8 +167,12 @@ static const char * sample(struct common_sampler * smpl,
int * n_past) { int * n_past) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true); common_sampler_accept(smpl, id, true);
const llama_model * model = llama_get_model(ctx_llama);
const llama_vocab * vocab = llama_model_get_vocab(model);
static std::string ret; static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { if (llama_vocab_is_eog(vocab, id)) {
ret = "</s>"; ret = "</s>";
} else { } else {
ret = common_token_to_piece(ctx_llama, id); ret = common_token_to_piece(ctx_llama, id);

@ -27,7 +27,7 @@
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) { int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama)); int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
const int patch_size = 14 * 2; const int patch_size = 14 * 2;
const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0); const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0); const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
@ -132,8 +132,12 @@ static const char * sample(struct common_sampler * smpl,
int * n_past, int * st_pos_id) { int * n_past, int * st_pos_id) {
const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
common_sampler_accept(smpl, id, true); common_sampler_accept(smpl, id, true);
const llama_model * model = llama_get_model(ctx_llama);
const llama_vocab * vocab = llama_model_get_vocab(model);
static std::string ret; static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { if (llama_vocab_is_eog(vocab, id)) {
ret = "</s>"; ret = "</s>";
} else { } else {
ret = common_token_to_piece(ctx_llama, id); ret = common_token_to_piece(ctx_llama, id);
@ -328,11 +332,10 @@ static struct llava_context * llava_init_context(common_params * params, llama_m
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
llama_context_params ctx_params = common_context_params_to_llama(*params); llama_context_params ctx_params = common_context_params_to_llama(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
if (ctx_llama == NULL) { if (ctx_llama == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__); LOG_ERR("%s: failed to create the llama_context\n" , __func__);
@ -481,7 +484,7 @@ static void debug_test_mrope_2d() {
} }
static void debug_dump_img_embed(struct llava_context * ctx_llava) { static void debug_dump_img_embed(struct llava_context * ctx_llava) {
int n_embd = llama_n_embd(llama_get_model(ctx_llava->ctx_llama)); int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
int ne = n_embd * 4; int ne = n_embd * 4;
float vals[56 * 56 * 3]; float vals[56 * 56 * 3];
// float embd[ne]; // float embd[ne];
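The qwen2vl hunk above derives the patch grid from the image size using a merged patch size of 28 (14 * 2) and ceiling division. A short sketch of that arithmetic; for example, a 448x336 image yields ph = 16 and pw = 12:

```cpp
// sketch of the patch-grid computation used above: ceiling division by the merged patch size
static void patch_grid(int height, int width, int & ph, int & pw) {
    const int patch_size = 14 * 2;                            // 28 pixels per merged patch
    ph = height / patch_size + (height % patch_size > 0);     // e.g. 448 / 28 = 16, no remainder
    pw = width  / patch_size + (width  % patch_size > 0);     // e.g. 336 / 28 = 12, no remainder
}
```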

@ -61,6 +61,8 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model.get(); llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get(); llama_context * ctx = llama_init.context.get();
const llama_vocab * vocab = llama_model_get_vocab(model);
// Tokenize the prompt // Tokenize the prompt
std::vector<llama_token> inp; std::vector<llama_token> inp;
std::vector<llama_token> all; std::vector<llama_token> all;
@ -147,7 +149,7 @@ int main(int argc, char ** argv) {
} }
// here we keep adding new n-grams as we go // here we keep adding new n-grams as we go
ngram_container ngrams_observed(llama_n_vocab(model), N, G); ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);
// debug // debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1); struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
@ -297,7 +299,7 @@ int main(int argc, char ** argv) {
} }
fflush(stdout); fflush(stdout);
if (llama_token_is_eog(model, id)) { if (llama_vocab_is_eog(vocab, id)) {
has_eos = true; has_eos = true;
} }

@ -36,6 +36,8 @@ int main(int argc, char ** argv){
llama_model * model = llama_init.model.get(); llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get(); llama_context * ctx = llama_init.context.get();
const llama_vocab * vocab = llama_model_get_vocab(model);
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> inp; std::vector<llama_token> inp;
inp = common_tokenize(ctx, params.prompt, true, true); inp = common_tokenize(ctx, params.prompt, true, true);
@ -136,7 +138,7 @@ int main(int argc, char ** argv){
LOG("%s", token_str.c_str()); LOG("%s", token_str.c_str());
} }
if (llama_token_is_eog(model, id)) { if (llama_vocab_is_eog(vocab, id)) {
has_eos = true; has_eos = true;
} }

@ -5,7 +5,6 @@
#include "sampling.h" #include "sampling.h"
#include "llama.h" #include "llama.h"
#include <cassert>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <ctime> #include <ctime>
@ -31,6 +30,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";
static llama_context ** g_ctx; static llama_context ** g_ctx;
static llama_model ** g_model; static llama_model ** g_model;
static common_sampler ** g_smpl; static common_sampler ** g_smpl;
@ -163,6 +164,8 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
const llama_vocab * vocab = llama_model_get_vocab(model);
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
@ -196,15 +199,31 @@ int main(int argc, char ** argv) {
llama_attach_threadpool(ctx, threadpool, threadpool_batch); llama_attach_threadpool(ctx, threadpool, threadpool_batch);
const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
if (n_ctx > n_ctx_train) { if (n_ctx > n_ctx_train) {
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
} }
// auto enable conversation mode if chat template is available
const bool has_chat_template = !common_get_builtin_chat_template(model).empty() || !params.chat_template.empty();
if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
if (has_chat_template) {
LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
} else {
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
}
}
// in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning
if (params.conversation_mode && !has_chat_template) {
LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__);
}
// print chat template example in conversation mode // print chat template example in conversation mode
if (params.conversation) { if (params.conversation_mode) {
if (params.enable_chat_template) { if (params.enable_chat_template) {
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str()); LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
} else { } else {
@ -241,9 +260,9 @@ int main(int argc, char ** argv) {
} }
} }
const bool add_bos = llama_add_bos_token(model); const bool add_bos = llama_vocab_get_add_bos(vocab);
if (!llama_model_has_encoder(model)) { if (!llama_model_has_encoder(model)) {
GGML_ASSERT(!llama_add_eos_token(model)); GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
} }
LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos); LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
@ -251,8 +270,10 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
{ {
auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty()) auto prompt = (params.conversation_mode && params.enable_chat_template)
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode // format the system prompt in conversation mode (fallback to default if empty)
? chat_add_and_format(model, chat_msgs, "system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
// otherwise use the prompt as is
: params.prompt; : params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG_DBG("tokenize the prompt\n"); LOG_DBG("tokenize the prompt\n");
@ -269,7 +290,7 @@ int main(int argc, char ** argv) {
// Should not run without any tokens // Should not run without any tokens
if (embd_inp.empty()) { if (embd_inp.empty()) {
if (add_bos) { if (add_bos) {
embd_inp.push_back(llama_token_bos(model)); embd_inp.push_back(llama_vocab_bos(vocab));
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
} else { } else {
LOG_ERR("input is empty\n"); LOG_ERR("input is empty\n");
@ -326,7 +347,7 @@ int main(int argc, char ** argv) {
params.n_keep += add_bos; // always keep the BOS token params.n_keep += add_bos; // always keep the BOS token
} }
if (params.conversation) { if (params.conversation_mode) {
params.interactive_first = true; params.interactive_first = true;
} }
@ -450,7 +471,11 @@ int main(int argc, char ** argv) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_INF( " - Press Ctrl+C to interject at any time.\n"); LOG_INF( " - Press Ctrl+C to interject at any time.\n");
#endif #endif
LOG_INF( "%s\n", control_message); LOG_INF( "%s", control_message);
if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
LOG_INF( " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
}
LOG_INF("\n");
is_interacting = params.interactive_first; is_interacting = params.interactive_first;
} }
@ -495,7 +520,7 @@ int main(int argc, char ** argv) {
llama_token decoder_start_token_id = llama_model_decoder_start_token(model); llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == LLAMA_TOKEN_NULL) { if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
decoder_start_token_id = llama_token_bos(model); decoder_start_token_id = llama_vocab_bos(vocab);
} }
embd_inp.clear(); embd_inp.clear();
@ -742,7 +767,7 @@ int main(int argc, char ** argv) {
} }
// deal with end of generation tokens in interactive mode // deal with end of generation tokens in interactive mode
if (llama_token_is_eog(model, common_sampler_last(smpl))) { if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
LOG_DBG("found an EOG token\n"); LOG_DBG("found an EOG token\n");
if (params.interactive) { if (params.interactive) {
@ -762,7 +787,7 @@ int main(int argc, char ** argv) {
} }
// if current token is not EOG, we add it to current assistant message // if current token is not EOG, we add it to current assistant message
if (params.conversation) { if (params.conversation_mode) {
const auto id = common_sampler_last(smpl); const auto id = common_sampler_last(smpl);
assistant_ss << common_token_to_piece(ctx, id, false); assistant_ss << common_token_to_piece(ctx, id, false);
} }
@ -770,17 +795,17 @@ int main(int argc, char ** argv) {
if (n_past > 0 && is_interacting) { if (n_past > 0 && is_interacting) {
LOG_DBG("waiting for user input\n"); LOG_DBG("waiting for user input\n");
if (params.conversation) { if (params.conversation_mode) {
LOG("\n> "); LOG("\n> ");
} }
if (params.input_prefix_bos) { if (params.input_prefix_bos) {
LOG_DBG("adding input prefix BOS token\n"); LOG_DBG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(model)); embd_inp.push_back(llama_vocab_bos(vocab));
} }
std::string buffer; std::string buffer;
if (!params.input_prefix.empty() && !params.conversation) { if (!params.input_prefix.empty() && !params.conversation_mode) {
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
LOG("%s", params.input_prefix.c_str()); LOG("%s", params.input_prefix.c_str());
} }
@ -804,7 +829,7 @@ int main(int argc, char ** argv) {
// Entering a empty line lets the user pass control back // Entering a empty line lets the user pass control back
if (buffer.length() > 1) { if (buffer.length() > 1) {
// append input suffix if any // append input suffix if any
if (!params.input_suffix.empty() && !params.conversation) { if (!params.input_suffix.empty() && !params.conversation_mode) {
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
LOG("%s", params.input_suffix.c_str()); LOG("%s", params.input_suffix.c_str());
} }
@ -817,7 +842,7 @@ int main(int argc, char ** argv) {
string_process_escapes(buffer); string_process_escapes(buffer);
} }
bool format_chat = params.conversation && params.enable_chat_template; bool format_chat = params.conversation_mode && params.enable_chat_template;
std::string user_inp = format_chat std::string user_inp = format_chat
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer)) ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
: std::move(buffer); : std::move(buffer);
@ -830,8 +855,8 @@ int main(int argc, char ** argv) {
// if user stop generation mid-way, we must add EOT to finish model's last response // if user stop generation mid-way, we must add EOT to finish model's last response
if (need_insert_eot && format_chat) { if (need_insert_eot && format_chat) {
llama_token eot = llama_token_eot(model); llama_token eot = llama_vocab_eot(vocab);
embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_token_eos(model) : eot); embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot);
need_insert_eot = false; need_insert_eot = false;
} }
@ -866,7 +891,7 @@ int main(int argc, char ** argv) {
} }
// end of generation // end of generation
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) {
LOG(" [end of text]\n"); LOG(" [end of text]\n");
break; break;
} }
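The main.cpp hunks above introduce `DEFAULT_SYSTEM_MESSAGE` and auto-enable conversation mode only when a chat template is available. A condensed sketch of that decision, with the enum values and `common_get_builtin_chat_template` copied from the hunks (their definitions in common.h are assumed):

```cpp
#include "common.h" // assumed to provide common_params and common_get_builtin_chat_template
#include "llama.h"

static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";

// sketch: resolve COMMON_CONVERSATION_MODE_AUTO based on chat-template availability
static void resolve_conversation_mode(common_params & params, const llama_model * model) {
    const bool has_chat_template =
        !common_get_builtin_chat_template(model).empty() || !params.chat_template.empty();

    if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
        params.conversation_mode = has_chat_template
            ? COMMON_CONVERSATION_MODE_ENABLED   // template found: chat mode on (disable with -no-cnv)
            : COMMON_CONVERSATION_MODE_DISABLED; // no template: plain completion
    }

    if (params.conversation_mode && !has_chat_template) {
        // forced on via -cnv without a usable template: the model may produce suboptimal responses,
        // so the hunk above emits a warning at this point
    }
}
```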

@ -135,6 +135,8 @@ int main(int argc, char ** argv) {
llama_model * model = llama_init.model.get(); llama_model * model = llama_init.model.get();
llama_context * ctx = llama_init.context.get(); llama_context * ctx = llama_init.context.get();
const llama_vocab * vocab = llama_model_get_vocab(model);
// load the prompts from an external file if there are any // load the prompts from an external file if there are any
if (params.prompt.empty()) { if (params.prompt.empty()) {
LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
@ -358,7 +360,7 @@ int main(int argc, char ** argv) {
// client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str()); // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
if (client.n_decoded > 2 && if (client.n_decoded > 2 &&
(llama_token_is_eog(model, id) || (llama_vocab_is_eog(vocab, id) ||
(params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) || (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
client.response.find("User:") != std::string::npos || client.response.find("User:") != std::string::npos ||
client.response.find('\n') != std::string::npos)) { client.response.find('\n') != std::string::npos)) {

@ -70,15 +70,17 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
const llama_vocab * vocab = llama_model_get_vocab(model);
// initialize the context // initialize the context
llama_context_params ctx_params = common_context_params_to_llama(params); llama_context_params ctx_params = common_context_params_to_llama(params);
ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; ctx_params.n_ctx = llama_model_n_ctx_train(model)*n_grp + n_keep;
GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp"); GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_init_from_model(model, ctx_params);
if (ctx == NULL) { if (ctx == NULL) {
LOG_ERR("%s: failed to create the llama_context\n" , __func__); LOG_ERR("%s: failed to create the llama_context\n" , __func__);
return 1; return 1;
@ -223,7 +225,7 @@ int main(int argc, char ** argv) {
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
// is it an end of generation? // is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) {
LOG("\n"); LOG("\n");
break; break;
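The passkey hunk above sizes the context from the model's training context, `llama_model_n_ctx_train(model) * n_grp + n_keep`, and requires `n_batch` to be divisible by `n_grp`. A tiny worked sketch of that setup; the 4096-token training context in the comment is only an illustrative assumption:

```cpp
#include "llama.h"

// sketch: extend the context by a grouping factor while keeping n_keep tokens, as above
static llama_context * init_extended_ctx(llama_model * model, llama_context_params ctx_params,
                                         int n_grp, int n_keep) {
    // e.g. a (hypothetical) 4096-token training context with n_grp = 4, n_keep = 32
    // gives n_ctx = 4096 * 4 + 32 = 16416
    ctx_params.n_ctx = llama_model_n_ctx_train(model) * n_grp + n_keep;
    GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
    return llama_init_from_model(model, ctx_params); // was llama_new_context_with_model
}
```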

@ -296,8 +296,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
// Output: `perplexity: 13.5106 [114/114]` // Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval // BOS tokens will be added for each chunk before eval
const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); const llama_model * model = llama_get_model(ctx);
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); const llama_vocab * vocab = llama_model_get_vocab(model);
const bool add_bos = llama_vocab_get_add_bos(vocab);
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
LOG_INF("%s: tokenizing the input ..\n", __func__); LOG_INF("%s: tokenizing the input ..\n", __func__);
@ -338,7 +341,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_vocab = llama_vocab_n_tokens(vocab);
int count = 0; int count = 0;
double nll = 0.0; double nll = 0.0;
@ -382,7 +385,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
// add BOS token for the first batch of each chunk // add BOS token for the first batch of each chunk
if (add_bos && j == 0) { if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); tokens[batch_start] = llama_vocab_bos(vocab);
} }
const auto * batch_logits = llama_get_logits(ctx); const auto * batch_logits = llama_get_logits(ctx);
@ -444,8 +447,11 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
// Output: `perplexity: 13.5106 [114/114]` // Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval // BOS tokens will be added for each chunk before eval
const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); const llama_model * model = llama_get_model(ctx);
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); const llama_vocab * vocab = llama_model_get_vocab(model);
const bool add_bos = llama_vocab_get_add_bos(vocab);
GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
std::ofstream logits_stream; std::ofstream logits_stream;
if (!params.logits_file.empty()) { if (!params.logits_file.empty()) {
@ -485,7 +491,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_vocab = llama_vocab_n_tokens(vocab);
int count = 0; int count = 0;
double nll = 0.0; double nll = 0.0;
@ -557,7 +563,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
// add BOS token for the first batch of each chunk // add BOS token for the first batch of each chunk
if (add_bos && j == 0) { if (add_bos && j == 0) {
tokens[seq_start] = llama_token_bos(llama_get_model(ctx)); tokens[seq_start] = llama_vocab_bos(vocab);
} }
for (int k = 0; k < batch_size; ++k) { for (int k = 0; k < batch_size; ++k) {
@ -732,6 +738,9 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
} }
static void hellaswag_score(llama_context * ctx, const common_params & params) { static void hellaswag_score(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
// Calculates hellaswag score (acc_norm) from prompt // Calculates hellaswag score (acc_norm) from prompt
// //
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
@ -765,7 +774,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
size_t hs_task_count = prompt_lines.size()/6; size_t hs_task_count = prompt_lines.size()/6;
LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM; const bool is_spm = llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_SPM;
LOG_INF("================================= is_spm = %d\n", is_spm); LOG_INF("================================= is_spm = %d\n", is_spm);
// The tasks should be randomized so the score stabilizes quickly. // The tasks should be randomized so the score stabilizes quickly.
@ -848,7 +857,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_vocab = llama_vocab_n_tokens(vocab);
const int max_tasks_per_batch = 32; const int max_tasks_per_batch = 32;
const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@ -1072,6 +1081,8 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
* *
*/ */
static void winogrande_score(llama_context * ctx, const common_params & params) { static void winogrande_score(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
constexpr int k_min_trailing_ctx = 3; constexpr int k_min_trailing_ctx = 3;
@ -1130,7 +1141,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_vocab = llama_vocab_n_tokens(vocab);
const int max_tasks_per_batch = 128; const int max_tasks_per_batch = 128;
const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@ -1374,6 +1385,8 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
// https://huggingface.co/datasets/truthful_qa // https://huggingface.co/datasets/truthful_qa
// //
static void multiple_choice_score(llama_context * ctx, const common_params & params) { static void multiple_choice_score(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
std::istringstream strstream(params.prompt); std::istringstream strstream(params.prompt);
uint32_t n_task; uint32_t n_task;
@ -1482,7 +1495,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_vocab = llama_vocab_n_tokens(vocab);
const int max_tasks_per_batch = 32; const int max_tasks_per_batch = 32;
const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@ -1655,6 +1668,9 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
} }
static void kl_divergence(llama_context * ctx, const common_params & params) { static void kl_divergence(llama_context * ctx, const common_params & params) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
if (params.logits_file.empty()) { if (params.logits_file.empty()) {
LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
return; return;
@ -1688,8 +1704,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
return; return;
} }
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) { if (n_vocab != llama_vocab_n_tokens(vocab)) {
LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_vocab_n_tokens(vocab));
} }
std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk); std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
@ -1701,8 +1717,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
const int n_batch = params.n_batch; const int n_batch = params.n_batch;
const int num_batches = (n_ctx + n_batch - 1)/n_batch; const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int nv = 2*((n_vocab + 1)/2) + 4; const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); const bool add_bos = llama_vocab_get_add_bos(vocab);
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv); std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@ -1761,7 +1777,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
// add BOS token for the first batch of each chunk // add BOS token for the first batch of each chunk
if (add_bos && j == 0) { if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); tokens[batch_start] = llama_vocab_bos(vocab);
} }
common_batch_clear(batch); common_batch_clear(batch);
@ -1995,7 +2011,7 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx_train = llama_model_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) { if (params.n_ctx > n_ctx_train) {
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
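Each scoring function above now begins by pulling the vocab from the context's model and re-adds the BOS token only at the start of each chunk. A compact sketch of that per-chunk handling, with all names taken from the hunks:

```cpp
#include "llama.h"

#include <vector>

// sketch: per-chunk BOS handling as applied in perplexity_v2 / perplexity / kl_divergence above
static void prepare_chunk(llama_context * ctx, std::vector<llama_token> & tokens,
                          int batch_start, int j) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const bool add_bos = llama_vocab_get_add_bos(vocab); // was llama_add_bos_token(model)
    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));        // was llama_add_eos_token(model)

    // add BOS only for the first batch of each chunk
    if (add_bos && j == 0) {
        tokens[batch_start] = llama_vocab_bos(vocab);    // was llama_token_bos(model)
    }
}
```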

@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
auto cparams = llama_context_default_params(); auto cparams = llama_context_default_params();
cparams.n_ctx = 256; cparams.n_ctx = 256;
ctx = llama_new_context_with_model(model, cparams); ctx = llama_init_from_model(model, cparams);
if (ctx == NULL) { if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());

@ -47,7 +47,7 @@ echo PASS
echo echo
# 3a. Test the requanted model is loading properly # 3a. Test the requanted model is loading properly
$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32 $MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
echo PASS echo PASS
echo echo
@ -57,7 +57,7 @@ echo PASS
echo echo
# 4b. Test the requanted model is loading properly # 4b. Test the requanted model is loading properly
$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32 $MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
echo PASS echo PASS
echo echo

@ -159,7 +159,9 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
const int n_ctx_train = llama_n_ctx_train(model); const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_ctx_train = llama_model_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx); const int n_ctx = llama_n_ctx(ctx);
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
@ -192,8 +194,8 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
// add eos if not present // add eos if not present
if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) { if (llama_vocab_eos(vocab) >= 0 && (inp.empty() || inp.back() != llama_vocab_eos(vocab))) {
inp.push_back(llama_token_eos(model)); inp.push_back(llama_vocab_eos(vocab));
} }
chunk.tokens = inp; chunk.tokens = inp;
} }
@ -215,7 +217,7 @@ int main(int argc, char ** argv) {
struct llama_batch batch = llama_batch_init(n_batch, 0, 1); struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
// allocate output // allocate output
const int n_embd = llama_n_embd(model); const int n_embd = llama_model_n_embd(model);
std::vector<float> embeddings(n_chunks * n_embd, 0); std::vector<float> embeddings(n_chunks * n_embd, 0);
float * emb = embeddings.data(); float * emb = embeddings.data();

@ -29,7 +29,7 @@
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32) #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
[[noreturn]] static void sigint_handler(int) { [[noreturn]] static void sigint_handler(int) {
printf("\n"); printf("\n\033[0m");
exit(0); // not ideal, but it's the only way to guarantee exit in all cases exit(0); // not ideal, but it's the only way to guarantee exit in all cases
} }
#endif #endif
@ -685,7 +685,7 @@ class LlamaData {
// Initializes the context with the specified parameters // Initializes the context with the specified parameters
llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) { llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) {
llama_context_ptr context(llama_new_context_with_model(model.get(), opt.ctx_params)); llama_context_ptr context(llama_init_from_model(model.get(), opt.ctx_params));
if (!context) { if (!context) {
printe("%s: error: failed to create the llama_context\n", __func__); printe("%s: error: failed to create the llama_context\n", __func__);
} }
@ -713,11 +713,11 @@ static void add_message(const char * role, const std::string & text, LlamaData &
// Function to apply the chat template and resize `formatted` if needed // Function to apply the chat template and resize `formatted` if needed
static int apply_chat_template(LlamaData & llama_data, const bool append) { static int apply_chat_template(LlamaData & llama_data, const bool append) {
int result = llama_chat_apply_template( int result = llama_chat_apply_template(
llama_data.model.get(), nullptr, llama_data.messages.data(), llama_data.messages.size(), append, llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(), llama_data.messages.size(), append,
append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0); append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
if (append && result > static_cast<int>(llama_data.fmtted.size())) { if (append && result > static_cast<int>(llama_data.fmtted.size())) {
llama_data.fmtted.resize(result); llama_data.fmtted.resize(result);
result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), result = llama_chat_apply_template(llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(),
llama_data.messages.size(), append, llama_data.fmtted.data(), llama_data.messages.size(), append, llama_data.fmtted.data(),
llama_data.fmtted.size()); llama_data.fmtted.size());
} }
@ -726,11 +726,11 @@ static int apply_chat_template(LlamaData & llama_data, const bool append) {
} }
// Function to tokenize the prompt // Function to tokenize the prompt
static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt, static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
std::vector<llama_token> & prompt_tokens) { std::vector<llama_token> & prompt_tokens) {
const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true); const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
prompt_tokens.resize(n_prompt_tokens); prompt_tokens.resize(n_prompt_tokens);
if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
true) < 0) { true) < 0) {
printe("failed to tokenize the prompt\n"); printe("failed to tokenize the prompt\n");
return -1; return -1;
@ -753,9 +753,9 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch &
} }
// convert the token to a string // convert the token to a string
static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) { static int convert_token_to_string(const llama_vocab * vocab, const llama_token token_id, std::string & piece) {
char buf[256]; char buf[256];
int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true); int n = llama_token_to_piece(vocab, token_id, buf, sizeof(buf), 0, true);
if (n < 0) { if (n < 0) {
printe("failed to convert token to piece\n"); printe("failed to convert token to piece\n");
return 1; return 1;
@ -773,8 +773,10 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st
// helper function to evaluate a prompt and generate a response // helper function to evaluate a prompt and generate a response
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) { static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get());
std::vector<llama_token> tokens; std::vector<llama_token> tokens;
if (tokenize_prompt(llama_data.model, prompt, tokens) < 0) { if (tokenize_prompt(vocab, prompt, tokens) < 0) {
return 1; return 1;
} }
@ -790,12 +792,12 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
// sample the next token, check is it an end of generation? // sample the next token, check is it an end of generation?
new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1); new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
if (llama_token_is_eog(llama_data.model.get(), new_token_id)) { if (llama_vocab_is_eog(vocab, new_token_id)) {
break; break;
} }
std::string piece; std::string piece;
if (convert_token_to_string(llama_data.model, new_token_id, piece)) { if (convert_token_to_string(vocab, new_token_id, piece)) {
return 1; return 1;
} }
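The run.cpp hunks above pass the vocab, rather than the model, to `tokenize_prompt` and `convert_token_to_string`. Tokenization is still done in two passes: a NULL-buffer call whose negated return value gives the required size, then the real call. A sketch of that two-pass pattern, using the calls exactly as they appear above:

```cpp
#include "llama.h"

#include <string>
#include <vector>

// sketch: size query first (negative return), then tokenize into the sized buffer
static int tokenize_prompt_sketch(const llama_vocab * vocab, const std::string & prompt,
                                  std::vector<llama_token> & prompt_tokens) {
    const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
    prompt_tokens.resize(n_prompt_tokens);
    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(),
                       true, true) < 0) {
        return -1; // failed to tokenize the prompt
    }
    return n_prompt_tokens;
}
```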

@ -97,7 +97,7 @@ int main(int argc, char ** argv) {
printf("\n\n"); printf("\n\n");
// make new context // make new context
llama_context * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params)); llama_context * ctx2 = llama_init_from_model(model, common_context_params_to_llama(params));
llama_sampler * smpl2 = llama_sampler_chain_init(sparams); llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
@ -154,7 +154,7 @@ int main(int argc, char ** argv) {
} }
// make new context // make new context
llama_context * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params)); llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params));
llama_sampler * smpl3 = llama_sampler_chain_init(sparams); llama_sampler * smpl3 = llama_sampler_chain_init(sparams);

Binary file not shown.

@ -98,7 +98,7 @@ struct slot_params {
int64_t t_max_prompt_ms = -1; // TODO: implement int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<common_lora_adapter_info> lora; std::vector<common_adapter_lora_info> lora;
std::vector<std::string> antiprompt; std::vector<std::string> antiprompt;
std::vector<std::string> response_fields; std::vector<std::string> response_fields;
@ -198,15 +198,17 @@ struct server_task {
bool metrics_reset_bucket = false; bool metrics_reset_bucket = false;
// used by SERVER_TASK_TYPE_SET_LORA // used by SERVER_TASK_TYPE_SET_LORA
std::vector<common_lora_adapter_info> set_lora; std::vector<common_adapter_lora_info> set_lora;
server_task(server_task_type type) : type(type) {} server_task(server_task_type type) : type(type) {}
static slot_params params_from_json_cmpl( static slot_params params_from_json_cmpl(
const llama_model * model,
const llama_context * ctx, const llama_context * ctx,
const common_params & params_base, const common_params & params_base,
const json & data) { const json & data) {
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
slot_params params; slot_params params;
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
@ -329,7 +331,7 @@ struct server_task {
const auto & logit_bias = data.find("logit_bias"); const auto & logit_bias = data.find("logit_bias");
if (logit_bias != data.end() && logit_bias->is_array()) { if (logit_bias != data.end() && logit_bias->is_array()) {
const int n_vocab = llama_n_vocab(model); const int n_vocab = llama_vocab_n_tokens(vocab);
for (const auto & el : *logit_bias) { for (const auto & el : *logit_bias) {
// TODO: we may want to throw errors here, in case "el" is incorrect // TODO: we may want to throw errors here, in case "el" is incorrect
if (el.is_array() && el.size() == 2) { if (el.is_array() && el.size() == 2) {
@ -348,7 +350,7 @@ struct server_task {
params.sampling.logit_bias.push_back({tok, bias}); params.sampling.logit_bias.push_back({tok, bias});
} }
} else if (el[0].is_string()) { } else if (el[0].is_string()) {
auto toks = common_tokenize(model, el[0].get<std::string>(), false); auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
for (auto tok : toks) { for (auto tok : toks) {
params.sampling.logit_bias.push_back({tok, bias}); params.sampling.logit_bias.push_back({tok, bias});
} }
@ -1131,7 +1133,7 @@ struct server_slot {
common_speculative * spec = nullptr; common_speculative * spec = nullptr;
std::vector<common_lora_adapter_info> lora; std::vector<common_adapter_lora_info> lora;
// the index relative to completion multi-task request // the index relative to completion multi-task request
size_t index = 0; size_t index = 0;
@ -1633,6 +1635,8 @@ struct server_context {
llama_model * model = nullptr; llama_model * model = nullptr;
llama_context * ctx = nullptr; llama_context * ctx = nullptr;
const llama_vocab * vocab = nullptr;
llama_model * model_dft = nullptr; llama_model * model_dft = nullptr;
llama_context_params cparams_dft; llama_context_params cparams_dft;
@ -1690,10 +1694,12 @@ struct server_context {
return false; return false;
} }
vocab = llama_model_get_vocab(model);
n_ctx = llama_n_ctx(ctx); n_ctx = llama_n_ctx(ctx);
add_bos_token = llama_add_bos_token(model); add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL; has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
if (!params_base.speculative.model.empty()) { if (!params_base.speculative.model.empty()) {
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str()); SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
@ -1736,7 +1742,8 @@ struct server_context {
bool validate_builtin_chat_template() const { bool validate_builtin_chat_template() const {
llama_chat_message chat[] = {{"user", "test"}}; llama_chat_message chat[] = {{"user", "test"}};
int32_t chat_res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0); const char * tmpl = llama_model_chat_template(model);
const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
return chat_res > 0; return chat_res > 0;
} }
@ -1756,7 +1763,7 @@ struct server_context {
if (model_dft) { if (model_dft) {
slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1); slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft); slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft);
if (slot.ctx_dft == nullptr) { if (slot.ctx_dft == nullptr) {
SRV_ERR("%s", "failed to create draft context\n"); SRV_ERR("%s", "failed to create draft context\n");
return; return;
@ -1891,7 +1898,7 @@ struct server_context {
} }
if (slot.params.ignore_eos && has_eos_token) { if (slot.params.ignore_eos && has_eos_token) {
slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY}); slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
} }
{ {
@ -2047,14 +2054,14 @@ struct server_context {
slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx); slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx);
} }
if (llama_token_is_eog(model, result.tok)) { if (llama_vocab_is_eog(vocab, result.tok)) {
slot.stop = STOP_TYPE_EOS; slot.stop = STOP_TYPE_EOS;
slot.has_next_token = false; slot.has_next_token = false;
SLT_DBG(slot, "%s", "stopped by EOS\n"); SLT_DBG(slot, "%s", "stopped by EOS\n");
} }
const auto n_ctx_train = llama_n_ctx_train(model); const auto n_ctx_train = llama_model_n_ctx_train(model);
if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
slot.truncated = true; slot.truncated = true;
@ -2074,7 +2081,7 @@ struct server_context {
void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) { void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) {
size_t n_probs = slot.params.sampling.n_probs; size_t n_probs = slot.params.sampling.n_probs;
size_t n_vocab = llama_n_vocab(llama_get_model(ctx)); size_t n_vocab = llama_vocab_n_tokens(vocab);
if (post_sampling) { if (post_sampling) {
const auto * cur_p = common_sampler_get_candidates(slot.smpl); const auto * cur_p = common_sampler_get_candidates(slot.smpl);
const size_t max_probs = cur_p->size; const size_t max_probs = cur_p->size;
@ -2225,7 +2232,7 @@ struct server_context {
res->n_tokens = slot.n_prompt_tokens; res->n_tokens = slot.n_prompt_tokens;
res->oaicompat = slot.params.oaicompat; res->oaicompat = slot.params.oaicompat;
const int n_embd = llama_n_embd(model); const int n_embd = llama_model_n_embd(model);
std::vector<float> embd_res(n_embd, 0.0f); std::vector<float> embd_res(n_embd, 0.0f);
@ -2927,7 +2934,7 @@ struct server_context {
// make sure we're in the right embedding mode // make sure we're in the right embedding mode
llama_set_embeddings(ctx, slot_batched->is_non_causal()); llama_set_embeddings(ctx, slot_batched->is_non_causal());
// apply lora, only need to do it once per batch // apply lora, only need to do it once per batch
common_lora_adapters_apply(ctx, slot_batched->lora); common_set_adapter_lora(ctx, slot_batched->lora);
} }
// process the created batch of tokens // process the created batch of tokens
@ -3129,11 +3136,11 @@ struct server_context {
json model_meta() const { json model_meta() const {
return json { return json {
{"vocab_type", llama_vocab_type (model)}, {"vocab_type", llama_vocab_type (vocab)},
{"n_vocab", llama_n_vocab (model)}, {"n_vocab", llama_vocab_n_tokens (vocab)},
{"n_ctx_train", llama_n_ctx_train (model)}, {"n_ctx_train", llama_model_n_ctx_train(model)},
{"n_embd", llama_n_embd (model)}, {"n_embd", llama_model_n_embd (model)},
{"n_params", llama_model_n_params(model)}, {"n_params", llama_model_n_params (model)},
{"size", llama_model_size (model)}, {"size", llama_model_size (model)},
}; };
} }
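The server hunks above obtain the chat template string with `llama_model_chat_template(model)` and pass it, instead of the model pointer, to `llama_chat_apply_template`; a NULL output buffer turns the call into a pure validation probe. A minimal sketch of that check, with the signatures assumed from the hunks:

```cpp
#include "llama.h"

// sketch: validate the model's built-in chat template by applying it to a dummy message
static bool builtin_chat_template_is_valid(const llama_model * model) {
    llama_chat_message chat[] = {{"user", "test"}};

    const char * tmpl = llama_model_chat_template(model); // the template string replaces the model argument
    const int32_t res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);

    return res > 0;
}
```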
@ -3639,7 +3646,7 @@ int main(int argc, char ** argv) {
std::vector<server_task> tasks; std::vector<server_task> tasks;
try { try {
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, data.at("prompt"), true, true); std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), true, true);
tasks.reserve(tokenized_prompts.size()); tasks.reserve(tokenized_prompts.size());
for (size_t i = 0; i < tokenized_prompts.size(); i++) { for (size_t i = 0; i < tokenized_prompts.size(); i++) {
server_task task = server_task(type); server_task task = server_task(type);
@ -3649,7 +3656,6 @@ int main(int argc, char ** argv) {
task.prompt_tokens = std::move(tokenized_prompts[i]); task.prompt_tokens = std::move(tokenized_prompts[i]);
task.params = server_task::params_from_json_cmpl( task.params = server_task::params_from_json_cmpl(
ctx_server.model,
ctx_server.ctx, ctx_server.ctx,
ctx_server.params_base, ctx_server.params_base,
data); data);
@ -3745,13 +3751,13 @@ int main(int argc, char ** argv) {
const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
// check model compatibility // check model compatibility
std::string err; std::string err;
if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) { if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
err += "prefix token is missing. "; err += "prefix token is missing. ";
} }
if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) { if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
err += "suffix token is missing. "; err += "suffix token is missing. ";
} }
if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) { if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
err += "middle token is missing. "; err += "middle token is missing. ";
} }
if (!err.empty()) { if (!err.empty()) {
@ -3797,10 +3803,10 @@ int main(int argc, char ** argv) {
data["input_extra"] = input_extra; // default to empty array if it's not exist data["input_extra"] = input_extra; // default to empty array if it's not exist
std::string prompt = json_value(data, "prompt", std::string()); std::string prompt = json_value(data, "prompt", std::string());
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, false, true); std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true);
SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
data["prompt"] = format_infill( data["prompt"] = format_infill(
ctx_server.ctx, ctx_server.vocab,
data.at("input_prefix"), data.at("input_prefix"),
data.at("input_suffix"), data.at("input_suffix"),
data.at("input_extra"), data.at("input_extra"),
@ -3857,7 +3863,7 @@ int main(int argc, char ** argv) {
const bool add_special = json_value(body, "add_special", false); const bool add_special = json_value(body, "add_special", false);
const bool with_pieces = json_value(body, "with_pieces", false); const bool with_pieces = json_value(body, "with_pieces", false);
llama_tokens tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true); llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true);
if (with_pieces) { if (with_pieces) {
for (const auto& token : tokens) { for (const auto& token : tokens) {
@ -3933,7 +3939,7 @@ int main(int argc, char ** argv) {
} }
} }
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true); std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
for (const auto & tokens : tokenized_prompts) { for (const auto & tokens : tokenized_prompts) {
// this check is necessary for models that do not add BOS token to the input // this check is necessary for models that do not add BOS token to the input
if (tokens.empty()) { if (tokens.empty()) {
@ -4033,20 +4039,20 @@ int main(int argc, char ** argv) {
return; return;
} }
llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.ctx, query, /* add_special */ false, true)[0]; llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0];
// create and queue the task // create and queue the task
json responses = json::array(); json responses = json::array();
bool error = false; bool error = false;
{ {
std::vector<server_task> tasks; std::vector<server_task> tasks;
std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.ctx, documents, /* add_special */ false, true); std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
tasks.reserve(tokenized_docs.size()); tasks.reserve(tokenized_docs.size());
for (size_t i = 0; i < tokenized_docs.size(); i++) { for (size_t i = 0; i < tokenized_docs.size(); i++) {
server_task task = server_task(SERVER_TASK_TYPE_RERANK); server_task task = server_task(SERVER_TASK_TYPE_RERANK);
task.id = ctx_server.queue_tasks.get_new_id(); task.id = ctx_server.queue_tasks.get_new_id();
task.index = i; task.index = i;
task.prompt_tokens = format_rerank(ctx_server.model, tokenized_query, tokenized_docs[i]); task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
tasks.push_back(task); tasks.push_back(task);
} }

View file

@ -118,7 +118,7 @@ static json json_get_nested_values(const std::vector<std::string> & paths, const
* - only string, example: "string" * - only string, example: "string"
* - mixed string and tokens, example: [12, 34, "string", 56, 78] * - mixed string and tokens, example: [12, 34, "string", 56, 78]
*/ */
static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) { static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
// If `add_bos` is true, we only add BOS, when json_prompt is a string, // If `add_bos` is true, we only add BOS, when json_prompt is a string,
// or the first element of the json_prompt array is a string. // or the first element of the json_prompt array is a string.
llama_tokens prompt_tokens; llama_tokens prompt_tokens;
@ -131,10 +131,10 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_
llama_tokens p; llama_tokens p;
if (first) { if (first) {
p = common_tokenize(ctx, s, add_special, parse_special); p = common_tokenize(vocab, s, add_special, parse_special);
first = false; first = false;
} else { } else {
p = common_tokenize(ctx, s, false, parse_special); p = common_tokenize(vocab, s, false, parse_special);
} }
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
@ -148,7 +148,7 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_
} }
} else { } else {
auto s = json_prompt.template get<std::string>(); auto s = json_prompt.template get<std::string>();
prompt_tokens = common_tokenize(ctx, s, add_special, parse_special); prompt_tokens = common_tokenize(vocab, s, add_special, parse_special);
} }
return prompt_tokens; return prompt_tokens;
@ -166,11 +166,11 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_
* - "prompt": [[12, 34, 56], [78, 90, 12]] * - "prompt": [[12, 34, 56], [78, 90, 12]]
* - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]] * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
*/ */
static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) { static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
std::vector<llama_tokens> result; std::vector<llama_tokens> result;
if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) { if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
// string or mixed // string or mixed
result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special)); result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special));
} else if (json_is_array_of_numbers(json_prompt)) { } else if (json_is_array_of_numbers(json_prompt)) {
// array of tokens // array of tokens
result.push_back(json_prompt.get<llama_tokens>()); result.push_back(json_prompt.get<llama_tokens>());
@ -179,7 +179,7 @@ static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, con
result.reserve(json_prompt.size()); result.reserve(json_prompt.size());
for (const auto & p : json_prompt) { for (const auto & p : json_prompt) {
if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) { if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
result.push_back(tokenize_mixed(ctx, p, add_special, parse_special)); result.push_back(tokenize_mixed(vocab, p, add_special, parse_special));
} else if (json_is_array_of_numbers(p)) { } else if (json_is_array_of_numbers(p)) {
// array of tokens // array of tokens
result.push_back(p.get<llama_tokens>()); result.push_back(p.get<llama_tokens>());
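A small sketch (not part of the patch) of the prompt shapes listed in the comment above, written with `nlohmann::json` as the server does; the variable names are illustrative only:

```cpp
// Sketch of the prompt shapes handled by tokenize_input_prompts (illustrative only).
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    const json single_string = "hello world";                           // single text prompt
    const json token_array   = json::array({12, 34, 56});               // single pre-tokenized prompt
    const json mixed         = json::array({12, 34, "string", 56, 78}); // tokens and text interleaved
    const json batch         = json::array({                            // several prompts in one request
        "first prompt",
        json::array({12, 34, 56}),
        json::array({12, 34, "string", 56, 78})
    });

    // each of these would produce one llama_tokens entry per prompt in the result
    (void) single_string; (void) token_array; (void) mixed; (void) batch;
    return 0;
}
```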
@ -231,21 +231,23 @@ static size_t validate_utf8(const std::string& text) {
// //
// format rerank task: [BOS]query[EOS][SEP]doc[EOS] // format rerank task: [BOS]query[EOS][SEP]doc[EOS]
static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) { static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
llama_tokens result; llama_tokens result;
result.reserve(doc.size() + query.size() + 4); result.reserve(doc.size() + query.size() + 4);
result.push_back(llama_token_bos(model)); result.push_back(llama_vocab_bos(vocab));
result.insert(result.end(), query.begin(), query.end()); result.insert(result.end(), query.begin(), query.end());
result.push_back(llama_token_eos(model)); result.push_back(llama_vocab_eos(vocab));
result.push_back(llama_token_sep(model)); result.push_back(llama_vocab_sep(vocab));
result.insert(result.end(), doc.begin(), doc.end()); result.insert(result.end(), doc.begin(), doc.end());
result.push_back(llama_token_eos(model)); result.push_back(llama_vocab_eos(vocab));
return result; return result;
} }
// format infill task // format infill task
static llama_tokens format_infill( static llama_tokens format_infill(
const llama_context * ctx, const llama_vocab * vocab,
const json & input_prefix, const json & input_prefix,
const json & input_suffix, const json & input_suffix,
const json & input_extra, const json & input_extra,
@ -272,15 +274,14 @@ static llama_tokens format_infill(
llama_tokens extra_tokens; llama_tokens extra_tokens;
extra_tokens.reserve(n_ctx); extra_tokens.reserve(n_ctx);
auto model = llama_get_model(ctx); auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false);
auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false); auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false);
auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false);
if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) { if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) {
// TODO: make project name an input // TODO: make project name an input
static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false); static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false);
extra_tokens.push_back(llama_token_fim_rep(model)); extra_tokens.push_back(llama_vocab_fim_rep(vocab));
extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end()); extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
} }
for (const auto & chunk : input_extra) { for (const auto & chunk : input_extra) {
@ -288,28 +289,28 @@ static llama_tokens format_infill(
const std::string text = json_value(chunk, "text", std::string()); const std::string text = json_value(chunk, "text", std::string());
const std::string filename = json_value(chunk, "filename", std::string("tmp")); const std::string filename = json_value(chunk, "filename", std::string("tmp"));
if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false); const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false);
extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model)); extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
} else { } else {
// chunk separator in binary form to avoid confusing the AI // chunk separator in binary form to avoid confusing the AI
static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00}; static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false); static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false);
extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end()); extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
} }
const auto chunk_tokens = common_tokenize(ctx, text, false, false); const auto chunk_tokens = common_tokenize(vocab, text, false, false);
extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end()); extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
} }
if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
// TODO: current filename // TODO: current filename
static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false); static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false);
extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model)); extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
} }
@ -325,15 +326,15 @@ static llama_tokens format_infill(
tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take); tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
tokens_suffix.resize(n_suffix_take); tokens_suffix.resize(n_suffix_take);
tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model)); tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab));
tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end()); tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model)); tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab));
auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix; auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
auto embd_end = spm_infill ? tokens_prefix : tokens_suffix; auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;
if (llama_add_bos_token(model)) { if (llama_vocab_get_add_bos(vocab)) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
} }
SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size()); SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
@ -342,7 +343,7 @@ static llama_tokens format_infill(
embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end()); embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
embd_inp.push_back(llama_token_fim_mid(model)); embd_inp.push_back(llama_vocab_fim_mid(vocab));
return embd_inp; return embd_inp;
} }
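A minimal sketch (not part of the patch) of the FIM-capability check that the `/infill` handler performs with these same vocab-level getters; the helper name is illustrative:

```cpp
// Sketch: report which FIM special tokens a vocab is missing (illustrative only).
#include <string>
#include "llama.h"

std::string check_fim_support(const llama_vocab * vocab) {
    std::string err;
    if (llama_vocab_fim_pre(vocab) == LLAMA_TOKEN_NULL) {
        err += "prefix token is missing. ";
    }
    if (llama_vocab_fim_suf(vocab) == LLAMA_TOKEN_NULL) {
        err += "suffix token is missing. ";
    }
    if (llama_vocab_fim_mid(vocab) == LLAMA_TOKEN_NULL) {
        err += "middle token is missing. ";
    }
    return err; // empty string -> the model can serve infill requests
}
```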
@ -764,14 +765,18 @@ static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias)
return data; return data;
} }
static std::string safe_json_to_str(json data) { static std::string safe_json_to_str(const json & data) {
return data.dump(-1, ' ', false, json::error_handler_t::replace); return data.dump(-1, ' ', false, json::error_handler_t::replace);
} }
static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) { static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
std::vector<llama_token_data> cur; std::vector<llama_token_data> cur;
const auto * logits = llama_get_logits_ith(ctx, idx); const auto * logits = llama_get_logits_ith(ctx, idx);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
cur.resize(n_vocab); cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) { for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@ -799,8 +804,8 @@ static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx
} }
static bool are_lora_equal( static bool are_lora_equal(
const std::vector<common_lora_adapter_info> & l1, const std::vector<common_adapter_lora_info> & l1,
const std::vector<common_lora_adapter_info> & l2) { const std::vector<common_adapter_lora_info> & l2) {
if (l1.size() != l2.size()) { if (l1.size() != l2.size()) {
return false; return false;
} }
@ -814,10 +819,10 @@ static bool are_lora_equal(
} }
// parse lora config from JSON request, returned a copy of lora_base with updated scale // parse lora config from JSON request, returned a copy of lora_base with updated scale
static std::vector<common_lora_adapter_info> parse_lora_request( static std::vector<common_adapter_lora_info> parse_lora_request(
const std::vector<common_lora_adapter_info> & lora_base, const std::vector<common_adapter_lora_info> & lora_base,
const json & data) { const json & data) {
std::vector<common_lora_adapter_info> lora(lora_base); std::vector<common_adapter_lora_info> lora(lora_base);
int max_idx = lora.size(); int max_idx = lora.size();
// clear existing value // clear existing value

View file

@ -37,7 +37,7 @@
<div v-for="conv in conversations" :class="{ <div v-for="conv in conversations" :class="{
'btn btn-ghost justify-start font-normal': true, 'btn btn-ghost justify-start font-normal': true,
'btn-active': conv.id === viewingConvId, 'btn-active': conv.id === viewingConvId,
}" @click="setViewingConv(conv.id)"> }" @click="setViewingConv(conv.id)" dir="auto">
<span class="truncate">{{ conv.messages[0].content }}</span> <span class="truncate">{{ conv.messages[0].content }}</span>
</div> </div>
<div class="text-center text-xs opacity-40 mt-auto mx-4"> <div class="text-center text-xs opacity-40 mt-auto mx-4">
@ -156,6 +156,7 @@
@keydown.enter.shift.exact.prevent="inputMsg += '\n'" @keydown.enter.shift.exact.prevent="inputMsg += '\n'"
:disabled="isGenerating" :disabled="isGenerating"
id="msg-input" id="msg-input"
dir="auto"
></textarea> ></textarea>
<button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button> <button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
<button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button> <button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
@ -248,6 +249,7 @@
<!-- textarea for editing message --> <!-- textarea for editing message -->
<template v-if="editingContent !== null"> <template v-if="editingContent !== null">
<textarea <textarea
dir="auto"
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96" class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
v-model="editingContent"></textarea> v-model="editingContent"></textarea>
<br/> <br/>
@ -258,7 +260,9 @@
<!-- show loading dots for pending message --> <!-- show loading dots for pending message -->
<span v-if="msg.content === null" class="loading loading-dots loading-md"></span> <span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
<!-- render message as markdown --> <!-- render message as markdown -->
<vue-markdown v-else :source="msg.content"></vue-markdown> <div v-else dir="auto">
<vue-markdown :source="msg.content"></vue-markdown>
</div>
<!-- render timings if enabled --> <!-- render timings if enabled -->
<div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond"> <div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
<div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div> <div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>

View file

@ -111,12 +111,12 @@ const VueMarkdown = defineComponent(
highlight: function (str, lang) { // Add highlight.js highlight: function (str, lang) { // Add highlight.js
if (lang && hljs.getLanguage(lang)) { if (lang && hljs.getLanguage(lang)) {
try { try {
return '<pre><code class="hljs">' + return '<pre dir="auto"><code class="hljs">' +
hljs.highlight(str, { language: lang, ignoreIllegals: true }).value + hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
'</code></pre>'; '</code></pre>';
} catch (__) {} } catch (__) {}
} }
return '<pre><code class="hljs">' + md.value.utils.escapeHtml(str) + '</code></pre>'; return '<pre dir="auto"><code class="hljs">' + md.value.utils.escapeHtml(str) + '</code></pre>';
} }
})); }));
// support latex with double dollar sign and square brackets // support latex with double dollar sign and square brackets

View file

@ -75,12 +75,14 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
const llama_vocab * vocab = llama_model_get_vocab(model);
// initialize the context // initialize the context
llama_context_params ctx_params = llama_context_default_params(); llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = n_ctx; ctx_params.n_ctx = n_ctx;
ctx_params.n_batch = n_ctx; ctx_params.n_batch = n_ctx;
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_init_from_model(model, ctx_params);
if (!ctx) { if (!ctx) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return 1; return 1;
@ -97,9 +99,9 @@ int main(int argc, char ** argv) {
std::string response; std::string response;
// tokenize the prompt // tokenize the prompt
const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true); const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
std::vector<llama_token> prompt_tokens(n_prompt_tokens); std::vector<llama_token> prompt_tokens(n_prompt_tokens);
if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) { if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
GGML_ABORT("failed to tokenize the prompt\n"); GGML_ABORT("failed to tokenize the prompt\n");
} }
@ -124,13 +126,13 @@ int main(int argc, char ** argv) {
new_token_id = llama_sampler_sample(smpl, ctx, -1); new_token_id = llama_sampler_sample(smpl, ctx, -1);
// is it an end of generation? // is it an end of generation?
if (llama_token_is_eog(model, new_token_id)) { if (llama_vocab_is_eog(vocab, new_token_id)) {
break; break;
} }
// convert the token to a string, print it and add it to the response // convert the token to a string, print it and add it to the response
char buf[256]; char buf[256];
int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true); int n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
if (n < 0) { if (n < 0) {
GGML_ABORT("failed to convert token to piece\n"); GGML_ABORT("failed to convert token to piece\n");
} }
@ -159,12 +161,14 @@ int main(int argc, char ** argv) {
break; break;
} }
const char * tmpl = llama_model_chat_template(model);
// add the user input to the message list and format it // add the user input to the message list and format it
messages.push_back({"user", strdup(user.c_str())}); messages.push_back({"user", strdup(user.c_str())});
int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size()); int new_len = llama_chat_apply_template(tmpl, messages.data(), messages.size(), true, formatted.data(), formatted.size());
if (new_len > (int)formatted.size()) { if (new_len > (int)formatted.size()) {
formatted.resize(new_len); formatted.resize(new_len);
new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size()); new_len = llama_chat_apply_template(tmpl, messages.data(), messages.size(), true, formatted.data(), formatted.size());
} }
if (new_len < 0) { if (new_len < 0) {
fprintf(stderr, "failed to apply the chat template\n"); fprintf(stderr, "failed to apply the chat template\n");
@ -181,7 +185,7 @@ int main(int argc, char ** argv) {
// add the response to the messages // add the response to the messages
messages.push_back({"assistant", strdup(response.c_str())}); messages.push_back({"assistant", strdup(response.c_str())});
prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0); prev_len = llama_chat_apply_template(tmpl, messages.data(), messages.size(), false, nullptr, 0);
if (prev_len < 0) { if (prev_len < 0) {
fprintf(stderr, "failed to apply the chat template\n"); fprintf(stderr, "failed to apply the chat template\n");
return 1; return 1;
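A compact sketch (not part of the patch) of the new chat-template flow shown above, assuming `model` is already loaded; the helper name is illustrative:

```cpp
// Sketch of the new chat-template flow (illustrative; assumes a loaded model).
#include <string>
#include <vector>
#include "llama.h"

std::string apply_chat_template(const llama_model * model,
                                const std::vector<llama_chat_message> & messages,
                                bool add_assistant) {
    const char * tmpl = llama_model_chat_template(model); // template now comes from the model

    std::vector<char> buf(1024);
    int32_t n = llama_chat_apply_template(tmpl, messages.data(), messages.size(),
                                          add_assistant, buf.data(), (int32_t) buf.size());
    if (n > (int32_t) buf.size()) {
        buf.resize(n); // grow the buffer and format again
        n = llama_chat_apply_template(tmpl, messages.data(), messages.size(),
                                      add_assistant, buf.data(), (int32_t) buf.size());
    }
    if (n < 0) {
        return std::string(); // template could not be applied
    }
    return std::string(buf.data(), n);
}
```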

View file

@ -84,6 +84,7 @@ int main(int argc, char ** argv) {
model_params.n_gpu_layers = ngl; model_params.n_gpu_layers = ngl;
llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params); llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
const llama_vocab * vocab = llama_model_get_vocab(model);
if (model == NULL) { if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__); fprintf(stderr , "%s: error: unable to load model\n" , __func__);
@ -93,11 +94,11 @@ int main(int argc, char ** argv) {
// tokenize the prompt // tokenize the prompt
// find the number of tokens in the prompt // find the number of tokens in the prompt
const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true); const int n_prompt = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
// allocate space for the tokens and tokenize the prompt // allocate space for the tokens and tokenize the prompt
std::vector<llama_token> prompt_tokens(n_prompt); std::vector<llama_token> prompt_tokens(n_prompt);
if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) { if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__); fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
return 1; return 1;
} }
@ -112,7 +113,7 @@ int main(int argc, char ** argv) {
// enable performance counters // enable performance counters
ctx_params.no_perf = false; ctx_params.no_perf = false;
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_init_from_model(model, ctx_params);
if (ctx == NULL) { if (ctx == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
@ -131,7 +132,7 @@ int main(int argc, char ** argv) {
for (auto id : prompt_tokens) { for (auto id : prompt_tokens) {
char buf[128]; char buf[128];
int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true); int n = llama_token_to_piece(vocab, id, buf, sizeof(buf), 0, true);
if (n < 0) { if (n < 0) {
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__); fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
return 1; return 1;
@ -164,12 +165,12 @@ int main(int argc, char ** argv) {
new_token_id = llama_sampler_sample(smpl, ctx, -1); new_token_id = llama_sampler_sample(smpl, ctx, -1);
// is it an end of generation? // is it an end of generation?
if (llama_token_is_eog(model, new_token_id)) { if (llama_vocab_is_eog(vocab, new_token_id)) {
break; break;
} }
char buf[128]; char buf[128];
int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true); int n = llama_token_to_piece(vocab, new_token_id, buf, sizeof(buf), 0, true);
if (n < 0) { if (n < 0) {
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__); fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
return 1; return 1;
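A minimal sketch (not part of the patch) of the two-pass tokenization idiom used above, now targeting the vocab; the helper name is illustrative:

```cpp
// Sketch of two-pass tokenization against a llama_vocab (illustrative only).
#include <string>
#include <vector>
#include "llama.h"

std::vector<llama_token> tokenize_prompt(const llama_vocab * vocab, const std::string & prompt) {
    // first pass: with a NULL buffer the negated return value is the required token count
    const int n_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0,
                                         /*add_special=*/true, /*parse_special=*/true);

    std::vector<llama_token> tokens(n_tokens);

    // second pass: fill the buffer
    if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), tokens.data(), tokens.size(),
                       /*add_special=*/true, /*parse_special=*/true) < 0) {
        tokens.clear(); // tokenization failed
    }
    return tokens;
}
```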

View file

@ -45,6 +45,8 @@ int main(int argc, char ** argv) {
model_tgt = llama_init_tgt.model.get(); model_tgt = llama_init_tgt.model.get();
ctx_tgt = llama_init_tgt.context.get(); ctx_tgt = llama_init_tgt.context.get();
const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
// load the draft model // load the draft model
params.devices = params.speculative.devices; params.devices = params.speculative.devices;
params.model = params.speculative.model; params.model = params.speculative.model;
@ -196,7 +198,7 @@ int main(int argc, char ** argv) {
id_last = ids[i]; id_last = ids[i];
if (llama_token_is_eog(model_tgt, id_last)) { if (llama_vocab_is_eog(vocab, id_last)) {
has_eos = true; has_eos = true;
break; break;
} }

View file

@ -90,10 +90,13 @@ int main(int argc, char ** argv) {
model_dft = llama_init_dft.model.get(); model_dft = llama_init_dft.model.get();
ctx_dft = llama_init_dft.context.get(); ctx_dft = llama_init_dft.context.get();
const bool vocab_type_tgt = llama_vocab_type(model_tgt); const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt); LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);
const bool vocab_type_dft = llama_vocab_type(model_dft); const bool vocab_type_dft = llama_vocab_type(vocab_dft);
LOG_DBG("vocab_type dft: %d\n", vocab_type_dft); LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);
if (vocab_type_tgt != vocab_type_dft) { if (vocab_type_tgt != vocab_type_dft) {
@ -103,18 +106,18 @@ int main(int argc, char ** argv) {
} }
if ( if (
llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) || llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) || llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
llama_token_bos(model_tgt) != llama_token_bos(model_dft) || llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
llama_token_eos(model_tgt) != llama_token_eos(model_dft) llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
) { ) {
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__); LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
return 1; return 1;
} }
{ {
const int n_vocab_tgt = llama_n_vocab(model_tgt); const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
const int n_vocab_dft = llama_n_vocab(model_dft); const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
const int vocab_diff = n_vocab_tgt > n_vocab_dft const int vocab_diff = n_vocab_tgt > n_vocab_dft
? n_vocab_tgt - n_vocab_dft ? n_vocab_tgt - n_vocab_dft
: n_vocab_dft - n_vocab_tgt; : n_vocab_dft - n_vocab_tgt;
@ -122,13 +125,13 @@ int main(int argc, char ** argv) {
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__); LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return 1; return 1;
} }
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
const char * token_text_tgt = llama_token_get_text(model_tgt, i); const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
const char * token_text_dft = llama_token_get_text(model_dft, i); const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) { if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__); LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i, LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
@ -170,7 +173,7 @@ int main(int argc, char ** argv) {
const auto t_enc_end = ggml_time_us(); const auto t_enc_end = ggml_time_us();
// the 2 models should have the same vocab // the 2 models should have the same vocab
//GGML_ASSERT(n_vocab == llama_n_vocab(model_dft)); //GGML_ASSERT(n_vocab == llama_vocab_n_tokens(model_dft));
// how many tokens to draft each time // how many tokens to draft each time
int n_draft = params.speculative.n_max; int n_draft = params.speculative.n_max;
@ -386,7 +389,7 @@ int main(int argc, char ** argv) {
} }
} }
if (llama_token_is_eog(model_tgt, token_id)) { if (llama_vocab_is_eog(vocab_tgt, token_id)) {
has_eos = true; has_eos = true;
} }
++n_predict; ++n_predict;
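A condensed sketch (not part of the patch) of the draft/target vocab compatibility check above; the vocab-size and per-token text comparison is omitted, and the helper name is illustrative:

```cpp
// Sketch: check that draft and target vocabs are compatible for speculation (illustrative only).
#include "llama.h"

bool spec_vocabs_compatible(const llama_model * model_tgt, const llama_model * model_dft) {
    const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
    const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);

    if (llama_vocab_type(vocab_tgt) != llama_vocab_type(vocab_dft)) {
        return false; // different tokenizer families
    }

    // the special-token handling of both vocabs must agree
    return llama_vocab_get_add_bos(vocab_tgt) == llama_vocab_get_add_bos(vocab_dft) &&
           llama_vocab_get_add_eos(vocab_tgt) == llama_vocab_get_add_eos(vocab_dft) &&
           llama_vocab_bos(vocab_tgt)         == llama_vocab_bos(vocab_dft)         &&
           llama_vocab_eos(vocab_tgt)         == llama_vocab_eos(vocab_dft);
}
```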

View file

@ -344,8 +344,10 @@ int main(int raw_argc, char ** raw_argv) {
return 1; return 1;
} }
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_context_params ctx_params = llama_context_default_params(); llama_context_params ctx_params = llama_context_default_params();
llama_context * ctx = llama_new_context_with_model(model, ctx_params); llama_context * ctx = llama_init_from_model(model, ctx_params);
if (!ctx) { if (!ctx) {
fprintf(stderr, "Error: could not create context.\n"); fprintf(stderr, "Error: could not create context.\n");
return 1; return 1;
@ -365,7 +367,7 @@ int main(int raw_argc, char ** raw_argv) {
prompt = stdin_buffer.str(); prompt = stdin_buffer.str();
} }
const bool model_wants_add_bos = llama_add_bos_token(model); const bool model_wants_add_bos = llama_vocab_get_add_bos(vocab);
const bool add_bos = model_wants_add_bos && !no_bos; const bool add_bos = model_wants_add_bos && !no_bos;
const bool parse_special = !no_parse_special; const bool parse_special = !no_parse_special;
const bool escape = !no_escape; const bool escape = !no_escape;
@ -375,7 +377,7 @@ int main(int raw_argc, char ** raw_argv) {
} }
std::vector<llama_token> tokens; std::vector<llama_token> tokens;
tokens = common_tokenize(model, prompt, add_bos, parse_special); tokens = common_tokenize(vocab, prompt, add_bos, parse_special);
if (printing_ids) { if (printing_ids) {
printf("["); printf("[");

117
examples/tts/README.md Normal file
View file

@ -0,0 +1,117 @@
# llama.cpp/example/tts
This example demonstrates the Text To Speech feature. It uses a
[model](https://www.outeai.com/blog/outetts-0.2-500m) from
[outeai](https://www.outeai.com/).
## Quickstart
If you have built llama.cpp with `-DLLAMA_CURL=ON` you can simply run the
following command and the required models will be downloaded automatically:
```console
$ build/bin/llama-tts --tts-oute-default -p "Hello world" && aplay output.wav
```
For details about the models and how to convert them to the required format,
see the following sections.
### Model conversion
Check out or download the model that contains the LLM:
```console
$ pushd models
$ git clone --branch main --single-branch --depth 1 https://huggingface.co/OuteAI/OuteTTS-0.2-500M
$ cd OuteTTS-0.2-500M && git lfs install && git lfs pull
$ popd
```
Convert the model to .gguf format:
```console
(venv) python convert_hf_to_gguf.py models/OuteTTS-0.2-500M \
--outfile models/outetts-0.2-0.5B-f16.gguf --outtype f16
```
The generated model will be `models/outetts-0.2-0.5B-f16.gguf`.
We can optionally quantize this to Q8_0 using the following command:
```console
$ build/bin/llama-quantize models/outetts-0.2-0.5B-f16.gguf \
models/outetts-0.2-0.5B-q8_0.gguf q8_0
```
The quantized model will be `models/outetts-0.2-0.5B-q8_0.gguf`.
Next we do something similar for the audio decoder. First download or check out
the voice decoder model:
```console
$ pushd models
$ git clone --branch main --single-branch --depth 1 https://huggingface.co/novateur/WavTokenizer-large-speech-75token
$ cd WavTokenizer-large-speech-75token && git lfs install && git lfs pull
$ popd
```
This model file is a PyTorch checkpoint (.ckpt) and we first need to convert it to
the Hugging Face format:
```console
(venv) python examples/tts/convert_pt_to_hf.py \
models/WavTokenizer-large-speech-75token/wavtokenizer_large_speech_320_24k.ckpt
...
Model has been successfully converted and saved to models/WavTokenizer-large-speech-75token/model.safetensors
Metadata has been saved to models/WavTokenizer-large-speech-75token/index.json
Config has been saved to models/WavTokenizer-large-speech-75tokenconfig.json
```
Then we can convert the Hugging Face format to GGUF:
```console
(venv) python convert_hf_to_gguf.py models/WavTokenizer-large-speech-75token \
--outfile models/wavtokenizer-large-75-f16.gguf --outtype f16
...
INFO:hf-to-gguf:Model successfully exported to models/wavtokenizer-large-75-f16.gguf
```
### Running the example
With both of the models generated, the LLM model and the voice decoder model,
we can run the example:
```console
$ build/bin/llama-tts -m ./models/outetts-0.2-0.5B-q8_0.gguf \
-mv ./models/wavtokenizer-large-75-f16.gguf \
-p "Hello world"
...
main: audio written to file 'output.wav'
```
The output.wav file will contain the generated audio for the prompt. It can be
played with any media player. On Linux the following command will play the audio:
```console
$ aplay output.wav
```
### Running the example with llama-server
Running this example with `llama-server` is also possible and requires two
server instances to be started. One will serve the LLM model and the other
will serve the voice decoder model.
The LLM model server can be started with the following command:
```console
$ ./build/bin/llama-server -m ./models/outetts-0.2-0.5B-q8_0.gguf --port 8020
```
And the voice decoder model server can be started using:
```console
$ ./build/bin/llama-server -m ./models/wavtokenizer-large-75-f16.gguf --port 8021 --embeddings --pooling none
```
Then we can run [tts-outetts.py](tts-outetts.py) to generate the audio.
First create a Python virtual environment and install the required
dependencies (this only needs to be done once):
```console
$ python3 -m venv venv
$ source venv/bin/activate
(venv) pip install requests numpy
```
And then run the python script using:
```console
(venv) python ./examples/tts/tts-outetts.py http://localhost:8020 http://localhost:8021 "Hello world"
spectrogram generated: n_codes: 90, n_embd: 1282
converting to audio ...
audio generated: 28800 samples
audio written to file "output.wav"
```
And to play the audio we can again use aplay or any other media player:
```console
$ aplay output.wav
```

View file

@ -3,6 +3,121 @@ import sys
#import struct #import struct
import requests import requests
import re import re
import struct
import numpy as np
from concurrent.futures import ThreadPoolExecutor
def fill_hann_window(size, periodic=True):
if periodic:
return np.hanning(size + 1)[:-1]
return np.hanning(size)
def irfft(n_fft, complex_input):
return np.fft.irfft(complex_input, n=n_fft)
def fold(buffer, n_out, n_win, n_hop, n_pad):
result = np.zeros(n_out)
n_frames = len(buffer) // n_win
for i in range(n_frames):
start = i * n_hop
end = start + n_win
result[start:end] += buffer[i * n_win:(i + 1) * n_win]
return result[n_pad:-n_pad] if n_pad > 0 else result
def process_frame(args):
l, n_fft, ST, hann = args
frame = irfft(n_fft, ST[l])
frame = frame * hann
hann2 = hann * hann
return frame, hann2
def embd_to_audio(embd, n_codes, n_embd, n_thread=4):
embd = np.asarray(embd, dtype=np.float32).reshape(n_codes, n_embd)
n_fft = 1280
n_hop = 320
n_win = 1280
n_pad = (n_win - n_hop) // 2
n_out = (n_codes - 1) * n_hop + n_win
hann = fill_hann_window(n_fft, True)
E = np.zeros((n_embd, n_codes), dtype=np.float32)
for l in range(n_codes):
for k in range(n_embd):
E[k, l] = embd[l, k]
half_embd = n_embd // 2
S = np.zeros((n_codes, half_embd + 1), dtype=np.complex64)
for k in range(half_embd):
for l in range(n_codes):
mag = E[k, l]
phi = E[k + half_embd, l]
mag = np.clip(np.exp(mag), 0, 1e2)
S[l, k] = mag * np.exp(1j * phi)
res = np.zeros(n_codes * n_fft)
hann2_buffer = np.zeros(n_codes * n_fft)
with ThreadPoolExecutor(max_workers=n_thread) as executor:
args = [(l, n_fft, S, hann) for l in range(n_codes)]
results = list(executor.map(process_frame, args))
for l, (frame, hann2) in enumerate(results):
res[l*n_fft:(l+1)*n_fft] = frame
hann2_buffer[l*n_fft:(l+1)*n_fft] = hann2
audio = fold(res, n_out, n_win, n_hop, n_pad)
env = fold(hann2_buffer, n_out, n_win, n_hop, n_pad)
mask = env > 1e-10
audio[mask] /= env[mask]
return audio
def save_wav(filename, audio_data, sample_rate):
num_channels = 1
bits_per_sample = 16
bytes_per_sample = bits_per_sample // 8
data_size = len(audio_data) * bytes_per_sample
byte_rate = sample_rate * num_channels * bytes_per_sample
block_align = num_channels * bytes_per_sample
chunk_size = 36 + data_size # 36 = size of header minus first 8 bytes
header = struct.pack(
'<4sI4s4sIHHIIHH4sI',
b'RIFF',
chunk_size,
b'WAVE',
b'fmt ',
16, # fmt chunk size
1, # audio format (PCM)
num_channels,
sample_rate,
byte_rate,
block_align,
bits_per_sample,
b'data',
data_size
)
audio_data = np.clip(audio_data * 32767, -32768, 32767)
pcm_data = audio_data.astype(np.int16)
with open(filename, 'wb') as f:
f.write(header)
f.write(pcm_data.tobytes())
def process_text(text: str): def process_text(text: str):
text = re.sub(r'\d+(\.\d+)?', lambda x: x.group(), text.lower()) # TODO this needs to be fixed text = re.sub(r'\d+(\.\d+)?', lambda x: x.group(), text.lower()) # TODO this needs to be fixed
@ -170,6 +285,15 @@ n_embd = len(embd[0])
print('spectrogram generated: n_codes: %d, n_embd: %d' % (n_codes, n_embd)) print('spectrogram generated: n_codes: %d, n_embd: %d' % (n_codes, n_embd))
# post-process the spectrogram to convert to audio # post-process the spectrogram to convert to audio
# TODO: see the tts.cpp:embd_to_audio() and implement it in Python
print('converting to audio ...') print('converting to audio ...')
print('TODO: see the tts.cpp:embd_to_audio() and implement it in Python') audio = embd_to_audio(embd, n_codes, n_embd)
print('audio generated: %d samples' % len(audio))
filename = "output.wav"
sample_rate = 24000 # sampling rate
# zero out first 0.25 seconds
audio[:24000 // 4] = 0.0
save_wav(filename, audio, sample_rate)
print('audio written to file "%s"' % filename)

View file

@ -414,15 +414,15 @@ static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) {
prompt.insert(prompt.end(), tokens.begin(), tokens.end()); prompt.insert(prompt.end(), tokens.begin(), tokens.end());
} }
static void prompt_add(llama_tokens & prompt, const llama_model * model, const std::string & txt, bool add_special, bool parse_special) { static void prompt_add(llama_tokens & prompt, const llama_vocab * vocab, const std::string & txt, bool add_special, bool parse_special) {
auto tmp = common_tokenize(model, txt, add_special, parse_special); auto tmp = common_tokenize(vocab, txt, add_special, parse_special);
prompt_add(prompt, tmp); prompt_add(prompt, tmp);
} }
static void prompt_init(llama_tokens & prompt, const llama_model * model) { static void prompt_init(llama_tokens & prompt, const llama_vocab * vocab) {
prompt.clear(); prompt.clear();
prompt_add(prompt, model, "<|im_start|>\n", true, true); prompt_add(prompt, vocab, "<|im_start|>\n", true, true);
} }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
@ -462,6 +462,8 @@ int main(int argc, char ** argv) {
model_ttc = llama_init_ttc.model.get(); model_ttc = llama_init_ttc.model.get();
ctx_ttc = llama_init_ttc.context.get(); ctx_ttc = llama_init_ttc.context.get();
const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
// TODO: refactor in a common struct // TODO: refactor in a common struct
params.model = params.vocoder.model; params.model = params.vocoder.model;
params.model_url = params.vocoder.model_url; params.model_url = params.vocoder.model_url;
@ -499,9 +501,9 @@ int main(int argc, char ** argv) {
std::vector<llama_token> prompt_inp; std::vector<llama_token> prompt_inp;
prompt_init(prompt_inp, model_ttc); prompt_init(prompt_inp, vocab);
prompt_add(prompt_inp, model_ttc, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true); prompt_add(prompt_inp, vocab, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true);
// convert the input text into the necessary format expected by OuteTTS // convert the input text into the necessary format expected by OuteTTS
{ {
@ -509,10 +511,10 @@ int main(int argc, char ** argv) {
LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str()); LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());
prompt_add(prompt_inp, model_ttc, prompt_clean, false, true); prompt_add(prompt_inp, vocab, prompt_clean, false, true);
} }
prompt_add(prompt_inp, model_ttc, "<|text_end|>\n", false, true); prompt_add(prompt_inp, vocab, "<|text_end|>\n", false, true);
// disabled to save time on tokenizing each time // disabled to save time on tokenizing each time
// TODO: load voices from the json files // TODO: load voices from the json files
@ -549,7 +551,7 @@ it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><
looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|> looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)"; lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)";
auto tmp = common_tokenize(model_ttc, voice_data, false, true); auto tmp = common_tokenize(vocab, voice_data, false, true);
printf("\n\n"); printf("\n\n");
for (int i = 0; i < tmp.size(); ++i) { for (int i = 0; i < tmp.size(); ++i) {
printf("%d, ", tmp[i]); printf("%d, ", tmp[i]);
@ -735,9 +737,9 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
const auto * cands = common_sampler_get_candidates(smpl[i]); const auto * cands = common_sampler_get_candidates(smpl[i]);
// is it an end of generation? -> mark the stream as finished // is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model_ttc, new_token_id) || n_decode == n_predict) { if (llama_vocab_is_eog(vocab, new_token_id) || n_decode == n_predict) {
std::string reason; std::string reason;
if (llama_token_is_eog(model_ttc, new_token_id)) { if (llama_vocab_is_eog(vocab, new_token_id)) {
reason = "eos"; reason = "eos";
} else { } else {
reason = "n_predict"; reason = "n_predict";
@ -873,7 +875,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
#if 1 #if 1
// spectral operations // spectral operations
const int n_embd = llama_n_embd(model_cts); const int n_embd = llama_model_n_embd(model_cts);
const float * embd = llama_get_embeddings(ctx_cts); const float * embd = llama_get_embeddings(ctx_cts);
auto audio = embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads); auto audio = embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads);

View file

@ -185,6 +185,9 @@ option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increas
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON) option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON) option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
# toolchain for vulkan-shaders-gen
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
# extra artifacts # extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE}) option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE}) option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})

View file

@ -203,6 +203,8 @@ extern "C" {
// Backend registry // Backend registry
// //
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
// Backend (reg) enumeration // Backend (reg) enumeration
GGML_API size_t ggml_backend_reg_count(void); GGML_API size_t ggml_backend_reg_count(void);
GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index); GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);

View file

@ -501,6 +501,7 @@ extern "C" {
GGML_OP_GET_REL_POS, GGML_OP_GET_REL_POS,
GGML_OP_ADD_REL_POS, GGML_OP_ADD_REL_POS,
GGML_OP_RWKV_WKV6, GGML_OP_RWKV_WKV6,
GGML_OP_GATED_LINEAR_ATTN,
GGML_OP_UNARY, GGML_OP_UNARY,
@ -1383,16 +1384,20 @@ extern "C" {
float scale, float scale,
float max_bias); float max_bias);
GGML_API struct ggml_tensor * ggml_soft_max_back( GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b,
float scale,
float max_bias);
// in-place, returns view(a) // in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_soft_max_back_inplace( GGML_API struct ggml_tensor * ggml_soft_max_ext_back_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b,
float scale,
float max_bias);
// rotary position embedding // rotary position embedding
// if (mode & 1) - skip n_past elements (NOT SUPPORTED) // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
@ -1499,7 +1504,7 @@ extern "C" {
// rotary position embedding backward, i.e compute dx from dy // rotary position embedding backward, i.e compute dx from dy
// a - dy // a - dy
GGML_API struct ggml_tensor * ggml_rope_back( GGML_API struct ggml_tensor * ggml_rope_ext_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // gradients of ggml_rope result struct ggml_tensor * a, // gradients of ggml_rope result
struct ggml_tensor * b, // positions struct ggml_tensor * b, // positions
@ -1514,6 +1519,23 @@ extern "C" {
float beta_fast, float beta_fast,
float beta_slow); float beta_slow);
GGML_API struct ggml_tensor * ggml_rope_multi_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[4],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow);
// clamp // clamp
// in-place, returns view(a) // in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_clamp( GGML_API struct ggml_tensor * ggml_clamp(
@ -1859,6 +1881,15 @@ extern "C" {
struct ggml_tensor * td, struct ggml_tensor * td,
struct ggml_tensor * state); struct ggml_tensor * state);
GGML_API struct ggml_tensor * ggml_gated_linear_attn(
struct ggml_context * ctx,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * q,
struct ggml_tensor * g,
struct ggml_tensor * state,
float scale);
// custom operators // custom operators
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);

View file

@ -37,6 +37,7 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
return true; return true;
} }
// ops that return true for this function must not use restrict pointers for their backend implementations
static bool ggml_op_can_inplace(enum ggml_op op) { static bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) { switch (op) {
case GGML_OP_SCALE: case GGML_OP_SCALE:
@ -52,8 +53,12 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
case GGML_OP_LOG: case GGML_OP_LOG:
case GGML_OP_UNARY: case GGML_OP_UNARY:
case GGML_OP_ROPE: case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
case GGML_OP_SILU_BACK:
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK:
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
case GGML_OP_SOFT_MAX_BACK:
return true; return true;
default: default:

View file

@ -208,7 +208,6 @@ extern "C" {
// Internal backend registry API // Internal backend registry API
GGML_API void ggml_backend_register(ggml_backend_reg_t reg); GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
// Add backend dynamic loading support to the backend // Add backend dynamic loading support to the backend

View file

@ -5573,7 +5573,88 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
uint32_t utmp[4]; uint32_t utmp[4];
#ifdef __ARM_NEON #ifdef __ARM_FEATURE_SVE
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
memcpy(utmp, x[i].scales, K_SCALE_SIZE);
uint32x2_t mins8 = { 0 };
mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[0] &= kmask1;
const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8)));
const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
sumf -= dmin * vaddvq_s32(prod);
const uint8_t * scales = (const uint8_t *)utmp;
const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
const int vector_length = ggml_cpu_get_sve_cnt()*8;
const svuint8_t m4b = svdup_n_u8(0xf);
const svint32_t mzero = svdup_n_s32(0);
svint32_t sumi1 = svdup_n_s32(0);
svint32_t sumi1_1 = svdup_n_s32(0);
svint32_t sumi1_2 = svdup_n_s32(0);
svint32_t sumi2 = svdup_n_s32(0);
svint32_t sumi2_1 = svdup_n_s32(0);
svint32_t sumi2_2 = svdup_n_s32(0);
switch (vector_length) {
case 128:
{
for (int j = 0; j < QK_K/64; ++j) {
svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), m4b));
svint8_t q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
sumi1_1 = svmla_n_s32_x(svptrue_b32(), sumi1_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), m4b));
q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
sumi1_2 = svmla_n_s32_x(svptrue_b32(), sumi1_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), 4));
q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
sumi2_1 = svmla_n_s32_x(svptrue_b32(), sumi2_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), 4));
q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
sumi2_2 = svmla_n_s32_x(svptrue_b32(), sumi2_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
q4 += 32;
}
sumi1 = svadd_s32_x(svptrue_b32(), sumi1_1, sumi1_2);
sumi2 = svadd_s32_x(svptrue_b32(), sumi2_1, sumi2_2);
sumf += d * (svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sumi1, sumi2)));
} break;
case 256:
case 512:
{
for (int j = 0; j < QK_K/64; ++j) {
const svuint8_t q4bits = svld1_u8(svptrue_pat_b8(SV_VL32), q4); q4 += 32;
svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_pat_b8(SV_VL32), q4bits, m4b));
svint8_t q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32;
sumi1 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q4bits, 4));
q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32;
sumi2 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
}
sumf += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), sumi1, sumi2)));
} break;
default:
assert(false && "Unsupported vector length");
break;
}
}
*s = sumf;
#elif __ARM_NEON
const uint8x16_t m4b = vdupq_n_u8(0xf); const uint8x16_t m4b = vdupq_n_u8(0xf);
const int32x4_t mzero = vdupq_n_s32(0); const int32x4_t mzero = vdupq_n_s32(0);
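The new `__ARM_FEATURE_SVE` path first folds the per-super-block minimums into the accumulator (`sumf -= dmin * Σ(bsums·mins)`, the same trick as the NEON path), then dispatches on the run-time register width returned by `ggml_cpu_get_sve_cnt()*8`: 128-bit registers process 16-byte chunks with full predicates, while 256- and 512-bit registers share one path that uses a fixed 32-lane predicate (`SV_VL32`), so only the first 32 bytes of a wider register are active. A condensed sketch of the dispatch shape, with the per-block arithmetic elided:

```cpp
// Run-time dispatch on the SVE vector length, as in the new ggml_vec_dot_q4_K_q8_K path.
const int vector_length = ggml_cpu_get_sve_cnt()*8;  // register width in bits
switch (vector_length) {
    case 128:  // svptrue_b8(): 16 int8 lanes per operation, two accumulator pairs
        /* ... 16-byte loads of q4/q8, svdot scaled by scales[2*j+0] / scales[2*j+1] ... */
        break;
    case 256:
    case 512:  // svptrue_pat_b8(SV_VL32): a fixed 32 active lanes regardless of register width
        /* ... 32-byte loads of q4/q8, single accumulator pair ... */
        break;
    default:
        assert(false && "Unsupported vector length");
        break;
}
```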
View file
@ -3967,6 +3967,57 @@ static void ggml_compute_forward_dup_bytes(
} }
} }
static void ggml_compute_forward_dup_q(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_TENSOR_BINARY_OP_LOCALS
const enum ggml_type type = src0->type;
ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
size_t qk = ggml_blck_size(type);
const int64_t nr = ggml_nelements(src1) / qk;
// destination must be contiguous in the first dimension
GGML_ASSERT(nb10 == ggml_type_size(dst->type));
// must either have first dimension large enough to hold a row, or fully contiguous
GGML_ASSERT((ne10 % qk) == 0 || ggml_is_contiguous(dst));
const int ith = params->ith;
const int nth = params->nth;
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int64_t ir = ir0; ir < ir1; ++ir) {
uint32_t i = ir * qk;
const int64_t i03 = i/(ne00 * ne01 * ne02);
const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
const int64_t i13 = i/(ne10 * ne11 * ne12);
const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
dequantize_row_q(
(const void *) ((char *) src0->data + x_offset),
(float *) ((char *) dst->data + dst_offset), qk);
}
}
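`ggml_compute_forward_dup_q` copies a quantized `src0` into an F32 `dst` one quant block (`qk` values) at a time: the blocks are split across threads, each flat element index is unravelled into 4-D coordinates for both tensors, and the type's `to_float` trait dequantizes a single block into place. The index unravelling is the only subtle part; here is the same computation as a stand-alone helper (hypothetical, not part of the diff):

```cpp
// Unravel a flat element index i into 4-D coordinates c[0..3] for a tensor with
// ne[0..3] elements per dimension: the arithmetic the new dup_q path does inline
// for both src0 (i00..i03) and dst (i10..i13).
static void unravel_index_4d(int64_t i, const int64_t ne[4], int64_t c[4]) {
    c[3] = i / (ne[0]*ne[1]*ne[2]);
    i   -= c[3] * ne[0]*ne[1]*ne[2];
    c[2] = i / (ne[0]*ne[1]);
    i   -= c[2] * ne[0]*ne[1];
    c[1] = i / ne[0];
    c[0] = i - c[1]*ne[0];
}
```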
static void ggml_compute_forward_dup( static void ggml_compute_forward_dup(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -3993,6 +4044,10 @@ static void ggml_compute_forward_dup(
} break; } break;
default: default:
{ {
if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) {
ggml_compute_forward_dup_q(params, dst);
break;
}
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
} }
@ -6691,20 +6746,20 @@ static void ggml_compute_forward_silu_back_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * grad = dst->src[0];
const struct ggml_tensor * grad = dst->src[1]; const struct ggml_tensor * src1 = dst->src[1];
assert(ggml_is_contiguous_1(grad)); assert(ggml_is_contiguous_1(grad));
assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(src1));
assert(ggml_is_contiguous_1(dst)); assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src0, dst)); assert(ggml_are_same_shape(src1, dst));
assert(ggml_are_same_shape(src0, grad)); assert(ggml_are_same_shape(src1, grad));
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
const int nc = src0->ne[0]; const int nc = src1->ne[0];
const int nr = ggml_nrows(src0); const int nr = ggml_nrows(src1);
// rows per thread // rows per thread
const int dr = (nr + nth - 1)/nth; const int dr = (nr + nth - 1)/nth;
@ -6716,7 +6771,7 @@ static void ggml_compute_forward_silu_back_f32(
for (int i1 = ir0; i1 < ir1; i1++) { for (int i1 = ir0; i1 < ir1; i1++) {
ggml_vec_silu_backward_f32(nc, ggml_vec_silu_backward_f32(nc,
(float *) ((char *) dst->data + i1*( dst->nb[1])), (float *) ((char *) dst->data + i1*( dst->nb[1])),
(float *) ((char *) src0->data + i1*(src0->nb[1])), (float *) ((char *) src1->data + i1*(src1->nb[1])),
(float *) ((char *) grad->data + i1*(grad->nb[1]))); (float *) ((char *) grad->data + i1*(grad->nb[1])));
#ifndef NDEBUG #ifndef NDEBUG
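The reordering makes `src[0]` the incoming gradient and `src[1]` the forward-pass input, consistent with the other `*_back` ops touched in this change. For reference, the per-element factor that `ggml_vec_silu_backward_f32` applies is the SiLU derivative $\frac{d}{dx}\,x\,\sigma(x) = \sigma(x)\,(1 + x\,(1 - \sigma(x)))$, multiplied by the incoming gradient. A scalar sketch (illustrative, not the ggml implementation):

```cpp
#include <math.h>

// dx[i] = dy[i] * silu'(x[i]), with silu(x) = x * sigmoid(x)
static void silu_backward_ref(int n, float * dx, const float * x, const float * dy) {
    for (int i = 0; i < n; ++i) {
        const float s = 1.0f/(1.0f + expf(-x[i]));      // sigmoid(x)
        dx[i] = dy[i] * s * (1.0f + x[i]*(1.0f - s));   // silu'(x)
    }
}
```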
@ -6895,7 +6950,7 @@ static void ggml_compute_forward_norm_f32(
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps > 0.0f); GGML_ASSERT(eps >= 0.0f);
// TODO: optimize // TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i03 = 0; i03 < ne03; i03++) {
@ -6966,7 +7021,7 @@ static void ggml_compute_forward_rms_norm_f32(
float eps; float eps;
memcpy(&eps, dst->op_params, sizeof(float)); memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps > 0.0f); GGML_ASSERT(eps >= 0.0f);
// TODO: optimize // TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i03 = 0; i03 < ne03; i03++) {
@ -7018,12 +7073,13 @@ static void ggml_compute_forward_rms_norm_back_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src0 = dst->src[0]; // gradients from forward pass output
const struct ggml_tensor * src1 = dst->src[1]; const struct ggml_tensor * src1 = dst->src[1]; // src1 from forward pass
GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT(src1->nb[0] == sizeof(float));
const int ith = params->ith; const int ith = params->ith;
const int nth = params->nth; const int nth = params->nth;
@ -7042,8 +7098,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
const int64_t i12 = i02; const int64_t i12 = i02;
const int64_t i13 = i03; const int64_t i13 = i03;
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); const float * dz = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13); const float * x = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
ggml_float sum_xx = 0.0; ggml_float sum_xx = 0.0;
ggml_float sum_xdz = 0.0; ggml_float sum_xdz = 0.0;
@ -7066,9 +7122,9 @@ static void ggml_compute_forward_rms_norm_back_f32(
{ {
// z = rms_norm(x) // z = rms_norm(x)
// //
// rms_norm(src0) = // rms_norm(src1) =
// scale( // scale(
// src0, // src1,
// div( // div(
// 1, // 1,
// sqrt( // sqrt(
@ -7076,13 +7132,13 @@ static void ggml_compute_forward_rms_norm_back_f32(
// scale( // scale(
// sum( // sum(
// sqr( // sqr(
// src0)), // src1)),
// (1.0/N)), // (1.0/N)),
// eps)))); // eps))));
// postorder: // postorder:
// ## op args grad // ## op args grad
// 00 param src0 grad[#00] // 00 param src1 grad[#00]
// 01 const 1 // 01 const 1
// 02 sqr (#00) grad[#02] // 02 sqr (#00) grad[#02]
// 03 sum (#02) grad[#03] // 03 sum (#02) grad[#03]
@ -7159,6 +7215,7 @@ static void ggml_compute_forward_rms_norm_back_f32(
// dx := scale(dx, rrms) // dx := scale(dx, rrms)
float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
// dx[i00] = (x*(-sum_xdz/sum_eps) + dz) / sqrtf(mean_eps)
ggml_vec_cpy_f32 (ne00, dx, x); ggml_vec_cpy_f32 (ne00, dx, x);
// ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps); // ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps);
ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps); ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps);
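The new inline comment compresses the whole backward pass into one line. Written out, with $N$ = `ne00`, $\mathrm{sum\_xx} = \sum_i x_i^2$, $\mathrm{sum\_xdz} = \sum_i x_i\,dz_i$, and assuming the usual definitions $\mathrm{sum\_eps} = \mathrm{sum\_xx} + N\varepsilon$ and $\mathrm{mean\_eps} = \mathrm{sum\_xx}/N + \varepsilon$ used earlier in this function, the gradient produced is

$$dx_i = \frac{dz_i - x_i\,\dfrac{\mathrm{sum\_xdz}}{\mathrm{sum\_eps}}}{\sqrt{\mathrm{mean\_eps}}},$$

which is the standard RMS-norm backward $J^\top dz$ for $y = x / \sqrt{\operatorname{mean}(x^2) + \varepsilon}$.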
@ -7752,10 +7809,11 @@ static void ggml_compute_forward_out_prod_f32(
GGML_ASSERT(ne0 == ne00); GGML_ASSERT(ne0 == ne00);
GGML_ASSERT(ne1 == ne10); GGML_ASSERT(ne1 == ne10);
GGML_ASSERT(ne2 == ne02); GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne02 == ne12);
GGML_ASSERT(ne3 == ne13); GGML_ASSERT(ne3 == ne13);
GGML_ASSERT(ne03 == ne13);
GGML_ASSERT(ne2 % ne02 == 0);
GGML_ASSERT(ne3 % ne03 == 0);
// we don't support permuted src0 or src1 // we don't support permuted src0 or src1
GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float));
@ -7797,6 +7855,10 @@ static void ggml_compute_forward_out_prod_f32(
const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32); const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32);
const int64_t blck_1 = 16; const int64_t blck_1 = 16;
// dps == dst per src0, used for group query attention
const int64_t dps2 = ne2 / ne02;
const int64_t dps3 = ne3 / ne03;
for (int64_t bir = ir0; bir < ir1; bir += blck_1) { for (int64_t bir = ir0; bir < ir1; bir += blck_1) {
const int64_t bir1 = MIN(bir + blck_1, ir1); const int64_t bir1 = MIN(bir + blck_1, ir1);
for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) { for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) {
@ -7807,8 +7869,8 @@ static void ggml_compute_forward_out_prod_f32(
const int64_t i2 = (ir - i3*ne2*ne1)/ne1; const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1); const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
const int64_t i02 = i2; const int64_t i02 = i2 / dps2;
const int64_t i03 = i3; const int64_t i03 = i3 / dps3;
//const int64_t i10 = i1; //const int64_t i10 = i1;
const int64_t i12 = i2; const int64_t i12 = i2;
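Relaxing `ne2 == ne02` (and `ne3 == ne03`) to divisibility lets `ggml_out_prod` broadcast `src0` across groups of `dst` matrices, which the backward pass of grouped-query attention needs: several query heads share one K/V head. The mapping is plain integer division by the group size; for example with `ne02 = 8` and `ne2 = 32`, `dps2 = 4` and dst matrices `i2 = 0..3` all read src0 matrix `i02 = 0`:

```cpp
// dst index -> broadcast src0 index; dps2 = "dst per src0" along dimension 2
const int64_t dps2 = ne2 / ne02;  // e.g. 32 / 8 = 4
const int64_t i02  = i2 / dps2;   // i2 = 0..3 -> 0, i2 = 4..7 -> 1, ...
```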
@ -8906,9 +8968,9 @@ static void ggml_compute_forward_soft_max(
} }
// ggml_compute_forward_soft_max_back // ggml_compute_forward_soft_max_ext_back
static void ggml_compute_forward_soft_max_back_f32( static void ggml_compute_forward_soft_max_ext_back_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -8921,6 +8983,14 @@ static void ggml_compute_forward_soft_max_back_f32(
GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src1, dst)); GGML_ASSERT(ggml_are_same_shape(src1, dst));
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
GGML_ASSERT(max_bias == 0.0f);
// TODO: handle transposed/permuted matrices // TODO: handle transposed/permuted matrices
const int ith = params->ith; const int ith = params->ith;
@ -8971,8 +9041,9 @@ static void ggml_compute_forward_soft_max_back_f32(
float dot_y_dy = 0; float dot_y_dy = 0;
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1); ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
ggml_vec_cpy_f32 (nc, dx, dy); ggml_vec_cpy_f32 (nc, dx, dy);
ggml_vec_acc1_f32(nc, dx, -dot_y_dy); ggml_vec_acc1_f32 (nc, dx, -dot_y_dy);
ggml_vec_mul_f32 (nc, dx, dx, y); ggml_vec_mul_f32 (nc, dx, dx, y);
ggml_vec_scale_f32(nc, dx, scale);
#ifndef NDEBUG #ifndef NDEBUG
for (int i = 0; i < nc; ++i) { for (int i = 0; i < nc; ++i) {
@ -8983,7 +9054,7 @@ static void ggml_compute_forward_soft_max_back_f32(
} }
} }
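With the rename to `*_ext_back`, the kernel also consumes the forward op's `scale` parameter and asserts that `max_bias` (ALiBi) is zero, since that case is not handled. What the `cpy`/`acc1`/`mul`/`scale` sequence computes is the usual softmax Jacobian-vector product times the forward scale: for $y = \operatorname{softmax}(s\,x)$ and incoming gradient $dy$,

$$dx = s\,\bigl(y \odot dy - (y^{\top} dy)\,y\bigr) = s\,y \odot \bigl(dy - y^{\top} dy\bigr).$$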
static void ggml_compute_forward_soft_max_back( static void ggml_compute_forward_soft_max_ext_back(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
@ -8992,7 +9063,7 @@ static void ggml_compute_forward_soft_max_back(
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ {
ggml_compute_forward_soft_max_back_f32(params, dst); ggml_compute_forward_soft_max_ext_back_f32(params, dst);
} break; } break;
default: default:
{ {
@ -9985,9 +10056,10 @@ static void ggml_compute_forward_im2col_back_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output
const struct ggml_tensor * src1 = dst->src[1]; const struct ggml_tensor * src1 = dst->src[1]; // convolution kernel
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
@ -10009,11 +10081,11 @@ static void ggml_compute_forward_im2col_back_f32(
const int64_t IH = is_2D ? ne1 : 1; const int64_t IH = is_2D ? ne1 : 1;
const int64_t IW = ne0; const int64_t IW = ne0;
const int64_t KH = is_2D ? ne01 : 1; const int64_t KH = is_2D ? ne11 : 1;
const int64_t KW = ne00; const int64_t KW = ne10;
const int64_t OH = is_2D ? ne12 : 1; const int64_t OH = is_2D ? ne02 : 1;
const int64_t OW = ne11; const int64_t OW = ne01;
int ofs0 = is_2D ? nb3 : nb2; int ofs0 = is_2D ? nb3 : nb2;
int ofs1 = is_2D ? nb2 : nb1; int ofs1 = is_2D ? nb2 : nb1;
@ -10059,9 +10131,9 @@ static void ggml_compute_forward_im2col_back_f32(
continue; continue;
} }
const float * const src_data = (const float *) src1->data const float * const grad_in = (const float *) src0->data
+ (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
grad += src_data[iic*(KH*KW) + ikh*KW + ikw]; grad += grad_in[iic*(KH*KW) + ikh*KW + ikw];
} }
} }
float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW] float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW]
@ -11803,9 +11875,9 @@ static void ggml_compute_forward_add_rel_pos(
static void ggml_compute_forward_rwkv_wkv6_f32( static void ggml_compute_forward_rwkv_wkv6_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
const int64_t T = dst->src[1]->ne[3]; const int64_t T = dst->src[1]->ne[2];
const int64_t C = dst->ne[0]; const int64_t C = dst->ne[0];
const int64_t HEADS = dst->src[1]->ne[2]; const int64_t HEADS = dst->src[1]->ne[1];
const int64_t n_seqs = dst->src[5]->ne[1]; const int64_t n_seqs = dst->src[5]->ne[1];
const int64_t head_size = C / HEADS; const int64_t head_size = C / HEADS;
@ -12000,6 +12072,197 @@ static void ggml_compute_forward_rwkv_wkv6(
} }
} }
// ggml_compute_forward_gla
static void ggml_compute_forward_gla_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const int64_t T = dst->src[1]->ne[2];
const int64_t C = dst->ne[0];
const int64_t HEADS = dst->src[1]->ne[1];
const int64_t n_seqs = dst->src[4]->ne[1];
const int64_t head_size = C / HEADS;
const float scale = ggml_get_op_params_f32(dst, 0);
float * dst_data = (float *) dst->data;
float * state = ((float *) dst->data) + C * T;
const int ith = params->ith;
const int nth = params->nth;
if (ith >= HEADS) {
return;
}
const int h_start = (HEADS * ith) / nth;
const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ?
(HEADS * (ith + 1)) / nth : HEADS;
float * k = (float *) dst->src[0]->data;
float * v = (float *) dst->src[1]->data;
float * q = (float *) dst->src[2]->data;
float * g = (float *) dst->src[3]->data;
size_t t_stride = HEADS * head_size; // same as C
size_t h_stride = C / HEADS;
GGML_ASSERT(C % HEADS == 0); // C must be divisible by HEADS
size_t h_stride_2d = head_size * head_size;
if (ith == 0) {
memset(dst_data, 0, T * C * sizeof(float));
}
ggml_barrier(params->threadpool);
#if defined(__AVX__) && !defined(__AVX512F__)
#define GGML_F32X GGML_F32x8
#define GGML_F32X_SET1 GGML_F32x8_SET1
#define GGML_F32X_LOAD GGML_F32x8_LOAD
#define GGML_F32X_STORE GGML_F32x8_STORE
#define GGML_F32X_MUL GGML_F32x8_MUL
#define GGML_F32X_FMA GGML_F32x8_FMA
#define GLA_VECTOR_SIZE 8
#elif defined(__AVX512F__)
#define GGML_F32X GGML_F32x16
#define GGML_F32X_SET1 GGML_F32x16_SET1
#define GGML_F32X_LOAD GGML_F32x16_LOAD
#define GGML_F32X_STORE GGML_F32x16_STORE
#define GGML_F32X_MUL GGML_F32x16_MUL
#define GGML_F32X_FMA GGML_F32x16_FMA
#define GLA_VECTOR_SIZE 16
#elif defined(__ARM_NEON) && defined(__aarch64__)
#define GGML_F32X GGML_F32x4
#define GGML_F32X_SET1 GGML_F32x4_SET1
#define GGML_F32X_LOAD GGML_F32x4_LOAD
#define GGML_F32X_STORE GGML_F32x4_STORE
#define GGML_F32X_MUL GGML_F32x4_MUL
#define GGML_F32X_FMA GGML_F32x4_FMA
#define GLA_VECTOR_SIZE 4
#endif
#ifdef GLA_VECTOR_SIZE
const int64_t vec_count = head_size / GLA_VECTOR_SIZE;
for (int64_t t = 0; t < T; t++) {
size_t t_offset = t * t_stride;
size_t state_offset = head_size * C * (t / (T / n_seqs));
float * state_cur = state + state_offset;
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[4]->data + state_offset;
for (int64_t h = h_start; h < h_end; h++) {
size_t h_offset = h * h_stride;
size_t t_h_offset = t_offset + h_offset;
size_t h_2d_offset = h * h_stride_2d;
for (int64_t i = 0; i < head_size; i++) {
size_t t_h_i_offset = t_h_offset + i;
size_t h_2d_i_offset = h_2d_offset + i * h_stride;
float k_val = k[t_h_i_offset];
float q_val = q[t_h_i_offset] * scale;
float g_val = g[t_h_i_offset];
// Broadcast scalar values to vectors
GGML_F32X k_vec = GGML_F32X_SET1(k_val);
GGML_F32X q_vec = GGML_F32X_SET1(q_val);
GGML_F32X g_vec = GGML_F32X_SET1(g_val);
for (int64_t j = 0; j < vec_count; j++) {
size_t base_j = j * GLA_VECTOR_SIZE;
size_t t_h_j_offset = t_h_offset + base_j;
size_t h_2d_i_j_offset = h_2d_i_offset + base_j;
// Load GLA_VECTOR_SIZE elements at once
GGML_F32X v_vec = GGML_F32X_LOAD(&v[t_h_j_offset]);
GGML_F32X prev_state_vec = GGML_F32X_LOAD(&state_prev[h_2d_i_j_offset]);
GGML_F32X dst_vec = GGML_F32X_LOAD(&dst_data[t_h_j_offset]);
// Compute kv = v * k
GGML_F32X kv_vec = GGML_F32X_MUL(v_vec, k_vec);
// Compute temp = prev_state * g + kv
GGML_F32X temp_vec = GGML_F32X_FMA(kv_vec, prev_state_vec, g_vec);
// Update dst: dst += temp * q
dst_vec = GGML_F32X_FMA(dst_vec, temp_vec, q_vec);
GGML_F32X_STORE(&dst_data[t_h_j_offset], dst_vec);
// Update state
GGML_F32X_STORE(&state_cur[h_2d_i_j_offset], temp_vec);
}
// Handle remaining elements (not used when head_size is a multiple of GLA_VECTOR_SIZE).
for (int64_t j = vec_count * GLA_VECTOR_SIZE; j < head_size; j++) {
size_t t_h_j_offset = t_h_offset + j;
size_t h_2d_i_j_offset = h_2d_i_offset + j;
float v_val = v[t_h_j_offset];
float kv_val = v_val * k_val;
float prev_state_val = state_prev[h_2d_i_j_offset];
float temp_val = kv_val + prev_state_val * g_val;
dst_data[t_h_j_offset] += temp_val * q_val;
state_cur[h_2d_i_j_offset] = temp_val;
}
}
}
}
#else
for (int64_t t = 0; t < T; t++) {
size_t t_offset = t * t_stride;
size_t state_offset = head_size * C * (t / (T / n_seqs));
float * state_cur = state + state_offset;
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[4]->data + state_offset;
for (int64_t h = h_start; h < h_end; h++) {
size_t h_offset = h * h_stride;
size_t t_h_offset = t_offset + h_offset;
size_t h_2d_offset = h * h_stride_2d;
for (int64_t i = 0; i < head_size; i++) {
size_t t_h_i_offset = t_h_offset + i;
size_t h_2d_i_offset = h_2d_offset + i * h_stride;
float k_val = k[t_h_i_offset];
float q_val = q[t_h_i_offset] * scale;
float g_val = g[t_h_i_offset];
for (int64_t j = 0; j < head_size; j++) {
size_t t_h_j_offset = t_h_offset + j;
size_t h_2d_i_j_offset = h_2d_i_offset + j;
float v_val = v[t_h_j_offset];
float kv_val = v_val * k_val;
float prev_state_val = state_prev[h_2d_i_j_offset];
float temp_val = prev_state_val * g_val + kv_val;
dst_data[t_h_j_offset] += temp_val * q_val;
state_cur[h_2d_i_j_offset] = temp_val;
}
}
}
}
#endif
}
static void ggml_compute_forward_gla(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_gla_f32(params, dst);
} break;
default:
{
GGML_ABORT("fatal error");
}
}
}
// ggml_compute_forward_map_unary // ggml_compute_forward_map_unary
static void ggml_compute_forward_map_unary_f32( static void ggml_compute_forward_map_unary_f32(
@ -12293,22 +12556,22 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * grad = dst->src[0]; // gradient of forward pass output
const struct ggml_tensor * src1 = dst->src[1]; const struct ggml_tensor * src0f = dst->src[1]; // src0 of forward pass
const struct ggml_tensor * opt0 = dst->src[2]; const struct ggml_tensor * src1f = dst->src[2]; // src1 of forward pass
GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(src0f));
GGML_ASSERT(ggml_is_contiguous(src1)); GGML_ASSERT(ggml_is_contiguous(src1f));
GGML_ASSERT(ggml_is_contiguous(opt0)); GGML_ASSERT(ggml_is_contiguous(grad));
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src0f, src1f) && ggml_are_same_shape(src0f, dst));
const int64_t ith = params->ith; const int64_t ith = params->ith;
const int64_t nth = params->nth; const int64_t nth = params->nth;
// TODO: handle transposed/permuted matrices // TODO: handle transposed/permuted matrices
const int64_t nc = src0->ne[0]; const int64_t nc = src0f->ne[0];
const int64_t nr = ggml_nrows(src0); const int64_t nr = ggml_nrows(src0f);
// rows per thread // rows per thread
const int64_t dr = (nr + nth - 1)/nth; const int64_t dr = (nr + nth - 1)/nth;
@ -12317,12 +12580,12 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
const int64_t ir0 = dr*ith; const int64_t ir0 = dr*ith;
const int64_t ir1 = MIN(ir0 + dr, nr); const int64_t ir1 = MIN(ir0 + dr, nr);
const float d_by_nr = ((const float *) opt0->data)[0] / (float) nr; const float d_by_nr = ((const float *) grad->data)[0] / (float) nr;
for (int64_t i1 = ir0; i1 < ir1; i1++) { for (int64_t i1 = ir0; i1 < ir1; i1++) {
float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); const float * s0 = (const float *)((const char *) src0f->data + i1*src0f->nb[1]);
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); const float * s1 = (const float *)((const char *) src1f->data + i1*src1f->nb[1]);
#ifndef NDEBUG #ifndef NDEBUG
for (int64_t i = 0; i < nc; ++i) { for (int64_t i = 0; i < nc; ++i) {
@ -12335,11 +12598,11 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
// soft_max // soft_max
float max = -INFINITY; float max = -INFINITY;
ggml_vec_max_f32(nc, &max, s0); ggml_vec_max_f32(nc, &max, s0);
ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max); const ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
assert(sum > 0.0); assert(sum > 0.0);
ggml_vec_scale_f32(nc, ds0, 1.0/sum); ggml_vec_scale_f32(nc, ds0, 1.0/sum);
// grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr // grad(src0f) = (softmax(src0f) - src1f) * grad(cross_entropy_loss(src0f, src1f)) / nr
ggml_vec_sub_f32(nc, ds0, ds0, s1); ggml_vec_sub_f32(nc, ds0, ds0, s1);
ggml_vec_scale_f32(nc, ds0, d_by_nr); ggml_vec_scale_f32(nc, ds0, d_by_nr);
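The renamed sources make the intent explicit: `grad` is the scalar gradient of the loss output, while `src0f` and `src1f` are the logits and target distribution from the forward pass. Per row, the code computes the familiar softmax-minus-target gradient, scaled by the incoming gradient and averaged over rows:

$$\frac{\partial L}{\partial s_0} = \bigl(\operatorname{softmax}(s_0) - s_1\bigr)\,\frac{\mathrm{grad}}{n_\mathrm{rows}}.$$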
@ -12636,7 +12899,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
} break; } break;
case GGML_OP_SOFT_MAX_BACK: case GGML_OP_SOFT_MAX_BACK:
{ {
ggml_compute_forward_soft_max_back(params, tensor); ggml_compute_forward_soft_max_ext_back(params, tensor);
} break; } break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
{ {
@ -12749,6 +13012,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{ {
ggml_compute_forward_rwkv_wkv6(params, tensor); ggml_compute_forward_rwkv_wkv6(params, tensor);
} break; } break;
case GGML_OP_GATED_LINEAR_ATTN:
{
ggml_compute_forward_gla(params, tensor);
} break;
case GGML_OP_MAP_UNARY: case GGML_OP_MAP_UNARY:
{ {
ggml_unary_op_f32_t fun; ggml_unary_op_f32_t fun;
@ -13047,6 +13314,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_WIN_UNPART: case GGML_OP_WIN_UNPART:
case GGML_OP_GET_REL_POS: case GGML_OP_GET_REL_POS:
case GGML_OP_RWKV_WKV6: case GGML_OP_RWKV_WKV6:
case GGML_OP_GATED_LINEAR_ATTN:
case GGML_OP_MAP_UNARY: case GGML_OP_MAP_UNARY:
case GGML_OP_MAP_BINARY: case GGML_OP_MAP_BINARY:
case GGML_OP_MAP_CUSTOM1_F32: case GGML_OP_MAP_CUSTOM1_F32:
@ -13472,6 +13740,7 @@ struct ggml_cplan ggml_graph_plan(
} break; } break;
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE: case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
{ {
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
} break; } break;
View file
@ -403,8 +403,16 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type; return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
case GGML_OP_ROPE_BACK: case GGML_OP_SOFT_MAX_BACK: {
return op->src[2] == NULL && (op->op_params[2] & 4) == 0; if (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32) {
return false;
}
float max_bias = 0.0f;
memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
return max_bias == 0.0f;
}
case GGML_OP_IM2COL_BACK: case GGML_OP_IM2COL_BACK:
return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32; return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
case GGML_OP_OUT_PROD: case GGML_OP_OUT_PROD:
View file
@ -5,95 +5,89 @@
#include <cmath> #include <cmath>
#include <cstdint> #include <cstdint>
static __global__ void cross_entropy_loss_f32(const float * logits, const float * labels, float * dst, const int nclasses, const int k) { template <bool use_shared>
const int warp_id = threadIdx.x / WARP_SIZE; static __global__ void cross_entropy_loss_f32(
const int lane_id = threadIdx.x % WARP_SIZE; const float * __restrict__ logits, const float * __restrict__ labels, float * __restrict__ dst, const int nclasses, const int k) {
const int i0 = blockDim.x*blockIdx.x + warp_id*WARP_SIZE; extern __shared__ float tmp[];
const int ne_tmp = WARP_SIZE*nclasses; logits += int64_t(blockIdx.x)*nclasses;
labels += int64_t(blockIdx.x)*nclasses;
extern __shared__ float tmp_all[];
float * tmp_logits = tmp_all + (2*warp_id + 0)*ne_tmp;
float * tmp_labels = tmp_all + (2*warp_id + 1)*ne_tmp;
// Each warp first loads ne_tmp logits/labels into shared memory:
for (int i = lane_id; i < ne_tmp; i += WARP_SIZE) {
const int ig = i0*nclasses + i; // ig == i global
tmp_logits[i] = ig < k*nclasses ? logits[ig] : 0.0f;
tmp_labels[i] = ig < k*nclasses ? labels[ig] : 0.0f;
}
// Each thread in the warp then calculates the cross entropy loss for a single row.
// TODO: pad in order to avoid shared memory bank conflicts.
// Find maximum for softmax: // Find maximum for softmax:
float max = -INFINITY; float max_logit = -INFINITY;
for (int i = 0; i < nclasses; ++i) { for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
max = fmaxf(max, tmp_logits[lane_id*nclasses + i]); const float val = logits[i];
max_logit = fmaxf(max_logit, val);
if (use_shared) {
tmp[i] = val;
} }
}
max_logit = warp_reduce_max(max_logit);
// Calculate log(softmax(logits)) which is just logits - max: // Calculate log(softmax(logits)) which is just logits - max:
float sum = 0.0f; float sum = 0.0f;
for (int i = 0; i < nclasses; ++i) { for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
float val = tmp_logits[lane_id*nclasses + i] - max; const float logit_i = use_shared ? tmp[i] : logits[i];
sum += expf(val); sum += expf(logit_i - max_logit);
tmp_logits[lane_id*nclasses + i] = val;
} }
sum = warp_reduce_sum(sum);
sum = logf(sum); sum = logf(sum);
// log(exp(logits - max) / sum) = (logits - max) - log(sum) // log(exp(logits - max) / sum) = (logits - max) - log(sum)
float loss = 0.0f; float loss = 0.0f;
for (int i = 0; i < nclasses; ++i) { for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
loss += (tmp_logits[lane_id*nclasses + i] - sum) * tmp_labels[lane_id*nclasses + i]; const float logit_i = use_shared ? tmp[i] : logits[i];
loss += (logit_i - max_logit - sum) * labels[i];
} }
loss = -warp_reduce_sum(loss) / (float)k; loss = -warp_reduce_sum(loss) / (float)k;
__syncthreads(); if (threadIdx.x != 0) {
if (lane_id == 0) {
tmp_all[warp_id] = loss;
}
__syncthreads();
if (warp_id != 0) {
return;
}
loss = lane_id < CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE/WARP_SIZE ? tmp_all[lane_id] : 0.0f;
loss = warp_reduce_sum(loss);
if (lane_id != 0) {
return; return;
} }
dst[blockIdx.x] = loss; dst[blockIdx.x] = loss;
} }
static __global__ void cross_entropy_loss_back_f32(const float * logits, const float * labels, const float * loss, float * dst, const int nclasses) { template <bool use_shared>
static __global__ void cross_entropy_loss_back_f32(
const float * __restrict__ grad, const float * __restrict__ logits, const float * __restrict__ labels,
float * __restrict__ dst, const int nclasses) {
extern __shared__ float tmp[]; extern __shared__ float tmp[];
logits += int64_t(blockIdx.x)*nclasses;
labels += int64_t(blockIdx.x)*nclasses;
dst += int64_t(blockIdx.x)*nclasses;
float maxval = -INFINITY; float maxval = -INFINITY;
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) { for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
const float val = logits[blockIdx.x*nclasses + i]; const float val = logits[i];
maxval = fmaxf(maxval, val); maxval = fmaxf(maxval, val);
if (use_shared) {
tmp[i] = val; tmp[i] = val;
} }
}
maxval = warp_reduce_max(maxval); maxval = warp_reduce_max(maxval);
float sum = 0.0f; float sum = 0.0f;
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) { for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
const float val = expf(tmp[i] - maxval); const float val = expf((use_shared ? tmp[i] : logits[i]) - maxval);
sum += val; sum += val;
if (use_shared) {
tmp[i] = val; tmp[i] = val;
} else {
dst[i] = val;
}
} }
sum = warp_reduce_sum(sum); sum = warp_reduce_sum(sum);
const float sm_scale = 1.0f/sum; const float sm_scale = 1.0f/sum;
const float d_by_nrows = *loss/gridDim.x; const float d_by_nrows = *grad/gridDim.x;
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) { for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
dst[blockIdx.x*nclasses + i] = (tmp[i]*sm_scale - labels[blockIdx.x*nclasses + i])*d_by_nrows; const float val = use_shared ? tmp[i] : dst[i];
dst[i] = (val*sm_scale - labels[i])*d_by_nrows;
} }
} }
@ -119,48 +113,77 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
ggml_cuda_pool & pool = ctx.pool(); ggml_cuda_pool & pool = ctx.pool();
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
const dim3 blocks_dim(CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1); const dim3 blocks_dim(WARP_SIZE, 1, 1);
const dim3 blocks_num((nrows + CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE - 1) / CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1); const dim3 blocks_num(nrows, 1, 1);
const int shmem = 2*CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE*ne00*sizeof(float); const size_t nbytes_shared = ne00*sizeof(float);
const int id = ggml_cuda_get_device();
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x); ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows); if (nbytes_shared <= smpbo) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shared_memory_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
shared_memory_limit_raised[id] = true;
}
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
} else {
cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
}
CUDA_CHECK(cudaGetLastError());
// Combine results from individual blocks: // Combine results from individual blocks:
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream); sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
} }
void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * grad = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src0f = dst->src[1];
const ggml_tensor * opt0 = dst->src[2]; const ggml_tensor * src1f = dst->src[2];
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0f->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1f->type == GGML_TYPE_F32);
GGML_ASSERT(opt0->type == GGML_TYPE_F32); GGML_ASSERT( grad->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_scalar(grad));
GGML_ASSERT(ggml_is_contiguous(src1)); GGML_ASSERT(ggml_is_contiguous(src0f));
GGML_ASSERT(ggml_is_contiguous(opt0)); GGML_ASSERT(ggml_is_contiguous(src1f));
GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_are_same_shape(src0, src1)); GGML_ASSERT(ggml_are_same_shape(src0f, src1f));
GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src0f, dst));
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0f->ne[0];
const int64_t nrows = ggml_nrows(src0); const int64_t nrows = ggml_nrows(src0f);
const float * src0_d = (const float *) src0->data; const float * grad_d = (const float *) grad->data;
const float * src1_d = (const float *) src1->data; const float * src0f_d = (const float *) src0f->data;
const float * opt0_d = (const float *) opt0->data; const float * src1f_d = (const float *) src1f->data;
float * dst_d = (float *) dst->data; float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
const dim3 blocks_dim(WARP_SIZE, 1, 1); const dim3 blocks_dim(WARP_SIZE, 1, 1);
const dim3 blocks_num(nrows, 1, 1); const dim3 blocks_num(nrows, 1, 1);
const int shmem = ne00*sizeof(float); const size_t nbytes_shared = ne00*sizeof(float);
cross_entropy_loss_back_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, opt0_d, dst_d, ne00); const int id = ggml_cuda_get_device();
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
if (nbytes_shared <= smpbo) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shared_memory_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
shared_memory_limit_raised[id] = true;
}
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
} else {
cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
}
} }
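Both host launchers now choose between a shared-memory and a global-memory kernel variant at run time: if one row of logits (`ne00` floats) fits in the device's per-block shared-memory budget (`smpbo`), the `<true>` instantiation caches the row in shared memory, otherwise the `<false>` variant re-reads from global memory (the backward kernel reuses `dst` as scratch in that case). On NVIDIA GPUs, requesting more than the default 48 KB of dynamic shared memory requires an explicit per-kernel opt-in, which is what the one-time `cudaFuncSetAttribute(..., cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo)` call provides; the HIP/AMD build skips it. A condensed sketch of the pattern with a placeholder kernel name:

```cpp
// Illustrative launch pattern: opt the kernel into large dynamic shared memory once
// per device, then dispatch on whether the required bytes actually fit.
static bool limit_raised[GGML_CUDA_MAX_DEVICES] = {false};

const size_t nbytes_shared = ne00*sizeof(float);
if (nbytes_shared <= smpbo) {
    if (!limit_raised[id]) {
        CUDA_CHECK(cudaFuncSetAttribute(my_kernel<true>,
            cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
        limit_raised[id] = true;
    }
    my_kernel<true ><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(/* args */);
} else {
    my_kernel<false><<<blocks_num, blocks_dim, 0, stream>>>(/* args */);
}
```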
View file
@ -3,12 +3,12 @@
template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t> template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void k_get_rows( static __global__ void k_get_rows(
const void * src0, const int32_t * src1, dst_t * dst, const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ /*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/
/*size_t s0,*/ size_t s1, size_t s2, size_t s3, /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
size_t s10, size_t s11, size_t s12/*, size_t s13*/) { const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2; const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
const int i10 = blockDim.y*blockIdx.y + threadIdx.y; const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
@ -22,7 +22,7 @@ static __global__ void k_get_rows(
const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; const void * src0_row = (const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03;
const int ib = i00/qk; // block index const int ib = i00/qk; // block index
const int iqs = (i00%qk)/qr; // quant index const int iqs = (i00%qk)/qr; // quant index
@ -39,12 +39,12 @@ static __global__ void k_get_rows(
template<typename src0_t, typename dst_t> template<typename src0_t, typename dst_t>
static __global__ void k_get_rows_float( static __global__ void k_get_rows_float(
const src0_t * src0, const int32_t * src1, dst_t * dst, const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ /*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/
/*size_t s0,*/ size_t s1, size_t s2, size_t s3, /*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
size_t s10, size_t s11, size_t s12/*, size_t s13*/) { const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
const int i00 = blockIdx.x*blockDim.x + threadIdx.x; const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
const int i10 = blockDim.y*blockIdx.y + threadIdx.y; const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
@ -58,13 +58,37 @@ static __global__ void k_get_rows_float(
const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
dst_row[i00] = src0_row[i00]; dst_row[i00] = src0_row[i00];
} }
template<typename grad_t, typename dst_t>
static __global__ void k_get_rows_back_float(
const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst, const int64_t ncols, const int64_t nrows_grad) {
const int col = blockIdx.x*blockDim.x + threadIdx.x;
if (col >= ncols) {
return;
}
const int dst_row = blockIdx.y*blockDim.y + threadIdx.y;
float sum = 0.0f;
for (int64_t i = 0; i < nrows_grad; ++i) {
if (rows[i] != dst_row) {
continue;
}
sum += grad[i*ncols + col];
}
dst[dst_row*ncols + col] = sum;
}
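`k_get_rows_back_float` is the gradient of `get_rows`, i.e. a scatter-add: every forward row that selected index `r` contributes its gradient row to output row `r`. Instead of atomics, each thread owns one `(dst_row, col)` cell and scans all `nrows_grad` index entries, so the cost is O(`nrows_grad`) per output element but the result is deterministic. A CPU reference of the same reduction (illustrative):

```cpp
// dst[r] accumulates every grad row i whose selected index rows[i] == r.
// E.g. rows = {2, 0, 2}: dst row 2 receives grad rows 0 and 2, dst row 1 stays zero.
static void get_rows_back_ref(const float * grad, const int32_t * rows, float * dst,
                              int64_t ncols, int64_t nrows_grad, int64_t nrows_dst) {
    for (int64_t r = 0; r < nrows_dst; ++r) {
        for (int64_t c = 0; c < ncols; ++c) {
            float sum = 0.0f;
            for (int64_t i = 0; i < nrows_grad; ++i) {
                if (rows[i] == r) {
                    sum += grad[i*ncols + c];
                }
            }
            dst[r*ncols + c] = sum;
        }
    }
}
```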
template<int qk, int qr, dequantize_kernel_t dq> template<int qk, int qr, dequantize_kernel_t dq>
static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, static void get_rows_cuda(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS
@ -98,11 +122,14 @@ static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, gg
} }
template<typename src0_t> template<typename src0_t>
static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, static void get_rows_cuda_float(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
GGML_TENSOR_BINARY_OP_LOCALS GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(ne13 == 1);
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
const dim3 block_nums(block_num_x, ne10, ne11*ne12); const dim3 block_nums(block_num_x, ne10, ne11*ne12);
@ -132,11 +159,12 @@ static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * sr
void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
const float * src0_d = (const float *)src0->data;
const float * src1_d = (const float *)src1->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
const void * src0_d = (const void *) src0->data;
const int32_t * src1_d = (const int32_t *) src1->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src1->type == GGML_TYPE_I32); GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32);
@ -145,29 +173,27 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
const int32_t * src1_i32 = (const int32_t *) src1_d;
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F16: case GGML_TYPE_F16:
get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream); get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream);
break; break;
case GGML_TYPE_F32: case GGML_TYPE_F32:
get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream); get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream);
break; break;
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream); get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break; break;
case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_1:
get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream); get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break; break;
case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_0:
get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream); get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break; break;
case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_1:
get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream); get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break; break;
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream); get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
break; break;
default: default:
// TODO: k-quants // TODO: k-quants
@ -175,3 +201,34 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
break; break;
} }
} }
void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output
const ggml_tensor * src1 = dst->src[1]; // src1 in forward pass
GGML_TENSOR_BINARY_OP_LOCALS
const float * src0_d = (const float *) src0->data;
const int32_t * src1_d = (const int32_t *) src1->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ne02*ne03 == 1);
GGML_ASSERT(ne12*ne13 == 1);
GGML_ASSERT(ne2*ne3 == 1);
const dim3 block_dims(CUDA_GET_ROWS_BACK_BLOCK_SIZE, 1, 1);
const int block_num_x = (ne00 + CUDA_GET_ROWS_BACK_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BACK_BLOCK_SIZE;
const dim3 block_nums(block_num_x, ne1, 1);
k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10);
}
View file
@ -1,5 +1,8 @@
#include "common.cuh" #include "common.cuh"
#define CUDA_GET_ROWS_BLOCK_SIZE 256 #define CUDA_GET_ROWS_BLOCK_SIZE 256
#define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256
void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
View file
@ -37,6 +37,7 @@
#include "ggml-cuda/unary.cuh" #include "ggml-cuda/unary.cuh"
#include "ggml-cuda/upscale.cuh" #include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/wkv6.cuh" #include "ggml-cuda/wkv6.cuh"
#include "ggml-cuda/gla.cuh"
#include <algorithm> #include <algorithm>
#include <array> #include <array>
@ -2002,6 +2003,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS:
ggml_cuda_op_get_rows(ctx, dst); ggml_cuda_op_get_rows(ctx, dst);
break; break;
case GGML_OP_GET_ROWS_BACK:
ggml_cuda_op_get_rows_back(ctx, dst);
break;
case GGML_OP_DUP: case GGML_OP_DUP:
ggml_cuda_dup(ctx, dst); ggml_cuda_dup(ctx, dst);
break; break;
@ -2090,9 +2094,15 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
ggml_cuda_op_leaky_relu(ctx, dst); ggml_cuda_op_leaky_relu(ctx, dst);
break; break;
case GGML_OP_SILU_BACK:
ggml_cuda_op_silu_back(ctx, dst);
break;
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
ggml_cuda_op_rms_norm(ctx, dst); ggml_cuda_op_rms_norm(ctx, dst);
break; break;
case GGML_OP_RMS_NORM_BACK:
ggml_cuda_op_rms_norm_back(ctx, dst);
break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]); GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
@ -2137,9 +2147,15 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
ggml_cuda_op_soft_max(ctx, dst); ggml_cuda_op_soft_max(ctx, dst);
break; break;
case GGML_OP_SOFT_MAX_BACK:
ggml_cuda_op_soft_max_back(ctx, dst);
break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
ggml_cuda_op_rope(ctx, dst); ggml_cuda_op_rope(ctx, dst);
break; break;
case GGML_OP_ROPE_BACK:
ggml_cuda_op_rope_back(ctx, dst);
break;
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
ggml_cuda_op_im2col(ctx, dst); ggml_cuda_op_im2col(ctx, dst);
break; break;
@ -2167,6 +2183,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_RWKV_WKV6: case GGML_OP_RWKV_WKV6:
ggml_cuda_op_rwkv_wkv6(ctx, dst); ggml_cuda_op_rwkv_wkv6(ctx, dst);
break; break;
case GGML_OP_GATED_LINEAR_ATTN:
ggml_cuda_op_gated_linear_attn(ctx, dst);
break;
case GGML_OP_CROSS_ENTROPY_LOSS_BACK: case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
ggml_cuda_cross_entropy_loss_back(ctx, dst); ggml_cuda_cross_entropy_loss_back(ctx, dst);
break; break;
@ -2285,119 +2304,8 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
} }
#ifdef USE_CUDA_GRAPH #ifdef USE_CUDA_GRAPH
static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) { static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
graph_node_properties->node_address = node->data; std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {
graph_node_properties->node_op = node->op;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
graph_node_properties->ne[i] = node->ne[i];
graph_node_properties->nb[i] = node->nb[i];
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
}
memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
}
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
if (node->data != graph_node_properties->node_address &&
node->op != GGML_OP_CPY &&
node->op != GGML_OP_VIEW) {
return false;
}
if (node->op != graph_node_properties->node_op) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (node->ne[i] != graph_node_properties->ne[i]) {
return false;
}
if (node->nb[i] != graph_node_properties->nb[i]) {
return false;
}
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (node->src[i] &&
node->src[i]->data != graph_node_properties->src_address[i] &&
node->op != GGML_OP_CPY &&
node->op != GGML_OP_VIEW
) {
return false;
}
}
if (node->op == GGML_OP_SCALE &&
memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
return false;
}
return true;
}
#endif
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_device(cuda_ctx->device);
#ifdef USE_CUDA_GRAPH
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
// Objects required for CUDA Graph
if (cuda_ctx->cuda_graph == nullptr) {
cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
}
bool use_cuda_graph = true;
bool cuda_graph_update_required = false;
// vector of pointers to CUDA cpy kernels, which are required to identify
// kernel parameters which need updated in the graph for each token
std::vector<void *> ggml_cuda_cpy_fn_ptrs;
if (cuda_ctx->cuda_graph->graph == nullptr) {
if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
#endif
}
}
// Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
// or previous graph capture failure.
// Also disable for multi-gpu for now. TO DO investigate
if (disable_cuda_graphs_due_to_env
|| cuda_ctx->cuda_graph->disable_due_to_gpu_arch
|| cuda_ctx->cuda_graph->disable_due_to_too_many_updates
|| cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
use_cuda_graph = false;
}
if (use_cuda_graph) {
if (cuda_ctx->cuda_graph->instance == nullptr) {
cuda_graph_update_required = true;
}
// Check if the graph size has changed
if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
cuda_graph_update_required = true;
cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
}
// Loop over nodes in GGML graph to determine if CUDA graph update is required
// and store properties to allow this comparison for the next token
for (int i = 0; i < cgraph->n_nodes; i++) {
bool has_matching_properties = true;
if (!cuda_graph_update_required) {
has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
}
if (!has_matching_properties) {
cuda_graph_update_required = true;
}
set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
}
// Loop over nodes in GGML graph to obtain info needed for CUDA graph // Loop over nodes in GGML graph to obtain info needed for CUDA graph
cuda_ctx->cuda_graph->updated_kernel_arg.clear(); cuda_ctx->cuda_graph->updated_kernel_arg.clear();
@ -2453,31 +2361,158 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
} }
} }
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. return use_cuda_graph;
if (use_cuda_graph && cuda_graph_update_required) { }
cuda_ctx->cuda_graph->number_consecutive_updates++;
static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
graph_node_properties->node_address = node->data;
graph_node_properties->node_op = node->op;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
graph_node_properties->ne[i] = node->ne[i];
graph_node_properties->nb[i] = node->nb[i];
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
}
memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
}
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
if (node->data != graph_node_properties->node_address &&
node->op != GGML_OP_CPY &&
node->op != GGML_OP_VIEW) {
return false;
}
if (node->op != graph_node_properties->node_op) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (node->ne[i] != graph_node_properties->ne[i]) {
return false;
}
if (node->nb[i] != graph_node_properties->nb[i]) {
return false;
}
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (node->src[i] &&
node->src[i]->data != graph_node_properties->src_address[i] &&
node->op != GGML_OP_CPY &&
node->op != GGML_OP_VIEW
) {
return false;
}
}
if (node->op == GGML_OP_SCALE &&
memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
return false;
}
return true;
}
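Taken together, `set_ggml_graph_node_properties` and `ggml_graph_node_has_matching_properties` implement a per-node snapshot cache: properties are stored after every evaluation and compared against the next token's graph, with `GGML_OP_CPY`/`GGML_OP_VIEW` exempt from the address checks because their data pointers are expected to change every token and are patched separately (see `maintain_cuda_graph` below). As a compact illustration, a hypothetical helper (name invented; essentially what `is_cuda_graph_update_required` further down does) could look like this:

```cpp
// Sketch only: decide whether a previously captured CUDA graph is stale by
// comparing this token's ggml graph against the cached per-node snapshots.
static bool graph_needs_recapture(ggml_cgraph * cgraph,
                                  std::vector<ggml_graph_node_properties> & cached) {
    bool stale = cached.size() != (size_t) cgraph->n_nodes;
    cached.resize(cgraph->n_nodes);
    for (int i = 0; i < cgraph->n_nodes; i++) {
        // compare against the previous snapshot before overwriting it
        if (!stale && !ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cached[i])) {
            stale = true;
        }
        set_ggml_graph_node_properties(cgraph->nodes[i], &cached[i]);
    }
    return stale;
}
```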
static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) {
if (cuda_graph_update_required) {
// Extract nodes from graph
// First call with null argument gets number of nodes in graph
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
// Subsequent call with non-null argument gets nodes
cuda_ctx->cuda_graph->nodes.clear();
cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
cuda_ctx->cuda_graph->params.clear();
cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
if (cuda_ctx->cuda_graph->num_nodes > 0) {
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
// Loop over nodes, and extract kernel parameters from each node
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
cudaGraphNodeType node_type;
CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
if (node_type == cudaGraphNodeTypeKernel) {
cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
if (stat == cudaErrorInvalidDeviceFunction) {
// Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
// We don't need to update blas nodes, so clear error and move on.
cudaGetLastError();
} else {
GGML_ASSERT(stat == cudaSuccess);
}
}
}
}
} else {
// One of the arguments to the copy kernel is updated for each token, hence we need to
// replace that argument with the updated value in the CUDA graph
// on update steps, the live parameters will already be captured
int k = 0;
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
}
}
}
}
static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
bool cuda_graph_update_required = false;
if (cuda_ctx->cuda_graph->instance == nullptr) {
cuda_graph_update_required = true;
}
// Check if the graph size has changed
if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
cuda_graph_update_required = true;
cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
}
// Loop over nodes in GGML graph to determine if CUDA graph update is required
// and store properties to allow this comparison for the next token
for (int i = 0; i < cgraph->n_nodes; i++) {
bool has_matching_properties = true;
if (!cuda_graph_update_required) {
has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
}
if (!has_matching_properties) {
cuda_graph_update_required = true;
}
set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
}
return cuda_graph_update_required;
}
static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
cudaGraphExecUpdateResultInfo result_info;
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
if (stat == cudaErrorGraphExecUpdateFailure) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
#endif
// The pre-existing graph exec cannot be updated due to violated constraints
// so instead clear error and re-instantiate
cudaGetLastError();
CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
cuda_ctx->cuda_graph->instance = nullptr;
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
} else {
GGML_ASSERT(stat == cudaSuccess);
}
}
#endif
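For context, the helpers above follow the standard CUDA runtime graph lifecycle: capture a stream, instantiate an executable graph once, try to patch it in place on later tokens, and fall back to re-instantiation when the topology changed too much. A minimal sketch of that cycle (not llama.cpp code; `enqueue_work` is a placeholder for whatever kernels get recorded, and error handling is reduced to the update path):

```cpp
#include <cuda_runtime.h>

// Minimal sketch of the capture/instantiate/update/launch cycle used above.
static void replay_as_graph(cudaStream_t stream, void (*enqueue_work)(cudaStream_t), cudaGraphExec_t & instance) {
    cudaGraph_t graph = nullptr;

    cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed);
    enqueue_work(stream);                    // kernels are recorded, not executed
    cudaStreamEndCapture(stream, &graph);

    if (instance == nullptr) {
        cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
    } else {
        cudaGraphExecUpdateResultInfo info;  // CUDA 12 signature, as in the code above
        if (cudaGraphExecUpdate(instance, graph, &info) != cudaSuccess) {
            cudaGetLastError();              // constraints violated: re-instantiate
            cudaGraphExecDestroy(instance);
            cudaGraphInstantiate(&instance, graph, NULL, NULL, 0);
        }
    }
    cudaGraphDestroy(graph);                 // the executable instance remains valid
    cudaGraphLaunch(instance, stream);
}
```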
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
[[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph,
bool & cuda_graph_update_required) {
while (!graph_evaluated_or_captured) {
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@ -2515,19 +2550,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
cuda_ctx->cuda_graph->graph = nullptr;
}
CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
graph_evaluated_or_captured = true; // CUDA graph has been captured
} else {
graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
@ -2540,72 +2564,91 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
}
// Perform update to graph (if required for this token), and change copy parameter (required for every token)
maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
// Update graph executable
update_cuda_graph_executable(cuda_ctx);
// Launch graph
CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
#else
graph_evaluated_or_captured = true;
#endif // USE_CUDA_GRAPH
}
}
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_device(cuda_ctx->device);
// vector of pointers to CUDA cpy kernels, which are required to identify
// kernel parameters which need updated in the graph for each token
std::vector<void *> ggml_cuda_cpy_fn_ptrs;
#ifdef USE_CUDA_GRAPH
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
// Objects required for CUDA Graph
if (cuda_ctx->cuda_graph == nullptr) {
cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
}
bool use_cuda_graph = true;
bool cuda_graph_update_required = false;
if (cuda_ctx->cuda_graph->graph == nullptr) {
if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
#endif
}
}
// Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
// or previous graph capture failure.
// Also disable for multi-gpu for now. TO DO investigate
if (disable_cuda_graphs_due_to_env
|| cuda_ctx->cuda_graph->disable_due_to_gpu_arch
|| cuda_ctx->cuda_graph->disable_due_to_too_many_updates
|| cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
use_cuda_graph = false;
}
if (use_cuda_graph) {
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph,
ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (use_cuda_graph && cuda_graph_update_required) {
cuda_ctx->cuda_graph->number_consecutive_updates++;
} else {
cuda_ctx->cuda_graph->number_consecutive_updates = 0;
}
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
#endif
}
}
if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
}
#else
bool use_cuda_graph = false;
bool cuda_graph_update_required = false;
#endif // USE_CUDA_GRAPH
bool graph_evaluated_or_captured = false;
evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
return GGML_STATUS_SUCCESS;
}
@ -2881,7 +2924,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
} break;
case GGML_OP_OUT_PROD:
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
case GGML_OP_GET_ROWS:
{
switch (op->src[0]->type) {
@ -2897,6 +2940,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return false;
}
} break;
case GGML_OP_GET_ROWS_BACK:
{
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
} break;
case GGML_OP_CPY:
{
ggml_type src0_type = op->src[0]->type;
@ -2970,8 +3017,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
return false;
} break;
case GGML_OP_SILU_BACK:
return ggml_is_contiguous(op->src[0]);
break;
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK:
return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
break;
case GGML_OP_NONE:
@ -2996,8 +3047,17 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_SOFT_MAX_BACK: {
float max_bias = 0.0f;
memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
return max_bias == 0.0f;
}
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK: {
const size_t ts = ggml_type_size(op->src[0]->type);
const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2];
return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts;
}
case GGML_OP_IM2COL:
case GGML_OP_POOL_2D:
case GGML_OP_SUM:
@ -3011,6 +3071,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
case GGML_OP_RWKV_WKV6:
case GGML_OP_GATED_LINEAR_ATTN:
return true;
case GGML_OP_FLASH_ATTN_EXT: {
#ifndef FLASH_ATTN_AVAILABLE
@ -3052,6 +3113,7 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
return op->ne[1];
case GGML_OP_MUL_MAT_ID:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
return op->ne[2];
default:
return ggml_nrows(op);

ggml/src/ggml-cuda/gla.cu (new file)

@ -0,0 +1,93 @@
#include "common.cuh"
#include "gla.cuh"
template<int HEAD_SIZE>
static __global__ void gated_linear_attn_f32(const int B, const int T, const int C, const int H, const float scale,
const float * k, const float * v, const float * r, const float * td, const float * s, float * dst) {
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int head_size = HEAD_SIZE;
const int batch_i = bid / H;
const int head_i = bid % H;
const int state_size = C * head_size;
const int n_seq_tokens = T / B;
float state[head_size];
__shared__ float _k[head_size], _r[head_size], _td[head_size];
#pragma unroll
for (int i = 0; i < head_size; i++) {
state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
}
for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) {
__syncthreads();
_k[tid] = k[t];
_r[tid] = r[t];
_td[tid] = td[t];
__syncthreads();
const float _v = v[t];
float y = 0;
for (int j = 0; j < head_size; j += 4) {
const float4 & k = (float4 &)(_k[j]);
const float4 & r = (float4 &)(_r[j]);
const float4 & td = (float4 &)(_td[j]);
float4 & s = (float4 &)(state[j]);
float4 kv;
kv.x = k.x * _v;
kv.y = k.y * _v;
kv.z = k.z * _v;
kv.w = k.w * _v;
s.x = s.x * td.x + kv.x;
s.y = s.y * td.y + kv.y;
s.z = s.z * td.z + kv.z;
s.w = s.w * td.w + kv.w;
y += r.x * s.x;
y += r.y * s.y;
y += r.z * s.z;
y += r.w * s.w;
}
dst[t] = y * scale;
}
#pragma unroll
for (int i = 0; i < head_size; i++) {
dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
}
}
void ggml_cuda_op_gated_linear_attn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const float * k_d = (const float *)dst->src[0]->data;
const float * v_d = (const float *)dst->src[1]->data;
const float * r_d = (const float *)dst->src[2]->data;
const float * td_d = (const float *)dst->src[3]->data;
const float * s_d = (const float *)dst->src[4]->data;
const int64_t B = dst->src[4]->ne[1];
const int64_t T = dst->src[0]->ne[2];
const int64_t C = dst->ne[0];
const int64_t H = dst->src[0]->ne[1];
float scale;
memcpy(&scale, (float*)dst->op_params, sizeof(float));
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(dst->src[4]->type == GGML_TYPE_F32);
GGML_ASSERT(C % H == 0);
GGML_ASSERT(C / H == 64 || C / H == 128);
if (C / H == 64) {
gated_linear_attn_f32<64><<<B * H, C / H, 0, stream>>>(B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
} else {
gated_linear_attn_f32<128><<<B * H, C / H, 0, stream>>>(B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
}
}
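The kernel keeps one `head_size x head_size` state matrix per (batch, head), with each thread owning one column of it, and advances the recurrence one token at a time. A scalar restatement of a single step (reference sketch only, not part of the diff) may help when reading the vectorized loop:

```cpp
// Reference sketch: one gated-linear-attention step for a single head of size D.
// S <- diag(td) * S + k * v^T, out = scale * S^T * r   (S stored row-major, S[j*D + i]).
static void gla_step_ref(int D, float scale,
                         const float * k, const float * v, const float * r, const float * td,
                         float * S, float * out) {
    for (int i = 0; i < D; ++i) {       // i: value/output dimension (thread index in the kernel)
        float y = 0.0f;
        for (int j = 0; j < D; ++j) {   // j: key/decay dimension
            S[j*D + i] = S[j*D + i]*td[j] + k[j]*v[i];
            y += r[j]*S[j*D + i];
        }
        out[i] = y*scale;
    }
}
```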

ggml/src/ggml-cuda/gla.cuh (new file)

@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_op_gated_linear_attn(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

ggml/src/ggml-cuda/norm.cu

@ -5,20 +5,24 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
x += int64_t(row)*ncols;
dst += int64_t(row)*ncols;
float2 mean_var = make_float2(0.0f, 0.0f);
for (int col = tid; col < ncols; col += block_size) {
const float xi = x[col];
mean_var.x += xi;
mean_var.y += xi * xi;
}
// sum up partial sums
mean_var = warp_reduce_sum(mean_var);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
__shared__ float2 s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = mean_var;
}
@ -32,7 +36,7 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
const float inv_std = rsqrtf(var + eps);
for (int col = tid; col < ncols; col += block_size) {
dst[col] = (x[col] - mean) * inv_std;
}
}
@ -40,14 +44,8 @@ template <int block_size>
static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
// blockIdx.x: num_groups idx
// threadIdx.x: block_size idx
const int start = blockIdx.x*group_size + threadIdx.x;
const int end = min(blockIdx.x*group_size + group_size, ne_elements);
float tmp = 0.0f; // partial sum for thread in warp
@ -56,10 +54,11 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
}
tmp = warp_reduce_sum(tmp);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
@ -68,11 +67,11 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
tmp = warp_reduce_sum(tmp);
}
const float mean = tmp / group_size;
tmp = 0.0f;
for (int j = start; j < end; j += block_size) {
const float xi = x[j] - mean;
dst[j] = xi;
tmp += xi * xi;
}
@ -80,8 +79,8 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
tmp = warp_reduce_sum(tmp);
if (block_size > WARP_SIZE) {
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
@ -90,8 +89,8 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
tmp = warp_reduce_sum(tmp);
}
const float variance = tmp / group_size;
const float scale = rsqrtf(variance + eps);
for (int j = start; j < end; j += block_size) {
dst[j] *= scale;
}
@ -102,19 +101,23 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
x += int64_t(row)*ncols;
dst += int64_t(row)*ncols;
float tmp = 0.0f; // partial sum for thread in warp
for (int col = tid; col < ncols; col += block_size) {
const float xi = x[col];
tmp += xi * xi;
}
// sum up partial sums
tmp = warp_reduce_sum(tmp);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
@ -127,12 +130,63 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
const float scale = rsqrtf(mean + eps);
for (int col = tid; col < ncols; col += block_size) {
dst[col] = scale * x[col];
}
}
template <int block_size>
static __global__ void rms_norm_back_f32(
const float * grad, const float * xf, float * dst, const int ncols, const float eps) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
grad += int64_t(row)*ncols;
xf += int64_t(row)*ncols;
dst += int64_t(row)*ncols;
float sum_xx = 0.0f; // sum for squares of x, equivalent to forward pass
float sum_xg = 0.0f; // sum for x * gradient, needed because RMS norm mixes inputs
for (int col = tid; col < ncols; col += block_size) {
const float xfi = xf[col];
sum_xx += xfi * xfi;
sum_xg += xfi * grad[col];
}
// sum up partial sums
sum_xx = warp_reduce_sum(sum_xx);
sum_xg = warp_reduce_sum(sum_xg);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
__shared__ float s_sum_xx[32];
__shared__ float s_sum_xg[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum_xx[warp_id] = sum_xx;
s_sum_xg[warp_id] = sum_xg;
}
__syncthreads();
sum_xx = s_sum_xx[lane_id];
sum_xx = warp_reduce_sum(sum_xx);
sum_xg = s_sum_xg[lane_id];
sum_xg = warp_reduce_sum(sum_xg);
}
const float mean_eps = sum_xx / ncols + eps;
const float sum_eps = sum_xx + ncols*eps;
const float scale_grad = rsqrtf(mean_eps);
const float scale_x = -scale_grad * sum_xg/sum_eps;
for (int col = tid; col < ncols; col += block_size) {
dst[col] = scale_grad*grad[col] + scale_x*xf[col];
}
}
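For reference, the two scale factors computed above follow from differentiating the forward pass $y_i = x_i/\sqrt{\mu + \varepsilon}$ with $\mu = \frac{1}{n}\sum_j x_j^2$:

$$\frac{\partial L}{\partial x_i} = \frac{g_i}{\sqrt{\mu+\varepsilon}} - \frac{\sum_j g_j x_j}{n\,(\mu+\varepsilon)^{3/2}}\,x_i$$

which is exactly `scale_grad*grad[col] + scale_x*xf[col]`, with `sum_xx` $= \sum_j x_j^2$, `sum_xg` $= \sum_j g_j x_j$, `mean_eps` $= \mu+\varepsilon$ and `sum_eps` $= n(\mu+\varepsilon)$.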
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
@ -142,7 +196,8 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
}
}
static void group_norm_f32_cuda(
const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) {
if (group_size < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
@ -153,7 +208,6 @@ static void group_norm_f32_cuda(const float * x, float * dst, const int num_grou
}
}
static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
@ -163,6 +217,16 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
}
}
static void rms_norm_back_f32_cuda(const float * grad, const float * xf, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
rms_norm_back_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(grad, xf, dst, ncols, eps);
} else {
const dim3 block_dims(1024, 1, 1);
rms_norm_back_f32<1024><<<nrows, block_dims, 0, stream>>>(grad, xf, dst, ncols, eps);
}
}
void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
@ -179,6 +243,7 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps >= 0.0f);
norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
}
@ -198,6 +263,7 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));
GGML_ASSERT(eps >= 0.0f);
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], eps, group_size, ggml_nelements(src0), stream);
@ -219,6 +285,33 @@ void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps >= 0.0f);
rms_norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
}
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * grad = dst->src[0]; // gradients
const ggml_tensor * src0f = dst->src[1]; // src0 from forward pass
const float * grad_d = (const float *) grad->data;
const float * src0f_d = (const float *) src0f->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(grad));
GGML_ASSERT( grad->type == GGML_TYPE_F32);
GGML_ASSERT(src0f->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0f->ne[0];
const int64_t nrows = ggml_nrows(src0f);
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps >= 0.0f);
rms_norm_back_f32_cuda(grad_d, src0f_d, dst_d, ne00, nrows, eps, stream);
}
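A host-side reference of the same computation (a sketch one might use to cross-check the kernel for a single row, not part of the diff):

```cpp
#include <math.h>

// Reference sketch: RMS-norm backward for one row of n elements.
// grad = dL/dy, x = the forward-pass input, dx = dL/dx (output).
static void rms_norm_back_ref(const float * grad, const float * x, float * dx, int n, float eps) {
    float sum_xx = 0.0f;
    float sum_xg = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum_xx += x[i]*x[i];
        sum_xg += x[i]*grad[i];
    }
    const float mean_eps   = sum_xx/n + eps;
    const float scale_grad = 1.0f/sqrtf(mean_eps);
    const float scale_x    = -scale_grad*sum_xg/(sum_xx + n*eps);
    for (int i = 0; i < n; ++i) {
        dx[i] = scale_grad*grad[i] + scale_x*x[i];
    }
}
```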

ggml/src/ggml-cuda/norm.cuh

@ -5,3 +5,5 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

ggml/src/ggml-cuda/out-prod.cu

@ -11,16 +11,15 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(ne01 == ne11);
GGML_ASSERT(ne0 == ne00);
GGML_ASSERT(ne1 == ne10);
GGML_ASSERT(ne2 % src0->ne[2] == 0);
GGML_ASSERT(ne3 % src0->ne[3] == 0);
GGML_ASSERT(ne2 == src1->ne[2]);
GGML_ASSERT(ne3 == src1->ne[3]);
const float * src0_d = (const float *) src0->data;
@ -33,8 +32,6 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const float alpha = 1.0f;
const float beta = 0.0f;
CUBLAS_CHECK(cublasSetStream(handle, stream));
const bool src1_T = ggml_is_transposed(src1);
@ -42,10 +39,27 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
GGML_ASSERT( (src1_T ? nb11 : nb10) == sizeof(float));
// data strides in dimensions 2/3
const size_t s02 = nb02 / sizeof(float);
const size_t s03 = nb03 / sizeof(float);
const size_t s12 = nb12 / sizeof(float);
const size_t s13 = nb13 / sizeof(float);
const size_t s2 = nb2 / sizeof(float);
const size_t s3 = nb3 / sizeof(float);
// dps == dst per src0, used for group query attention
const int64_t dps2 = ne2 / ne02;
const int64_t dps3 = ne3 / ne03;
// TODO batched matrix multiplication
for (int64_t i3 = 0; i3 < ne3; ++i3) {
for (int64_t i2 = 0; i2 < ne2; ++i2) {
CUBLAS_CHECK(
cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
ne0, ne1, ne01,
&alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, ne00,
src1_d + i3 *s13 + i2 *s12, ldb,
&beta, dst_d + i3 *s3 + i2 *s2, ne0));
}
}
}
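The broadcast factors `dps2`/`dps3` let several dst slices share one src0 slice (the group-query-attention case), while src1 and dst still map one-to-one. With hypothetical head counts, the mapping looks like this:

```cpp
#include <cstdio>

// Illustration only (hypothetical sizes): which src0 slice each dst slice reads
// under the dps2 broadcast used in the loop above.
int main() {
    const long ne02 = 8, ne2 = 32;   // e.g. 8 KV heads broadcast to 32 query heads
    const long dps2 = ne2 / ne02;    // 4 dst slices per src0 slice
    for (long i2 = 0; i2 < ne2; ++i2) {
        printf("dst slice %2ld <- src0 slice %ld\n", i2, i2 / dps2);
    }
    return 0;
}
```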

ggml/src/ggml-cuda/rope.cu

@ -16,9 +16,10 @@ static __device__ float rope_yarn_ramp(const float low, const float high, const
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
template<bool forward>
static __device__ void rope_yarn(
const float theta_extrap, const float freq_scale, const rope_corr_dims corr_dims, const int64_t i0, const float ext_factor,
float mscale, float & cos_theta, float & sin_theta) {
// Get n-d rotational scaling corrected for extrapolation
float theta_interp = freq_scale * theta_extrap;
float theta = theta_interp;
@ -29,24 +30,28 @@ static __device__ void rope_yarn(
// Get n-d magnitude scaling corrected for interpolation
mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
}
cos_theta = cosf(theta) * mscale;
sin_theta = sinf(theta) * mscale;
if (!forward) {
sin_theta *= -1.0f;
}
}
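The `forward` template parameter only flips the sign of `sin_theta`; since a 2D rotation is orthogonal, rotating by $-\theta$ is both its inverse and its transpose, which is exactly what the backward (gradient) pass needs. A standalone round-trip check (illustrative, not part of the diff), with `mscale` taken as 1:

```cpp
#include <cstdio>
#include <cmath>

// Rotating a pair by theta and then by -theta recovers the original values,
// which is why the backward RoPE kernels only negate sin_theta.
int main() {
    const float theta = 0.3f;
    const float x0 = 1.5f, x1 = -2.0f;

    const float c = cosf(theta), s = sinf(theta);
    const float y0 = x0*c - x1*s, y1 = x0*s + x1*c;   // forward rotation
    const float z0 = y0*c + y1*s, z1 = -y0*s + y1*c;  // rotation by -theta

    printf("x = (%.3f, %.3f), round-trip = (%.3f, %.3f)\n", x0, x1, z0, z1);
    return 0;
}
```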
template<typename T, bool has_ff> template<bool forward, bool has_ff, typename T>
static __global__ void rope_norm( static __global__ void rope_norm(
const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) { const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) { if (i0 >= ne0) {
return; return;
} }
const int row = blockDim.x*blockIdx.x + threadIdx.x; const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
if (i0 >= n_dims) { if (i0 >= n_dims) {
const int i = row*ne0 + i0; const int i = row_dst*ne0 + i0;
dst[i + 0] = x[i + 0]; dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1]; dst[i + 1] = x[i + 1];
@ -54,39 +59,43 @@ static __global__ void rope_norm(
return; return;
} }
const int i = row*ne0 + i0; const int row_x = row_dst % ne1;
const int i2 = row/p_delta_rows; const int channel_x = row_dst / ne1;
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); const int idst = row_dst*ne0 + i0;
const int ix = channel_x*s2 + row_x*s1 + i0;
const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
float cos_theta; float cos_theta;
float sin_theta; float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
const float x0 = x[i + 0]; const float x0 = x[ix + 0];
const float x1 = x[i + 1]; const float x1 = x[ix + 1];
dst[i + 0] = x0*cos_theta - x1*sin_theta; dst[idst + 0] = x0*cos_theta - x1*sin_theta;
dst[i + 1] = x0*sin_theta + x1*cos_theta; dst[idst + 1] = x0*sin_theta + x1*cos_theta;
} }
template<typename T, bool has_ff> template<bool forward, bool has_ff, typename T>
static __global__ void rope_neox( static __global__ void rope_neox(
const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) { const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) { if (i0 >= ne0) {
return; return;
} }
const int row = blockDim.x*blockIdx.x + threadIdx.x; const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
if (i0 >= n_dims) { if (i0 >= n_dims) {
const int i = row*ne0 + i0; const int i = row_dst*ne0 + i0;
dst[i + 0] = x[i + 0]; dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1]; dst[i + 1] = x[i + 1];
@ -94,39 +103,43 @@ static __global__ void rope_neox(
return; return;
} }
const int i = row*ne0 + i0/2; const int row_x = row_dst % ne1;
const int i2 = row/p_delta_rows; const int channel_x = row_dst / ne1;
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); const int idst = row_dst*ne0 + i0/2;
const int ix = channel_x*s2 + row_x*s1 + i0/2;
const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
float cos_theta; float cos_theta;
float sin_theta; float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
const float x0 = x[i + 0]; const float x0 = x[ix + 0];
const float x1 = x[i + n_dims/2]; const float x1 = x[ix + n_dims/2];
dst[i + 0] = x0*cos_theta - x1*sin_theta; dst[idst + 0] = x0*cos_theta - x1*sin_theta;
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta;
} }
template<typename T, bool has_ff> template<bool forward, bool has_ff, typename T>
static __global__ void rope_multi( static __global__ void rope_multi(
const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) { const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) { if (i0 >= ne0) {
return; return;
} }
const int row = blockDim.x*blockIdx.x + threadIdx.x; const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
if (i0 >= n_dims) { if (i0 >= n_dims) {
const int i = row*ne0 + i0; const int i = row_dst*ne0 + i0;
dst[i + 0] = x[i + 0]; dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1]; dst[i + 1] = x[i + 1];
@ -134,25 +147,28 @@ static __global__ void rope_multi(
return; return;
} }
const int i = row*ne0 + i0/2; const int row_x = row_dst % ne1;
const int i2 = row/p_delta_rows; const int channel_x = row_dst / ne1;
int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3]; const int idst = row_dst*ne0 + i0/2;
int sec_w = sections.v[1] + sections.v[0]; const int ix = channel_x*s2 + row_x*s1 + i0/2;
int sector = (i0 / 2) % sect_dims;
const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
const int sec_w = sections.v[1] + sections.v[0];
const int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0; float theta_base = 0.0;
if (sector < sections.v[0]) { if (sector < sections.v[0]) {
theta_base = pos[i2]*powf(theta_scale, i0/2.0f); theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
} }
else if (sector >= sections.v[0] && sector < sec_w) { else if (sector >= sections.v[0] && sector < sec_w) {
theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f); theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
} }
else if (sector >= sec_w && sector < sec_w + sections.v[2]) { else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f); theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
} }
else if (sector >= sec_w + sections.v[2]) { else if (sector >= sec_w + sections.v[2]) {
theta_base = pos[i2 + ne2 * 3]*powf(theta_scale, i0/2.0f); theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
} }
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@ -160,42 +176,46 @@ static __global__ void rope_multi(
float cos_theta; float cos_theta;
float sin_theta; float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
const float x0 = x[i + 0]; const float x0 = x[ix + 0];
const float x1 = x[i + n_dims/2]; const float x1 = x[ix + n_dims/2];
dst[i + 0] = x0*cos_theta - x1*sin_theta; dst[idst + 0] = x0*cos_theta - x1*sin_theta;
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta;
} }
template<typename T, bool has_ff> template<bool forward, bool has_ff, typename T>
static __global__ void rope_vision( static __global__ void rope_vision(
const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) { const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
const float theta_scale, const float * freq_factors, const mrope_sections sections) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) { if (i0 >= ne0) {
return; return;
} }
const int row = blockDim.x*blockIdx.x + threadIdx.x; const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
const int i = row*ne0 + i0/2; const int row_x = row_dst % ne1;
const int i2 = row/p_delta_rows; // i2-th tokens const int channel_x = row_dst / ne1;
int sect_dims = sections.v[0] + sections.v[1]; const int idst = row_dst*ne0 + i0/2;
int sec_w = sections.v[1] + sections.v[0]; const int ix = channel_x*s2 + row_x*s1 + i0/2;
int sector = (i0 / 2) % sect_dims;
const int sect_dims = sections.v[0] + sections.v[1];
const int sec_w = sections.v[1] + sections.v[0];
const int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0; float theta_base = 0.0;
if (sector < sections.v[0]) { if (sector < sections.v[0]) {
const int p = sector; const int p = sector;
theta_base = pos[i2]*powf(theta_scale, p); theta_base = pos[channel_x]*powf(theta_scale, p);
} }
else if (sector >= sections.v[0] && sector < sec_w) { else if (sector >= sections.v[0] && sector < sec_w) {
const int p = sector - sections.v[0]; const int p = sector - sections.v[0];
theta_base = pos[i2 + ne2]*powf(theta_scale, p); theta_base = pos[channel_x + ne2]*powf(theta_scale, p);
} }
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@ -203,19 +223,20 @@ static __global__ void rope_vision(
float cos_theta; float cos_theta;
float sin_theta; float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
const float x0 = x[i + 0]; const float x0 = x[ix + 0];
const float x1 = x[i + n_dims]; const float x1 = x[ix + n_dims];
dst[i + 0] = x0*cos_theta - x1*sin_theta; dst[idst + 0] = x0*cos_theta - x1*sin_theta;
dst[i + n_dims] = x0*sin_theta + x1*cos_theta; dst[idst + n_dims] = x0*sin_theta + x1*cos_theta;
} }
template<typename T> template<bool forward, typename T>
static void rope_norm_cuda( static void rope_norm_cuda(
const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0); GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@ -224,22 +245,21 @@ static void rope_norm_cuda(
const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (freq_factors == nullptr) { if (freq_factors == nullptr) {
rope_norm<T, false><<<block_nums, block_dims, 0, stream>>>( rope_norm<forward, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
theta_scale, freq_factors attn_factor, corr_dims, theta_scale, freq_factors);
);
} else { } else {
rope_norm<T, true><<<block_nums, block_dims, 0, stream>>>( rope_norm<forward, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
theta_scale, freq_factors attn_factor, corr_dims, theta_scale, freq_factors);
);
} }
} }
template<typename T> template<bool forward, typename T>
static void rope_neox_cuda( static void rope_neox_cuda(
const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0); GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@ -248,22 +268,21 @@ static void rope_neox_cuda(
const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (freq_factors == nullptr) { if (freq_factors == nullptr) {
rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>( rope_neox<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
theta_scale, freq_factors attn_factor, corr_dims, theta_scale, freq_factors);
);
} else { } else {
rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>( rope_neox<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
theta_scale, freq_factors attn_factor, corr_dims, theta_scale, freq_factors);
);
} }
} }
template<typename T> template<bool forward, typename T>
static void rope_multi_cuda( static void rope_multi_cuda(
const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) { const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0); GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@ -272,22 +291,21 @@ static void rope_multi_cuda(
const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (freq_factors == nullptr) { if (freq_factors == nullptr) {
rope_multi<T, false><<<block_nums, block_dims, 0, stream>>>( rope_multi<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
theta_scale, freq_factors, sections attn_factor, corr_dims, theta_scale, freq_factors, sections);
);
} else { } else {
rope_multi<T, true><<<block_nums, block_dims, 0, stream>>>( rope_multi<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
theta_scale, freq_factors, sections attn_factor, corr_dims, theta_scale, freq_factors, sections);
);
} }
} }
template<typename T> template<bool forward, typename T>
static void rope_vision_cuda( static void rope_vision_cuda(
const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) { const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0); GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@ -298,80 +316,18 @@ static void rope_vision_cuda(
const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (freq_factors == nullptr) { if (freq_factors == nullptr) {
rope_vision<T, false><<<block_nums, block_dims, 0, stream>>>( rope_vision<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
theta_scale, freq_factors, sections attn_factor, corr_dims, theta_scale, freq_factors, sections);
);
} else { } else {
rope_vision<T, true><<<block_nums, block_dims, 0, stream>>>( rope_vision<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
theta_scale, freq_factors, sections attn_factor, corr_dims, theta_scale, freq_factors, sections);
);
} }
} }
template <bool forward>
void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * src2 = dst->src[2]; const ggml_tensor * src2 = dst->src[2];
@ -382,7 +338,6 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data; float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
GGML_ASSERT(src0->type == dst->type); GGML_ASSERT(src0->type == dst->type);
@ -392,6 +347,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t ne02 = src0->ne[2]; // num heads const int64_t ne02 = src0->ne[2]; // num heads
const int64_t nr = ggml_nrows(src0); const int64_t nr = ggml_nrows(src0);
const size_t s01 = src0->nb[1] / ggml_type_size(src0->type);
const size_t s02 = src0->nb[2] / ggml_type_size(src0->type);
//const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
@ -440,59 +398,59 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
// compute // compute
if (is_neox) { if (is_neox) {
if (src0->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32) {
rope_neox_cuda_f32( rope_neox_cuda<forward>(
(const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, (const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
attn_factor, corr_dims, freq_factors, stream freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
);
} else if (src0->type == GGML_TYPE_F16) { } else if (src0->type == GGML_TYPE_F16) {
rope_neox_cuda_f16( rope_neox_cuda<forward>(
(const half *)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, (const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
attn_factor, corr_dims, freq_factors, stream freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
);
} else { } else {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
} else if (is_mrope && !is_vision) { } else if (is_mrope && !is_vision) {
if (src0->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32) {
rope_multi_cuda_f32( rope_multi_cuda<forward>(
(const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
attn_factor, corr_dims, freq_factors, sections, stream freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
);
} else if (src0->type == GGML_TYPE_F16) { } else if (src0->type == GGML_TYPE_F16) {
rope_multi_cuda_f16( rope_multi_cuda<forward>(
(const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
attn_factor, corr_dims, freq_factors, sections, stream freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
);
} else { } else {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
} else if (is_vision) { } else if (is_vision) {
if (src0->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32) {
rope_vision_cuda_f32( rope_vision_cuda<forward>(
(const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, (const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
attn_factor, corr_dims, freq_factors, sections, stream freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
);
} else if (src0->type == GGML_TYPE_F16) { } else if (src0->type == GGML_TYPE_F16) {
rope_vision_cuda_f16( rope_vision_cuda<forward>(
(const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, (const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
attn_factor, corr_dims, freq_factors, sections, stream freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
);
} else { } else {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
} else { } else {
if (src0->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32) {
rope_norm_cuda_f32( rope_norm_cuda<forward>(
(const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, (const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
attn_factor, corr_dims, freq_factors, stream freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
);
} else if (src0->type == GGML_TYPE_F16) { } else if (src0->type == GGML_TYPE_F16) {
rope_norm_cuda_f16( rope_norm_cuda<forward>(
(const half *)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, (const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
attn_factor, corr_dims, freq_factors, stream freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
);
} else { } else {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
} }
} }
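The launcher above now derives element strides `s01 = nb[1]/type_size` and `s02 = nb[2]/type_size` and forwards them to the kernels, which is presumably why the old `ggml_is_contiguous(src0)` assertion was dropped. A minimal sketch of stride-based addressing under the assumption that dimension 0 stays contiguous; the helper below is illustrative and not part of the diff:

```cpp
// Illustrative helper, not part of the diff: address an element of a possibly
// non-contiguous tensor using element strides s1 (per row) and s2 (per matrix),
// assuming the innermost dimension is contiguous.
#include <cstddef>

static inline const float * rope_src_ptr(const float * x, int i0, int i1, int i2,
                                         std::size_t s1, std::size_t s2) {
    return x + (std::size_t) i2*s2 + (std::size_t) i1*s1 + (std::size_t) i0;
}
```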
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_rope_impl<true>(ctx, dst);
}
void ggml_cuda_op_rope_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_rope_impl<false>(ctx, dst);
}
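`ggml_cuda_op_rope` and `ggml_cuda_op_rope_back` now share a single implementation selected by a compile-time `forward` flag. A minimal sketch of the idea, assuming the backward pass is realized as the inverse rotation (negated angle); `rope_pair` is a hypothetical helper, not the kernel from this diff:

```cpp
// Sketch only: a compile-time `forward` flag lets one templated RoPE routine
// serve both the forward op and its backward pass by applying the inverse
// rotation when forward == false.
#include <cmath>

template <bool forward>
static void rope_pair(float & x0, float & x1, float theta) {
    const float t = forward ? theta : -theta; // backward = rotate by -theta
    const float c = std::cos(t);
    const float s = std::sin(t);
    const float y0 = x0*c - x1*s;
    const float y1 = x0*s + x1*c;
    x0 = y0;
    x1 = y1;
}
```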

View file

@ -3,3 +3,5 @@
#define CUDA_ROPE_BLOCK_SIZE 256 #define CUDA_ROPE_BLOCK_SIZE 256
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_rope_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -1,5 +1,7 @@
#include "common.cuh" #include "common.cuh"
#include "ggml.h"
#include "softmax.cuh" #include "softmax.cuh"
#include <cstdint>
template <typename T> template <typename T>
static __device__ __forceinline__ float t2f32(T val) { static __device__ __forceinline__ float t2f32(T val) {
@ -11,14 +13,20 @@ __device__ float __forceinline__ t2f32<half>(half val) {
return __half2float(val); return __half2float(val);
} }
template <bool vals_smem, int ncols_template, int block_size_template, typename T> template <bool use_shared, int ncols_template, int block_size_template, typename T>
static __global__ void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) { static __global__ void soft_max_f32(
const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
const int ncols = ncols_template == 0 ? ncols_par : ncols_template; const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
const int tid = threadIdx.x; const int tid = threadIdx.x;
const int rowx = blockIdx.x; const int rowx = blockIdx.x;
const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
x += int64_t(rowx)*ncols;
mask += int64_t(rowy)*ncols * (mask != nullptr);
dst += int64_t(rowx)*ncols;
const int block_size = block_size_template == 0 ? blockDim.x : block_size_template; const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
const int warp_id = threadIdx.x / WARP_SIZE; const int warp_id = threadIdx.x / WARP_SIZE;
@ -29,7 +37,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, float * dst
extern __shared__ float data_soft_max_f32[]; extern __shared__ float data_soft_max_f32[];
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
// shared memory buffer to cache values between iterations: // shared memory buffer to cache values between iterations:
float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols; float * vals = use_shared ? buf_iw + WARP_SIZE : dst;
float max_val = -INFINITY; float max_val = -INFINITY;
@ -41,10 +49,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, float * dst
break; break;
} }
const int64_t ix = (int64_t)rowx*ncols + col; const float val = x[col]*scale + (mask ? slope*t2f32(mask[col]) : 0.0f);
const int64_t iy = (int64_t)rowy*ncols + col;
const float val = x[ix]*scale + (mask ? slope*t2f32(mask[iy]) : 0.0f);
vals[col] = val; vals[col] = val;
max_val = max(max_val, val); max_val = max(max_val, val);
@ -110,8 +115,29 @@ static __global__ void soft_max_f32(const float * x, const T * mask, float * dst
return; return;
} }
const int64_t idst = (int64_t)rowx*ncols + col; dst[col] = vals[col] * inv_sum;
dst[idst] = vals[col] * inv_sum; }
}
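In the reworked forward kernel each block advances its base pointers (`x`, `mask`, `dst`) to the start of its row once, so the inner loops index by `col` alone instead of recomputing `rowx*ncols + col`. A host-side sketch of that pattern; the names mirror the kernel's variables and the helper itself is illustrative, not part of the diff:

```cpp
// Illustrative only: offset the per-row base pointers once, as the kernel does
// at block start, so all later accesses use the column index directly.
#include <cstdint>

static void offset_row_pointers(const float *& x, const float *& mask, float *& dst,
                                int64_t rowx, int64_t rowy, int64_t ncols) {
    x   += rowx*ncols;
    dst += rowx*ncols;
    if (mask != nullptr) {
        mask += rowy*ncols;   // mask is broadcast across rows, hence rowy
    }
}
```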
static __global__ void soft_max_back_f32(
const float * grad, const float * dstf, float * dst, const int ncols, const float scale) {
const int tid = threadIdx.x;
const int rowx = blockIdx.x;
grad += int64_t(rowx)*ncols;
dstf += int64_t(rowx)*ncols;
dst += int64_t(rowx)*ncols;
float dgf_dot = 0.0f; // dot product of dst from forward pass and gradients
for (int col = tid; col < ncols; col += WARP_SIZE) {
dgf_dot += dstf[col]*grad[col];
}
dgf_dot = warp_reduce_sum(dgf_dot);
for (int col = tid; col < ncols; col += WARP_SIZE) {
dst[col] = scale * (grad[col] - dgf_dot) * dstf[col];
} }
} }
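The new `soft_max_back_f32` kernel computes the softmax vector-Jacobian product: with forward output y and incoming gradient g, dx[i] = scale * y[i] * (g[i] - sum_j g[j]*y[j]), which is exactly the warp-reduced dot product followed by the second loop. A CPU reference for one row under the same layout; `soft_max_back_row` is an illustrative name, not part of the diff:

```cpp
// CPU reference for one row, matching the kernel's formula.
#include <cstddef>

static void soft_max_back_row(const float * grad, const float * y, float * dx,
                              std::size_t ncols, float scale) {
    float dot = 0.0f;                        // dot(grad, y), the reduced term
    for (std::size_t i = 0; i < ncols; ++i) {
        dot += grad[i]*y[i];
    }
    for (std::size_t i = 0; i < ncols; ++i) {
        dx[i] = scale * (grad[i] - dot) * y[i];
    }
}
```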
@ -121,7 +147,7 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
const dim3 block_dims(nth, 1, 1); const dim3 block_dims(nth, 1, 1);
const dim3 block_nums(nrows_x, 1, 1); const dim3 block_nums(nrows_x, 1, 1);
const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float); const size_t nbytes_shared = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
const uint32_t n_head = nrows_x/nrows_y; const uint32_t n_head = nrows_x/nrows_y;
@ -131,50 +157,68 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
// FIXME: this limit could be raised by ~2-4x on Ampere or newer // FIXME: this limit could be raised by ~2-4x on Ampere or newer
if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) { if (nbytes_shared < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
switch (ncols_x) { switch (ncols_x) {
case 32: case 32:
soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 32, 32><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 64: case 64:
soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 64, 64><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 128: case 128:
soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 128, 128><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 256: case 256:
soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 256, 256><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 512: case 512:
soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 512, 512><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 1024: case 1024:
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 2048: case 2048:
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 4096: case 4096:
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
default: default:
soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
} }
} else { } else {
const size_t shmem_low = WARP_SIZE*sizeof(float); const size_t nbytes_shared_low = WARP_SIZE*sizeof(float);
soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
} }
} }
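The launcher keeps the specialized shared-memory kernels only when one padded row plus the inter-warp buffer fits in the device's shared memory per block (`smpb`); otherwise it falls back to the generic kernel, which, per the `use_shared` flag, spills intermediate values to `dst`. A sketch of the budget check; `softmax_fits_in_smem` is an illustrative name:

```cpp
// Sketch of the shared-memory budget check used above: one float per padded
// column plus WARP_SIZE floats for the inter-warp reduction buffer.
#include <cstddef>

static bool softmax_fits_in_smem(int ncols, int warp_size, std::size_t smpb) {
    const std::size_t padded = ((std::size_t) ncols + warp_size - 1) / warp_size * warp_size; // GGML_PAD
    const std::size_t nbytes = (padded + warp_size) * sizeof(float);
    return nbytes < smpb;   // otherwise fall back to the generic kernel
}
```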
static void soft_max_back_f32_cuda(
const float * grad, const float * dstf, float * dst,
const int ncols, const int nrows, const float scale, cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(nrows, 1, 1);
soft_max_back_f32<<<block_nums, block_dims, 0, stream>>>(grad, dstf, dst, ncols, scale);
}
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
const float * src0_d = (const float *)src0->data; const float * src0_d = (const float *) src0->data;
const void * src1_d = src1 ? (const void *)src1->data : nullptr; const void * src1_d = src1 ? (const void *) src1->data : nullptr;
float * dst_d = (float *) dst->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream(); cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
@ -189,18 +233,42 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float scale = 1.0f; float scale = 1.0f;
float max_bias = 0.0f; float max_bias = 0.0f;
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16); const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
if (use_f16) { if (use_f16) {
const half * src1_dd = (const half *)src1_d; soft_max_f32_cuda(src0_d, (const half *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
} else { } else {
const float * src1_dd = (const float *)src1_d; soft_max_f32_cuda(src0_d, (const float *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
} }
} }
void ggml_cuda_op_soft_max_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; // grad
const ggml_tensor * src1 = dst->src[1]; // forward pass output
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
GGML_ASSERT(max_bias == 0.0f);
soft_max_back_f32_cuda(src0_d, src1_d, dst_d, ncols, nrows, scale, stream);
}

View file

@ -3,3 +3,5 @@
#define CUDA_SOFT_MAX_BLOCK_SIZE 1024 #define CUDA_SOFT_MAX_BLOCK_SIZE 1024
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_soft_max_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -51,6 +51,19 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
dst[i] = x[i] / (1.0f + expf(-x[i])); dst[i] = x[i] / (1.0f + expf(-x[i]));
} }
static __global__ void silu_back_f32(
const float * grad, const float * xf, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
const float xfi = xf[i];
const float s = 1.0f / (1.0f + expf(-xfi));
dst[i] = grad[i] * s * (1.0f + xfi * (1.0f - s));
}
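`silu_back_f32` applies the SiLU derivative: with s = sigmoid(x), d/dx [x * sigmoid(x)] = s * (1 + x * (1 - s)), scaled by the incoming gradient. A scalar reference of the same formula; `silu_grad` is an illustrative name, not part of the diff:

```cpp
// Scalar reference for the SiLU backward kernel above.
#include <cmath>

static float silu_grad(float grad, float x) {
    const float s = 1.0f / (1.0f + std::exp(-x));   // sigmoid(x)
    return grad * s * (1.0f + x * (1.0f - s));      // grad * d/dx [x * sigmoid(x)]
}
```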
static __global__ void tanh_f32(const float * x, float * dst, int k) { static __global__ void tanh_f32(const float * x, float * dst, int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x; const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) { if (i >= k) {
@ -173,6 +186,11 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k); silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
} }
static void silu_back_f32_cuda(const float * grad, const float * x, float * dst, const int k, cudaStream_t stream) {

const int num_blocks = (k + CUDA_SILU_BACK_BLOCK_SIZE - 1) / CUDA_SILU_BACK_BLOCK_SIZE;
silu_back_f32<<<num_blocks, CUDA_SILU_BACK_BLOCK_SIZE, 0, stream>>>(grad, x, dst, k);
}
static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE; const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k); tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@ -284,6 +302,24 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
silu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); silu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
} }
void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; // input from forward pass
const ggml_tensor * src1 = dst->src[1]; // grads of forward pass output
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
silu_back_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data; const float * src0_d = (const float *)src0->data;

View file

@ -4,6 +4,7 @@
#define CUDA_STEP_BLOCK_SIZE 256 #define CUDA_STEP_BLOCK_SIZE 256
#define CUDA_GELU_BLOCK_SIZE 256 #define CUDA_GELU_BLOCK_SIZE 256
#define CUDA_SILU_BLOCK_SIZE 256 #define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_SILU_BACK_BLOCK_SIZE 256
#define CUDA_TANH_BLOCK_SIZE 256 #define CUDA_TANH_BLOCK_SIZE 256
#define CUDA_RELU_BLOCK_SIZE 256 #define CUDA_RELU_BLOCK_SIZE 256
#define CUDA_SIGMOID_BLOCK_SIZE 256 #define CUDA_SIGMOID_BLOCK_SIZE 256
@ -23,6 +24,8 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -73,9 +73,9 @@ void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
const float * s_d = (const float *)dst->src[5]->data; const float * s_d = (const float *)dst->src[5]->data;
const int64_t B = dst->src[5]->ne[1]; const int64_t B = dst->src[5]->ne[1];
const int64_t T = dst->src[0]->ne[3]; const int64_t T = dst->src[0]->ne[2];
const int64_t C = dst->ne[0]; const int64_t C = dst->ne[0];
const int64_t H = dst->src[0]->ne[2]; const int64_t H = dst->src[0]->ne[1];
float * dst_d = (float *)dst->data; float * dst_d = (float *)dst->data;

View file

@ -70,7 +70,9 @@ ggml_add_backend_library(ggml-hip
) )
# TODO: do not use CUDA definitions for HIP # TODO: do not use CUDA definitions for HIP
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) if (NOT GGML_BACKEND_DL)
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
endif()
add_compile_definitions(GGML_USE_HIP) add_compile_definitions(GGML_USE_HIP)

View file

@ -29,5 +29,6 @@
#include "wkv6.hpp" #include "wkv6.hpp"
#include "outprod.hpp" #include "outprod.hpp"
#include "element_wise.hpp" #include "element_wise.hpp"
#include "gla.hpp"
#endif // GGML_SYCL_BACKEND_HPP #endif // GGML_SYCL_BACKEND_HPP

View file

@ -51,6 +51,10 @@ void ggml_sycl_host_free(void* ptr) try {
std::exit(1); std::exit(1);
} }
bool gpu_has_xmx(sycl::device &dev) {
return dev.has(sycl::aspect::ext_intel_matrix);
}
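`gpu_has_xmx` wraps the Intel `ext_intel_matrix` aspect query and is used further down to extend the device table. A standalone usage sketch, assuming a SYCL 2020 toolchain with the Intel extension aspects available; only the aspect query itself comes from the diff, the rest is illustrative:

```cpp
// Usage sketch: report whether the default device exposes Intel XMX units.
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
    sycl::device dev(sycl::default_selector_v);
    const bool has_xmx = dev.has(sycl::aspect::ext_intel_matrix);
    std::printf("XMX (Intel matrix) support: %s\n", has_xmx ? "yes" : "no");
    return 0;
}
```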
int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) { int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) {
const int64_t max_range = std::numeric_limits<int>::max(); const int64_t max_range = std::numeric_limits<int>::max();
int64_t sycl_down_blk_size = block_size; int64_t sycl_down_blk_size = block_size;

View file

@ -662,6 +662,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
} }
} }
bool gpu_has_xmx(sycl::device &dev);
void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst, const ggml_tensor *src1, ggml_tensor *dst,

View file

@ -158,8 +158,9 @@ static void concat_f32_sycl_non_cont(
}); });
} }
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
const ggml_tensor *src1, ggml_tensor *dst) { const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
queue_ptr stream = ctx.stream(); queue_ptr stream = ctx.stream();
const int32_t dim = ((int32_t *)dst->op_params)[0]; const int32_t dim = ((int32_t *)dst->op_params)[0];

View file

@ -15,7 +15,6 @@
#include "common.hpp" #include "common.hpp"
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
const ggml_tensor *src1, ggml_tensor *dst);
#endif // GGML_SYCL_CONCAT_HPP #endif // GGML_SYCL_CONCAT_HPP

View file

@ -71,8 +71,9 @@ static void conv_transpose_1d_f32_f32_sycl(
}); });
} }
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
const ggml_tensor *src1, ggml_tensor *dst) { const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
const float * src0_d = (const float *)src0->data; const float * src0_d = (const float *)src0->data;
const float * src1_d = (const float *)src1->data; const float * src1_d = (const float *)src1->data;

View file

@ -15,7 +15,6 @@
#include "common.hpp" #include "common.hpp"
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
const ggml_tensor *src1, ggml_tensor *dst);
#endif // GGML_SYCL_CONV_HPP #endif // GGML_SYCL_CONV_HPP

View file

@ -882,149 +882,149 @@ inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor
} }
void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sqrt); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqrt);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_sin(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sin); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sin);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_cos(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_cos); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_cos);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_acc(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_acc); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_acc);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_silu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_silu); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_silu);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu_quick); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu_quick);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_tanh); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_tanh);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_relu); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_relu);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sigmoid); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sigmoid);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardsigmoid); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardsigmoid);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardswish); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardswish);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_exp(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_exp); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_exp);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_log(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_log); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_log);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_neg(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_neg); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_neg);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_step(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_step); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_step);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_leaky_relu); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_leaky_relu);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sqr); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqr);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_upscale);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_pad); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_pad);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_add(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_add); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_add);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sub); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sub);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_mul); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_mul);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
void ggml_sycl_div(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_div); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_div);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
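All of these element-wise wrappers move from the `(ctx, src0, src1, dst)` signature to `(ctx, dst)` and read their inputs from `dst->src[]`; together with the removal of the `ggml_sycl_func_t` typedef further down, this lets the dispatcher call every op the same way. A sketch of the pattern with a hypothetical op name:

```cpp
// Hypothetical op following the new convention; ggml_sycl_op_flatten,
// ggml_sycl_op_sqr and GGML_SYCL_DEBUG are the helpers used throughout the
// diff, while ggml_sycl_example itself is not a real op.
static void ggml_sycl_example(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_SYCL_DEBUG("call %s\n", __func__);
    // src1 may be nullptr for unary ops, as in the wrappers above
    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqr);
    GGML_SYCL_DEBUG("call %s done\n", __func__);
}
```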

View file

@ -25,52 +25,52 @@ static __dpct_inline__ float op_div(const float a, const float b) {
} }
void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_sin(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_cos(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_acc(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_silu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_exp(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_log(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_neg(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_step(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_add(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
void ggml_sycl_div(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
#endif // GGML_SYCL_ELEMENTWISE_HPP #endif // GGML_SYCL_ELEMENTWISE_HPP

View file

@ -54,18 +54,12 @@ static ggml_sycl_device_info ggml_sycl_init() {
GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES); GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES);
int64_t total_vram = 0; int64_t total_vram = 0;
#if defined(GGML_SYCL_FORCE_MMQ) /* This is a bit misleading; reserved for later */
GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__); // #if defined(SYCL_USE_XMX)
#else // GGML_LOG_INFO("%s: SYCL_USE_XMX: yes\n", __func__);
GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: no\n", __func__); // #else
#endif // GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
#if defined(SYCL_USE_XMX) // #endif
GGML_LOG_INFO("%s: SYCL_USE_XMX: yes\n", __func__);
#else
GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
#endif
GGML_LOG_INFO("%s: found %d %s devices:\n", __func__, info.device_count, GGML_SYCL_NAME);
for (int i = 0; i < info.device_count; ++i) { for (int i = 0; i < info.device_count; ++i) {
info.devices[i].vmm = 0; info.devices[i].vmm = 0;
dpct::device_info prop; dpct::device_info prop;
@ -109,11 +103,11 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
name = std::regex_replace(name, std::regex("\\(TM\\)"), ""); name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
auto global_mem_size = prop.get_global_mem_size()/1000000; auto global_mem_size = prop.get_global_mem_size()/1000000;
std::string xmx = gpu_has_xmx(device) ? "yes" : "no";
GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(), GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|%14s|\n", id, device_type.c_str(),
name.c_str(), version.c_str(), prop.get_max_compute_units(), name.c_str(), version.c_str(), prop.get_max_compute_units(),
prop.get_max_work_group_size(), prop.get_max_sub_group_size(), prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str()); global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str(), xmx.c_str());
} }
void ggml_backend_sycl_print_sycl_devices() { void ggml_backend_sycl_print_sycl_devices() {
@ -124,16 +118,16 @@ void ggml_backend_sycl_print_sycl_devices() {
GGML_LOG_INFO( GGML_LOG_INFO(
"| | | | " "| | | | "
" |Max | |Max |Global | |\n"); " |Max | |Max |Global | | XMX |\n");
GGML_LOG_INFO( GGML_LOG_INFO(
"| | | | " "| | | | "
" |compute|Max work|sub |mem | |\n"); " |compute|Max work|sub |mem | | or |\n");
GGML_LOG_INFO( GGML_LOG_INFO(
"|ID| Device Type| " "|ID| Device Type| "
"Name|Version|units |group |group|size | Driver version|\n"); "Name|Version|units |group |group|size | Driver version| Tensor Cores |\n");
GGML_LOG_INFO( GGML_LOG_INFO(
"|--|-------------------|---------------------------------------|------" "|--|-------------------|---------------------------------------|------"
"-|-------|--------|-----|-------|---------------------|\n"); "-|-------|--------|-----|-------|---------------------|--------------|\n");
for (int id = 0; id < device_count; ++id) { for (int id = 0; id < device_count; ++id) {
sycl::device device = dpct::dev_mgr::instance().get_device(id); sycl::device device = dpct::dev_mgr::instance().get_device(id);
@ -164,14 +158,18 @@ static void ggml_check_sycl() try {
static bool initialized = false; static bool initialized = false;
if (!initialized) { if (!initialized) {
GGML_LOG_INFO("[SYCL] call ggml_check_sycl\n"); GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
GGML_LOG_INFO("%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug); GGML_LOG_INFO("GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
#if defined(GGML_SYCL_FORCE_MMQ)
#if defined(GGML_SYCL_F16) GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ: yes\n");
GGML_LOG_INFO("%s: GGML_SYCL_F16: yes\n", __func__);
#else #else
GGML_LOG_INFO("%s: GGML_SYCL_F16: no\n", __func__); GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ: no\n");
#endif
#if defined(GGML_SYCL_F16)
GGML_LOG_INFO("GGML_SYCL_F16: yes\n");
#else
GGML_LOG_INFO("GGML_SYCL_F16: no\n");
#endif #endif
/* NOT REMOVE, keep it for next optimize for XMX. /* NOT REMOVE, keep it for next optimize for XMX.
@ -1189,7 +1187,6 @@ std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(q
/// kernels /// kernels
typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
typedef void (*ggml_sycl_func_t)(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
typedef void (*ggml_sycl_op_mul_mat_t)( typedef void (*ggml_sycl_op_mul_mat_t)(
ggml_backend_sycl_context & ctx, ggml_backend_sycl_context & ctx,
const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
@ -3171,33 +3168,33 @@ catch (sycl::exception const &exc) {
} }
static void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_repeat); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_repeat);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_get_rows); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_get_rows);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_norm); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_norm);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_rms_norm); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rms_norm);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__); GGML_SYCL_DEBUG("call %s\n", __func__);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_group_norm); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_group_norm);
GGML_SYCL_DEBUG("call %s done\n", __func__); GGML_SYCL_DEBUG("call %s done\n", __func__);
} }
@ -3572,9 +3569,10 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(
} }
} }
static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
const ggml_tensor *src1,
ggml_tensor *dst) try { ggml_tensor *dst) try {
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers"); GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers");
const ggml_tensor *ids = dst->src[2]; const ggml_tensor *ids = dst->src[2];
@ -3740,12 +3738,12 @@ catch (sycl::exception const &exc) {
std::exit(1); std::exit(1);
} }
static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_scale); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_scale);
} }
static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_clamp); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_clamp);
} }
static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@ -3787,7 +3785,6 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
ggml_type_name(src0->type), ggml_type_name(src1->type)); ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
GGML_UNUSED(dst); GGML_UNUSED(dst);
} }
catch (sycl::exception const &exc) { catch (sycl::exception const &exc) {
@ -3796,59 +3793,52 @@ catch (sycl::exception const &exc) {
std::exit(1); std::exit(1);
} }
static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
// TODO: why do we pass dst as src1 here? // TODO: why do we pass dst as src1 here?
ggml_sycl_cpy(ctx, src0, dst, nullptr); ggml_sycl_cpy(ctx, dst->src[0], dst, nullptr);
GGML_UNUSED(src1);
} }
static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_diag_mask_inf); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_diag_mask_inf);
} }
static void ggml_sycl_soft_max(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_soft_max); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_soft_max);
} }
static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_rope); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rope);
} }
static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_pool2d); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_pool2d);
} }
static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_im2col); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_im2col);
} }
static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sum); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sum);
} }
static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sum_rows); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sum_rows);
} }
static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_argsort); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_argsort);
} }
static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(ggml_is_contiguous(src0)); GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_argmax); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_argmax);
} }
static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_UNUSED(src0);
GGML_UNUSED(src1);
GGML_UNUSED(dst);
GGML_UNUSED(ctx);
}
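The change repeated across these wrappers is purely mechanical: every op helper drops the explicit `src0`/`src1` parameters and receives only `dst`, recovering its inputs from `dst->src[]`. A minimal before/after sketch of the pattern (the `_old`/`_new` suffixes are illustrative only, not names used in the code):

// old shape: sources passed explicitly alongside dst
static void ggml_sycl_scale_old(ggml_backend_sycl_context & ctx,
                                const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_scale);
}

// new shape: only dst is passed; the sources are read back from dst->src[]
static void ggml_sycl_scale_new(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_scale);
}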
void ggml_sycl_set_main_device(const int main_device) try { void ggml_sycl_set_main_device(const int main_device) try {
if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) { if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
@ -3871,191 +3861,192 @@ catch (sycl::exception const &exc) {
std::exit(1); std::exit(1);
} }
bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * tensor) { bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
if (!g_sycl_loaded) return false; if (!g_sycl_loaded) return false;
ggml_sycl_func_t func; if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) {
ggml_sycl_set_peer_access(dst->src[1]->ne[1], ctx.device);
}
switch (tensor->op) { switch (dst->op) {
case GGML_OP_ARGMAX: case GGML_OP_ARGMAX:
func = ggml_sycl_argmax; ggml_sycl_argmax(ctx, dst);
break; break;
case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_CONV_TRANSPOSE_1D:
func = ggml_sycl_op_conv_transpose_1d; ggml_sycl_op_conv_transpose_1d(ctx, dst);
break; break;
case GGML_OP_REPEAT: case GGML_OP_REPEAT:
func = ggml_sycl_repeat; ggml_sycl_repeat(ctx, dst);
break; break;
case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS:
func = ggml_sycl_get_rows; ggml_sycl_get_rows(ctx, dst);
break; break;
case GGML_OP_DUP: case GGML_OP_DUP:
func = ggml_sycl_dup; ggml_sycl_dup(ctx, dst);
break; break;
case GGML_OP_ADD: case GGML_OP_ADD:
case GGML_OP_ADD1: // TODO: more efficient implementation case GGML_OP_ADD1: // TODO: more efficient implementation
func = ggml_sycl_add; ggml_sycl_add(ctx, dst);
break; break;
case GGML_OP_SUB: case GGML_OP_SUB:
func = ggml_sycl_sub; ggml_sycl_sub(ctx, dst);
break; break;
case GGML_OP_ACC: case GGML_OP_ACC:
func = ggml_sycl_acc; ggml_sycl_acc(ctx, dst);
break; break;
case GGML_OP_MUL: case GGML_OP_MUL:
func = ggml_sycl_mul; ggml_sycl_mul(ctx, dst);
break; break;
case GGML_OP_LOG: case GGML_OP_LOG:
func = ggml_sycl_log; ggml_sycl_log(ctx, dst);
break; break;
case GGML_OP_DIV: case GGML_OP_DIV:
func = ggml_sycl_div; ggml_sycl_div(ctx, dst);
break; break;
case GGML_OP_UNARY: case GGML_OP_UNARY:
switch (ggml_get_unary_op(tensor)) { switch (ggml_get_unary_op(dst)) {
case GGML_UNARY_OP_NEG: case GGML_UNARY_OP_NEG:
func = ggml_sycl_neg; ggml_sycl_neg(ctx, dst);
break; break;
case GGML_UNARY_OP_STEP: case GGML_UNARY_OP_STEP:
func = ggml_sycl_step; ggml_sycl_step(ctx, dst);
break; break;
case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU:
func = ggml_sycl_gelu; ggml_sycl_gelu(ctx, dst);
break; break;
case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_SILU:
func = ggml_sycl_silu; ggml_sycl_silu(ctx, dst);
break; break;
case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_GELU_QUICK:
func = ggml_sycl_gelu_quick; ggml_sycl_gelu_quick(ctx, dst);
break; break;
case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_TANH:
func = ggml_sycl_tanh; ggml_sycl_tanh(ctx, dst);
break; break;
case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_RELU:
func = ggml_sycl_relu; ggml_sycl_relu(ctx, dst);
break; break;
case GGML_UNARY_OP_SIGMOID: case GGML_UNARY_OP_SIGMOID:
func = ggml_sycl_sigmoid; ggml_sycl_sigmoid(ctx, dst);
break; break;
case GGML_UNARY_OP_HARDSIGMOID: case GGML_UNARY_OP_HARDSIGMOID:
func = ggml_sycl_hardsigmoid; ggml_sycl_hardsigmoid(ctx, dst);
break; break;
case GGML_UNARY_OP_HARDSWISH: case GGML_UNARY_OP_HARDSWISH:
func = ggml_sycl_hardswish; ggml_sycl_hardswish(ctx, dst);
break; break;
case GGML_UNARY_OP_EXP: case GGML_UNARY_OP_EXP:
func = ggml_sycl_exp; ggml_sycl_exp(ctx, dst);
break; break;
default: default:
return false; return false;
} }
break; break;
case GGML_OP_NORM: case GGML_OP_NORM:
func = ggml_sycl_norm; ggml_sycl_norm(ctx, dst);
break; break;
case GGML_OP_GROUP_NORM: case GGML_OP_GROUP_NORM:
func = ggml_sycl_group_norm; ggml_sycl_group_norm(ctx, dst);
break; break;
case GGML_OP_CONCAT: case GGML_OP_CONCAT:
func = ggml_sycl_op_concat; ggml_sycl_op_concat(ctx, dst);
break; break;
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
func = ggml_sycl_upscale; ggml_sycl_upscale(ctx, dst);
break; break;
case GGML_OP_PAD: case GGML_OP_PAD:
func = ggml_sycl_pad; ggml_sycl_pad(ctx, dst);
break; break;
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
func = ggml_sycl_leaky_relu; ggml_sycl_leaky_relu(ctx, dst);
break; break;
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
func = ggml_sycl_rms_norm; ggml_sycl_rms_norm(ctx, dst);
break; break;
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
return false; return false;
} }
func = ggml_sycl_mul_mat; /* ggml_sycl_mul_mat_id is dependent on ggml_sycl_mul_mat */
ggml_sycl_mul_mat(ctx, dst->src[0], dst->src[1], dst);
break; break;
case GGML_OP_MUL_MAT_ID: case GGML_OP_MUL_MAT_ID:
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
return false; return false;
} }
func = ggml_sycl_mul_mat_id; ggml_sycl_mul_mat_id(ctx, dst);
break; break;
case GGML_OP_OUT_PROD: case GGML_OP_OUT_PROD:
func = ggml_sycl_op_out_prod; ggml_sycl_op_out_prod(ctx, dst);
break; break;
case GGML_OP_SCALE: case GGML_OP_SCALE:
func = ggml_sycl_scale; ggml_sycl_scale(ctx, dst);
break; break;
case GGML_OP_SQR: case GGML_OP_SQR:
func = ggml_sycl_sqr; ggml_sycl_sqr(ctx, dst);
break; break;
case GGML_OP_SQRT: case GGML_OP_SQRT:
func = ggml_sycl_sqrt; ggml_sycl_sqrt(ctx, dst);
break; break;
case GGML_OP_SIN: case GGML_OP_SIN:
func = ggml_sycl_sin; ggml_sycl_sin(ctx, dst);
break; break;
case GGML_OP_COS: case GGML_OP_COS:
func = ggml_sycl_cos; ggml_sycl_cos(ctx, dst);
break; break;
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
func = ggml_sycl_clamp; ggml_sycl_clamp(ctx, dst);
break; break;
case GGML_OP_CPY: case GGML_OP_CPY:
func = ggml_sycl_cpy; ggml_sycl_cpy(ctx, dst->src[0], dst->src[1], dst);
break; break;
case GGML_OP_CONT: case GGML_OP_CONT:
func = ggml_sycl_dup; ggml_sycl_dup(ctx, dst);
break; break;
case GGML_OP_NONE: case GGML_OP_NONE:
case GGML_OP_RESHAPE: case GGML_OP_RESHAPE:
case GGML_OP_VIEW: case GGML_OP_VIEW:
case GGML_OP_PERMUTE: case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE: case GGML_OP_TRANSPOSE:
func = ggml_sycl_nop; GGML_SYCL_DEBUG("%s: Tensor NO-OP\n", __func__);
break; break;
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
func = ggml_sycl_diag_mask_inf; ggml_sycl_diag_mask_inf(ctx, dst);
break; break;
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
func = ggml_sycl_soft_max; ggml_sycl_soft_max(ctx, dst);
break; break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
func = ggml_sycl_rope; ggml_sycl_rope(ctx, dst);
break; break;
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
func = ggml_sycl_im2col; ggml_sycl_im2col(ctx, dst);
break; break;
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
func = ggml_sycl_pool2d; ggml_sycl_pool2d(ctx, dst);
break; break;
case GGML_OP_SUM: case GGML_OP_SUM:
func = ggml_sycl_sum; ggml_sycl_sum(ctx, dst);
break; break;
case GGML_OP_SUM_ROWS: case GGML_OP_SUM_ROWS:
func = ggml_sycl_sum_rows; ggml_sycl_sum_rows(ctx, dst);
break; break;
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
func = ggml_sycl_argsort; ggml_sycl_argsort(ctx, dst);
break; break;
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
func = ggml_sycl_op_timestep_embedding; ggml_sycl_op_timestep_embedding(ctx, dst);
break; break;
case GGML_OP_RWKV_WKV6: case GGML_OP_RWKV_WKV6:
func = ggml_sycl_op_rwkv_wkv6; ggml_sycl_op_rwkv_wkv6(ctx, dst);
break;
case GGML_OP_GATED_LINEAR_ATTN:
ggml_sycl_op_gated_linear_attn(ctx, dst);
break; break;
default: default:
return false; return false;
} }
if (tensor->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(tensor->src[0]->buffer)) {
ggml_sycl_set_peer_access(tensor->src[1]->ne[1], ctx.device);
}
func(ctx, tensor->src[0], tensor->src[1], tensor);
return true; return true;
} }
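With the wrappers taking only `dst`, the dispatcher no longer stores a `ggml_sycl_func_t` and invokes it after the switch; each case calls its handler directly, and the split-buffer peer-access setup moves ahead of the switch. A condensed sketch of the new control flow, assuming the handlers shown above (only one case listed; the `_sketch` suffix marks it as illustrative):

static bool ggml_sycl_compute_forward_sketch(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    // peer access is now configured before dispatch, mirroring the hunk above
    if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) {
        ggml_sycl_set_peer_access(dst->src[1]->ne[1], ctx.device);
    }
    switch (dst->op) {
        case GGML_OP_SCALE:
            ggml_sycl_scale(ctx, dst);   // direct call, no function-pointer indirection
            break;
        default:
            return false;                // op not handled by the SYCL backend
    }
    return true;
}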
@ -4519,6 +4510,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_LEAKY_RELU: case GGML_OP_LEAKY_RELU:
case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_RWKV_WKV6: case GGML_OP_RWKV_WKV6:
case GGML_OP_GATED_LINEAR_ATTN:
return true; return true;
default: default:
return false; return false;

ggml/src/ggml-sycl/gla.cpp Normal file

@ -0,0 +1,105 @@
#include <sycl/sycl.hpp>

#include "common.hpp"

template <u_int HEAD_SIZE>
static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B, u_int T, u_int C, u_int H, float scale,
                                         const float * k, const float * v, const float * r, const float * td,
                                         const float * s, float * dst) {
    const u_int head_size = HEAD_SIZE;
    const u_int state_size = C * head_size;
    const u_int n_seq_tokens = T / B;

    sycl::range<1> block_dims((C / H));
    sycl::range<1> grid_dims((B * H));

    stream->submit([&](sycl::handler & cgh) {
        /* local memory accessors */
        auto _k  = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
        auto _r  = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
        auto _td = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);

        cgh.parallel_for(sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) {
            u_int tid = item.get_local_id(0);
            u_int bid = item.get_group(0);

            u_int batch_i = bid / H;
            u_int head_i  = bid % H;

            float state[head_size];

#pragma unroll
            for (u_int i = 0; i < head_size; i++) {
                state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
            }

            for (u_int t = batch_i * n_seq_tokens * C + head_i * head_size + tid;
                 t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) {
                item.barrier(sycl::access::fence_space::local_space); // sync threads
                _k[tid]  = k[t];
                _r[tid]  = r[t];
                _td[tid] = td[t];
                item.barrier(sycl::access::fence_space::local_space); // sync threads

                const float _v = v[t];
                float y = 0;

                for (u_int j = 0; j < head_size; j += 4) {
                    const sycl::float4 & k  = (sycl::float4 &) (_k[j]);
                    const sycl::float4 & r  = (sycl::float4 &) (_r[j]);
                    const sycl::float4 & td = (sycl::float4 &) (_td[j]);
                    sycl::float4 & s = (sycl::float4 &) (state[j]);
                    sycl::float4 kv;

                    kv.x() = k.x() * _v;
                    kv.y() = k.y() * _v;
                    kv.z() = k.z() * _v;
                    kv.w() = k.w() * _v;

                    s.x() = s.x() * td.x() + kv.x();
                    s.y() = s.y() * td.y() + kv.y();
                    s.z() = s.z() * td.z() + kv.z();
                    s.w() = s.w() * td.w() + kv.w();

                    y += r.x() * s.x();
                    y += r.y() * s.y();
                    y += r.z() * s.z();
                    y += r.w() * s.w();
                }
                dst[t] = y * scale;
            }

#pragma unroll
            for (u_int i = 0; i < head_size; i++) {
                dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
            }
        });
    });
}

void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    const float * k_d  = static_cast<const float *>(dst->src[0]->data);
    const float * v_d  = static_cast<const float *>(dst->src[1]->data);
    const float * r_d  = static_cast<const float *>(dst->src[2]->data);
    const float * td_d = static_cast<const float *>(dst->src[3]->data);
    const float * s_d  = static_cast<const float *>(dst->src[4]->data);

    const int64_t B = dst->src[4]->ne[1];
    const int64_t T = dst->src[0]->ne[2];
    const int64_t C = dst->ne[0];
    const int64_t H = dst->src[0]->ne[1];

    dpct::queue_ptr stream = ctx.stream();

    GGML_ASSERT(dst->src[4]->type == GGML_TYPE_F32);
    GGML_ASSERT(C % H == 0);
    GGML_ASSERT(C / H == 64 || C / H == 128);

    float scale;
    memcpy(&scale, dst->op_params, sizeof(float));

    float * dst_d = (float *) dst->data;

    if (C / H == 64) {
        gated_linear_attn_f32_kernel<64>(stream, B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
    } else {
        gated_linear_attn_f32_kernel<128>(stream, B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
    }
}
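For readers unfamiliar with the op, a scalar reference of the recurrence this kernel evaluates for one (batch, head) pair may help. `gla_reference` below is a hypothetical illustration (layout simplified, state updated in place), not code from the patch; in the kernel each work-item owns one output channel `i` (its `tid`) and keeps column `i` of the D×D state in registers.

// S[j][i] = S[j][i] * td[t][j] + k[t][j] * v[t][i]
// y[t][i] = scale * sum_j r[t][j] * S[j][i]
static void gla_reference(int D, int n_tokens, float scale,
                          const float * k, const float * v, const float * r, const float * td,
                          float * S,      // D*D running state, updated in place
                          float * y) {    // n_tokens*D output
    for (int t = 0; t < n_tokens; ++t) {
        for (int i = 0; i < D; ++i) {       // output channel (the kernel's tid)
            float acc = 0.0f;
            for (int j = 0; j < D; ++j) {   // key/decay channel
                S[j*D + i] = S[j*D + i] * td[t*D + j] + k[t*D + j] * v[t*D + i];
                acc += r[t*D + j] * S[j*D + i];
            }
            y[t*D + i] = acc * scale;
        }
    }
}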

ggml/src/ggml-sycl/gla.hpp Normal file

@ -0,0 +1,8 @@
#ifndef GGML_SYCL_GLA_HPP
#define GGML_SYCL_GLA_HPP
#include "common.hpp"
void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
#endif // GGML_SYCL_GLA_HPP

ggml/src/ggml-sycl/outprod.cpp

@ -3,9 +3,9 @@
#include "outprod.hpp" #include "outprod.hpp"
void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
const ggml_tensor* src1, ggml_tensor* dst) { const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);

ggml/src/ggml-sycl/outprod.hpp

@ -3,8 +3,7 @@
#include "common.hpp" #include "common.hpp"
void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst);
const ggml_tensor* src1, ggml_tensor* dst);
#endif // GGML_SYCL_OUTPROD_HPP #endif // GGML_SYCL_OUTPROD_HPP

ggml/src/ggml-sycl/tsembd.cpp

@ -55,8 +55,9 @@ static void timestep_embedding_f32_sycl(
}); });
} }
void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
const ggml_tensor *src1, ggml_tensor * dst) { const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
const float * src0_d = (const float *)src0->data; const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data; float * dst_d = (float *)dst->data;
dpct::queue_ptr stream = ctx.stream(); dpct::queue_ptr stream = ctx.stream();

ggml/src/ggml-sycl/tsembd.hpp

@ -15,7 +15,6 @@
#include "common.hpp" #include "common.hpp"
void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
const ggml_tensor *src1, ggml_tensor * dst);
#endif // GGML_SYCL_TSEMBD_HPP #endif // GGML_SYCL_TSEMBD_HPP

ggml/src/ggml-sycl/wkv6.cpp

@ -95,8 +95,10 @@ static void rwkv_wkv_f32_kernel(
} }
} }
void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
const ggml_tensor* src1, ggml_tensor* dst) {
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
const float* k_d = (const float*)dst->src[0]->data; const float* k_d = (const float*)dst->src[0]->data;
const float* v_d = (const float*)dst->src[1]->data; const float* v_d = (const float*)dst->src[1]->data;
@ -107,9 +109,9 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* s
float* dst_d = (float*)dst->data; float* dst_d = (float*)dst->data;
const int64_t B = dst->src[5]->ne[1]; const int64_t B = dst->src[5]->ne[1];
const int64_t T = dst->src[0]->ne[3]; const int64_t T = dst->src[0]->ne[2];
const int64_t C = dst->ne[0]; const int64_t C = dst->ne[0];
const int64_t H = dst->src[0]->ne[2]; const int64_t H = dst->src[0]->ne[1];
GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32); GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
GGML_ASSERT(C % H == 0); GGML_ASSERT(C % H == 0);

ggml/src/ggml-sycl/wkv6.hpp

@ -3,8 +3,7 @@
#include "common.hpp" #include "common.hpp"
void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
const ggml_tensor *src1, ggml_tensor * dst);
#endif // GGML_SYCL_WKV6_HPP #endif // GGML_SYCL_WKV6_HPP

ggml/src/ggml-vulkan/CMakeLists.txt

@ -1,5 +1,20 @@
cmake_minimum_required(VERSION 3.19)
cmake_policy(SET CMP0114 NEW)
find_package(Vulkan COMPONENTS glslc REQUIRED) find_package(Vulkan COMPONENTS glslc REQUIRED)
function(detect_host_compiler)
    if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
        find_program(HOST_C_COMPILER NAMES cl gcc clang NO_CMAKE_FIND_ROOT_PATH)
        find_program(HOST_CXX_COMPILER NAMES cl g++ clang++ NO_CMAKE_FIND_ROOT_PATH)
    else()
        find_program(HOST_C_COMPILER NAMES gcc clang NO_CMAKE_FIND_ROOT_PATH)
        find_program(HOST_CXX_COMPILER NAMES g++ clang++ NO_CMAKE_FIND_ROOT_PATH)
    endif()
    set(HOST_C_COMPILER "${HOST_C_COMPILER}" PARENT_SCOPE)
    set(HOST_CXX_COMPILER "${HOST_CXX_COMPILER}" PARENT_SCOPE)
endfunction()
if (Vulkan_FOUND) if (Vulkan_FOUND)
message(STATUS "Vulkan found") message(STATUS "Vulkan found")
@ -73,19 +88,56 @@ if (Vulkan_FOUND)
add_compile_definitions(GGML_VULKAN_RUN_TESTS) add_compile_definitions(GGML_VULKAN_RUN_TESTS)
endif() endif()
if (NOT CMAKE_CROSSCOMPILING)
add_subdirectory(vulkan-shaders) add_subdirectory(vulkan-shaders)
if (MSVC)
foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
string(TOUPPER ${CONFIG} CONFIG)
set_target_properties(vulkan-shaders-gen PROPERTIES
RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
endforeach()
endif()
else()
if (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN)
set(HOST_CMAKE_TOOLCHAIN_FILE ${GGML_VULKAN_SHADERS_GEN_TOOLCHAIN})
else()
detect_host_compiler()
if (NOT HOST_C_COMPILER OR NOT HOST_CXX_COMPILER)
message(FATAL_ERROR "Host compiler not found")
else()
message(STATUS "Host compiler: ${HOST_C_COMPILER} ${HOST_CXX_COMPILER}")
endif()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/host-toolchain.cmake.in ${CMAKE_BINARY_DIR}/host-toolchain.cmake @ONLY)
set(HOST_CMAKE_TOOLCHAIN_FILE ${CMAKE_BINARY_DIR}/host-toolchain.cmake)
endif()
message(STATUS "vulkan-shaders-gen toolchain file: ${HOST_CMAKE_TOOLCHAIN_FILE}")
set (_ggml_vk_genshaders_cmd vulkan-shaders-gen) include(ExternalProject)
# Native build through ExternalProject_Add
ExternalProject_Add(
vulkan-shaders-gen
SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders
CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${HOST_CMAKE_TOOLCHAIN_FILE}
-DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}
BUILD_COMMAND ${CMAKE_COMMAND} --build .
INSTALL_COMMAND ${CMAKE_COMMAND} --install .
INSTALL_DIR ${CMAKE_BINARY_DIR}
)
ExternalProject_Add_StepTargets(vulkan-shaders-gen build install)
endif()
set (_ggml_vk_host_suffix $<IF:$<STREQUAL:${CMAKE_HOST_SYSTEM_NAME},Windows>,.exe,>)
set (_ggml_vk_genshaders_cmd ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/vulkan-shaders-gen${_ggml_vk_host_suffix})
set (_ggml_vk_header ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp) set (_ggml_vk_header ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
set (_ggml_vk_source ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp) set (_ggml_vk_source ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
set (_ggml_vk_input_dir ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders) set (_ggml_vk_input_dir ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv) set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp") file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
set (_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen)
if (NOT CMAKE_CROSSCOMPILING) if (CMAKE_CROSSCOMPILING)
set(_ggml_vk_genshaders_cmd "$<TARGET_FILE_DIR:vulkan-shaders-gen>/${_ggml_vk_genshaders_cmd}") set(_ggml_vk_shader_deps ${_ggml_vk_shader_deps} vulkan-shaders-gen-build vulkan-shaders-gen-install)
endif () endif()
add_custom_command( add_custom_command(
OUTPUT ${_ggml_vk_header} OUTPUT ${_ggml_vk_header}
@ -99,7 +151,7 @@ if (Vulkan_FOUND)
--target-cpp ${_ggml_vk_source} --target-cpp ${_ggml_vk_source}
--no-clean --no-clean
DEPENDS ${_ggml_vk_shader_deps} ${_ggml_vk_genshaders_cmd} DEPENDS ${_ggml_vk_shader_deps}
COMMENT "Generate vulkan shaders" COMMENT "Generate vulkan shaders"
) )

ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in Normal file

@ -0,0 +1,15 @@
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_C_FLAGS -O2)
set(CMAKE_CXX_FLAGS -O2)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
set(CMAKE_C_COMPILER @HOST_C_COMPILER@)
set(CMAKE_CXX_COMPILER @HOST_CXX_COMPILER@)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY @CMAKE_RUNTIME_OUTPUT_DIRECTORY@)
if("@CMAKE_C_COMPILER_ID@" STREQUAL "MSVC")
foreach(CONFIG IN ITEMS DEBUG RELEASE MINSIZEREL RELWITHDEBINFO)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
endforeach()
endif()

ggml/src/ggml-vulkan/ggml-vulkan.cpp

@ -228,6 +228,8 @@ struct vk_device_struct {
vk_pipeline pipeline_repeat_f32; vk_pipeline pipeline_repeat_f32;
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16; vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16; vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
vk_pipeline pipeline_norm_f32; vk_pipeline pipeline_norm_f32;
vk_pipeline pipeline_group_norm_f32; vk_pipeline pipeline_group_norm_f32;
vk_pipeline pipeline_rms_norm_f32; vk_pipeline pipeline_rms_norm_f32;
@ -1965,6 +1967,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_1], "cpy_q4_1_f32", cpy_q4_1_f32_len, cpy_q4_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_0], "cpy_q5_0_f32", cpy_q5_0_f32_len, cpy_q5_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_1], "cpy_q5_1_f32", cpy_q5_1_f32_len, cpy_q5_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q8_0], "cpy_q8_0_f32", cpy_q8_0_f32_len, cpy_q8_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_IQ4_NL], "cpy_iq4_nl_f32", cpy_iq4_nl_f32_len, cpy_iq4_nl_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1); ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
ggml_vk_create_pipeline(device, device->pipeline_add_f32_norepeat, "add_f32_norepeat", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1); ggml_vk_create_pipeline(device, device->pipeline_add_f32_norepeat, "add_f32_norepeat", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1); ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
@ -2277,6 +2293,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
if (device->subgroup_size_control) { if (device->subgroup_size_control) {
device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize; device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize;
device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize; device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize;
device_extensions.push_back("VK_EXT_subgroup_size_control");
} }
device->subgroup_size_control = device->subgroup_size_control && device->subgroup_size_control = device->subgroup_size_control &&
@ -2285,7 +2302,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
if (device->subgroup_size_control) { if (device->subgroup_size_control) {
device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups; device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups;
device_extensions.push_back("VK_EXT_subgroup_size_control");
} }
#if defined(VK_KHR_cooperative_matrix) #if defined(VK_KHR_cooperative_matrix)
@ -3689,6 +3705,33 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_cpy_f16_f16; return ctx->device->pipeline_cpy_f16_f16;
} }
} }
if (src->type == GGML_TYPE_F32) {
switch (to) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return ctx->device->pipeline_cpy_f32_quant[to];
default:
break;
}
}
if (to == GGML_TYPE_F32) {
switch (src->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return ctx->device->pipeline_cpy_quant_f32[src->type];
default:
break;
}
}
std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl; std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
@ -5160,7 +5203,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
} }
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")"); std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")");
GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
GGML_ASSERT(dst->buffer != nullptr); GGML_ASSERT(dst->buffer != nullptr);
const uint64_t ne00 = src0->ne[0]; const uint64_t ne00 = src0->ne[0];
@ -5633,9 +5676,9 @@ static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subc
} }
static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
const size_t seq_length = dst->src[0]->ne[3]; const size_t seq_length = dst->src[0]->ne[2];
const size_t n_embed = dst->ne[0]; const size_t n_embed = dst->ne[0];
const size_t n_heads = dst->src[0]->ne[2]; const size_t n_heads = dst->src[0]->ne[1];
const size_t n_seqs = dst->src[5]->ne[1]; const size_t n_seqs = dst->src[5]->ne[1];
ggml_vk_op_f32_rwkv6( ggml_vk_op_f32_rwkv6(
@ -7905,12 +7948,36 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
{ {
ggml_type src0_type = op->src[0]->type; ggml_type src0_type = op->src[0]->type;
ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type; ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
if (src0_type == GGML_TYPE_F32) {
switch (src1_type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return true; return true;
default:
break;
} }
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { }
if (src1_type == GGML_TYPE_F32) {
switch (src0_type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return true; return true;
default:
break;
} }
}
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
return true; return true;
} }
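Distilled from the switch above: the Vulkan backend now accepts GGML_OP_CPY when one side is F32 and the other is one of the newly wired block-quantized formats, in addition to the existing F32/F16 paths. A small illustrative helper (not part of the backend API) restating the predicate:

static bool vk_cpy_quant_supported(ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_IQ4_NL:
            return true;
        default:
            return false;
    }
}

// CPY is supported when:
//   src is F32 and dst is F32, F16, or vk_cpy_quant_supported(dst)
//   dst is F32 and vk_cpy_quant_supported(src)
//   src is F16 and dst is F16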

Some files were not shown because too many files have changed in this diff.