Merge branch 'master' into vulkan
commit f84cf24925
54 changed files with 2143 additions and 983 deletions
@@ -1,3 +1,6 @@
+# TODO: there have been some issues with the workflow, so disabling for now
+# https://github.com/ggerganov/llama.cpp/issues/7893
+#
 # Benchmark
 name: Benchmark

@@ -129,6 +132,8 @@ jobs:
 - name: Server bench
   id: server_bench
+  env:
+    HEAD_REF: ${{ github.head_ref || github.ref_name }}
   run: |
     set -eux

@@ -137,7 +142,7 @@ jobs:
 python bench.py \
     --runner-label ${{ env.RUNNER_LABEL }} \
     --name ${{ github.job }} \
-    --branch ${{ github.head_ref || github.ref_name }} \
+    --branch $HEAD_REF \
    --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
    --scenario script.js \
    --duration ${{ github.event.inputs.duration || env.DURATION }} \
22  .github/workflows/build.yml  (vendored)
@@ -47,7 +47,7 @@ jobs:
 sysctl -a
 mkdir build
 cd build
-cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
+cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
 cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test

@@ -105,7 +105,7 @@ jobs:
 sysctl -a
 # Metal is disabled due to intermittent failures with Github runners not having a GPU:
 # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
+cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
 cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

 - name: Test

@@ -222,7 +222,7 @@ jobs:
 run: |
 mkdir build
 cd build
-cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
+cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
 cmake --build . --config Release -j $(nproc)

 - name: Test

@@ -696,22 +696,20 @@ jobs:
 strategy:
   matrix:
     include:
-      - build: 'rpc-x64'
-        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
       - build: 'noavx-x64'
-        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
       - build: 'avx2-x64'
-        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
       - build: 'avx-x64'
-        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
       - build: 'avx512-x64'
-        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
       - build: 'openblas-x64'
-        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
       - build: 'kompute-x64'
-        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
       - build: 'vulkan-x64'
-        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+        defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
       - build: 'llvm-arm64'
         defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
       - build: 'msvc-arm64'
@@ -6,15 +6,13 @@ on:
 - '.github/workflows/python-check-requirements.yml'
 - 'scripts/check-requirements.sh'
 - 'convert*.py'
-- 'requirements.txt'
-- 'requirements/*.txt'
+- '**/requirements*.txt'
 pull_request:
   paths:
   - '.github/workflows/python-check-requirements.yml'
   - 'scripts/check-requirements.sh'
   - 'convert*.py'
-  - 'requirements.txt'
-  - 'requirements/*.txt'
+  - '**/requirements*.txt'

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
4  Makefile
@@ -763,6 +763,10 @@ ifdef GGML_VULKAN_MEMORY_DEBUG
 MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
 endif

+ifdef GGML_VULKAN_PERF
+MK_CPPFLAGS += -DGGML_VULKAN_PERF
+endif
+
 ifdef GGML_VULKAN_VALIDATE
 MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
 endif
@@ -186,10 +186,12 @@ Unless otherwise noted these projects are open-source with permissive licensing:

 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
+- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage

 **Infrastructure:**

 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
+- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs

 **Games:**
 - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
@@ -2702,12 +2702,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
 return text;
 }

-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
 //
 // Chat template utils
 //
@@ -380,10 +380,6 @@ std::string llama_detokenize(
 const std::vector<llama_token> & tokens,
 bool special = true);

-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // Chat template utils
 //
@@ -369,6 +369,9 @@ namespace grammar_parser {
 }
 // Validate the state to ensure that all rules are defined
 for (const auto & rule : state.rules) {
+    if (rule.empty()) {
+        throw std::runtime_error("Undefined rule");
+    }
     for (const auto & elem : rule) {
         if (elem.type == LLAMA_GRETYPE_RULE_REF) {
             // Ensure that the rule at that location exists
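The added check rejects grammars that reference rules which never received a definition (an empty rule body). A stripped-down illustration of the idea, using simplified types rather than the real parser state:

```cpp
// Stripped-down sketch of the grammar validation added above: every rule must be
// non-empty, i.e. every rule id that was referenced must have been defined.
#include <stdexcept>
#include <vector>

struct grammar_element { bool is_rule_ref; size_t value; };
using rule = std::vector<grammar_element>;

static void validate(const std::vector<rule> & rules) {
    for (const auto & r : rules) {
        if (r.empty()) {
            throw std::runtime_error("Undefined rule");
        }
        // the real parser additionally checks that every LLAMA_GRETYPE_RULE_REF
        // points at an existing rule (see the context lines in the hunk above)
    }
}
```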
@@ -590,6 +590,12 @@ class Model:
 if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
     # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
     res = "smollm"
+if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
+    # ref: https://huggingface.co/bigscience/bloom
+    res = "bloom"
+if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
+    # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
+    res = "gpt3-finnish"

 if res is None:
     logger.warning("\n")

@@ -893,7 +899,7 @@ class GPTNeoXModel(Model):
 return tensors


-@Model.register("BloomForCausalLM")
+@Model.register("BloomForCausalLM", "BloomModel")
 class BloomModel(Model):
     model_arch = gguf.MODEL_ARCH.BLOOM
@@ -94,6 +94,8 @@ models = [
 {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
 {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
 {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
+{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
+{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
 ]
@@ -271,7 +271,7 @@ struct tokenized_prompt {
 size_t max_seq_len;

 tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
     tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
     tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
     max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@@ -127,7 +127,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 }

 static bool run(llama_context * ctx, const gpt_params & params) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));

     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
@@ -17,9 +17,9 @@ For example:

 ```bash
 ./bin/llama-export-lora \
-    -m open-llama-3b-v2-q8_0.gguf \
-    -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-    --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
+    -m open-llama-3b-v2.gguf \
+    -o open-llama-3b-v2-english2tokipona-chat.gguf \
+    --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf
 ```

 Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
@@ -10,6 +10,12 @@

 static bool g_verbose = false;

+struct tensor_transformation {
+    struct ggml_tensor * in;
+    struct ggml_tensor * out;
+    bool is_copy;
+};
+
 static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
     int id = gguf_find_key(ctx_gguf, key.c_str());
     return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));

@@ -198,8 +204,7 @@ struct lora_merge_ctx {
 }

 // mapping base tensor to out tensor (same shape with base, but different type)
-// if out_tensor == nullptr, we only copy it
-std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
+std::vector<tensor_transformation> trans;
 for (auto & it : base_model.tensors) {
     bool t_a = true;
     bool t_b = true;

@@ -212,14 +217,22 @@ struct lora_merge_ctx {
 // only copy
 struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
 ggml_set_name(cpy_tensor, base_tensor->name);
-base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+trans.push_back({
+    cpy_tensor,
+    cpy_tensor,
+    true,
+});
 gguf_add_tensor(ctx_out, cpy_tensor);
 } else if (t_a && t_b) {
 // need merging
 struct ggml_tensor * out_tensor = ggml_new_tensor(
     ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
 ggml_set_name(out_tensor, base_tensor->name);
-base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+trans.push_back({
+    base_tensor,
+    out_tensor,
+    false,
+});
 gguf_add_tensor(ctx_out, out_tensor);
 } else {
 throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");

@@ -234,12 +247,12 @@ struct lora_merge_ctx {

 // process base model tensors
 size_t n_merged = 0;
-for (auto & it : base_to_out_tensors) {
-    if (it.second != nullptr) {
-        merge_tensor(it.first, it.second);
+for (auto & it : trans) {
+    if (!it.is_copy) {
+        merge_tensor(it.in, it.out);
         n_merged++;
     } else {
-        copy_tensor(it.first);
+        copy_tensor(it.in);
     }
 }

@@ -252,7 +265,7 @@ struct lora_merge_ctx {
 }

 printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
-printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
+printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
 }

 void copy_tensor(struct ggml_tensor * base) {

@@ -285,6 +298,10 @@ struct lora_merge_ctx {
 for (size_t i = 0; i < adapters.size(); ++i) {
     auto t_a = adapters[i]->get_tensor(name_lora_a);
     auto t_b = adapters[i]->get_tensor(name_lora_b);
+    // TODO: add support for quantized lora
+    if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) {
+        throw std::runtime_error("quantized LoRA adapters is not supported, please retry with f16 or f32");
+    }
     inp_a[i] = ggml_dup_tensor(ctx, t_a);
     inp_b[i] = ggml_dup_tensor(ctx, t_b);
 }
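For orientation, a compressed sketch of the bookkeeping these export-lora hunks introduce: instead of pairing each base tensor with an output tensor and treating nullptr as "just copy", every tensor is recorded as a tensor_transformation whose is_copy flag drives the final write-out loop. Struct and field names follow the diff; the merge/copy helpers are only indicated as comments.

```cpp
// Sketch of the tensor_transformation bookkeeping from the export-lora hunks above.
// merge_tensor()/copy_tensor() stand in for the real lora_merge_ctx members.
#include <cstdio>
#include <vector>

struct ggml_tensor; // opaque here

struct tensor_transformation {
    ggml_tensor * in;
    ggml_tensor * out;
    bool          is_copy;
};

static void write_out(const std::vector<tensor_transformation> & trans) {
    size_t n_merged = 0;
    for (const auto & t : trans) {
        if (!t.is_copy) {
            // merge_tensor(t.in, t.out);  // base tensor + LoRA delta -> t.out
            n_merged++;
        } else {
            // copy_tensor(t.in);          // plain pass-through, no adapter applied
        }
    }
    printf("merged %zu of %zu tensors\n", n_merged, trans.size());
}
```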
@@ -433,8 +433,8 @@ static void process_logits(
 }

 static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
     const int n_ctx = llama_n_ctx(ctx);

     auto tim1 = std::chrono::high_resolution_clock::now();
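The same rename recurs across the rest of this diff: llama_should_add_bos_token() is removed and callers query llama_add_bos_token() directly, while the EOS assert becomes a plain boolean negation. A minimal sketch of the new call pattern, assuming a loaded model/context as in the surrounding examples:

```cpp
// Sketch of the updated BOS/EOS queries used throughout this commit.
#include "llama.h"

static bool query_special_tokens(llama_context * ctx) {
    const llama_model * model = llama_get_model(ctx);

    // previously: const bool add_bos = llama_should_add_bos_token(model);
    const bool add_bos = llama_add_bos_token(model);

    // previously: GGML_ASSERT(llama_add_eos_token(model) != 1);
    GGML_ASSERT(!llama_add_eos_token(model));

    return add_bos;
}
```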
@@ -203,8 +203,8 @@ int main(int argc, char ** argv) {
 LOG_TEE("\n");
 LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
 }
-const bool add_bos = llama_should_add_bos_token(model);
-GGML_ASSERT(llama_add_eos_token(model) != 1);
+const bool add_bos = llama_add_bos_token(model);
+GGML_ASSERT(!llama_add_eos_token(model));
 LOG("add_bos: %d\n", add_bos);

 std::vector<llama_token> embd_inp;
@@ -2,4 +2,4 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
 torch~=2.2.1
-torchvision==0.17.1
+torchvision~=0.17.1
@@ -267,9 +267,9 @@ int main(int argc, char ** argv) {
 }
 }

-const bool add_bos = llama_should_add_bos_token(model);
+const bool add_bos = llama_add_bos_token(model);
 if (!llama_model_has_encoder(model)) {
-    GGML_ASSERT(llama_add_eos_token(model) != 1);
+    GGML_ASSERT(!llama_add_eos_token(model));
 }
 LOG("add_bos: %d\n", add_bos);
@@ -340,8 +340,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 // Output: `perplexity: 13.5106 [114/114]`
 // BOS tokens will be added for each chunk before eval

-const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

 fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

@@ -480,8 +480,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 // Output: `perplexity: 13.5106 [114/114]`
 // BOS tokens will be added for each chunk before eval

-const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

 std::ofstream logits_stream;
 if (!params.logits_file.empty()) {

@@ -1733,8 +1733,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 const int n_batch = params.n_batch;
 const int num_batches = (n_ctx + n_batch - 1)/n_batch;
 const int nv = 2*((n_vocab + 1)/2) + 4;
-const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

 std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
 std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@@ -253,6 +253,8 @@ int main(int argc, char ** argv) {
 chunks[i].tokens.clear();
 }

+struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
+
 // start loop, receive query and return top k similar chunks based on cosine similarity
 std::string query;
 while (true) {

@@ -260,7 +262,6 @@ int main(int argc, char ** argv) {
 std::getline(std::cin, query);
 std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);

-struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
 batch_add_seq(query_batch, query_tokens, 0);

 std::vector<float> query_emb(n_embd, 0);

@@ -293,6 +294,7 @@ int main(int argc, char ** argv) {
 }

 // clean up
+llama_batch_free(query_batch);
 llama_print_timings(ctx);
 llama_free(ctx);
 llama_free_model(model);
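The retrieval change hoists the batch allocation out of the query loop and frees it once during cleanup, instead of allocating a fresh batch on every iteration. A hedged sketch of the intended lifetime (tokenization and decoding details elided; llama_batch_clear is the helper from the common headers):

```cpp
// Sketch: allocate the query batch once, reuse it per iteration, free it once.
// Mirrors the intent of the retrieval.cpp hunks above; per-query work is elided.
#include "llama.h"

static void query_loop(llama_context * ctx, int32_t n_batch, int n_queries) {
    llama_batch query_batch = llama_batch_init(n_batch, 0, 1); // moved out of the loop

    for (int q = 0; q < n_queries; ++q) {       // stands in for the `while (true)` REPL
        llama_batch_clear(query_batch);         // reuse the same allocation each time
        // ... tokenize the query, batch_add_seq(), llama_decode(), read embeddings ...
    }

    llama_batch_free(query_batch);              // freed once during cleanup
}
```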
@@ -631,6 +631,7 @@ struct server_context {

 bool clean_kv_cache = true;
 bool add_bos_token = true;
+bool has_eos_token = false;

 int32_t n_ctx; // total context for all clients / slots

@@ -692,9 +693,8 @@ struct server_context {

 n_ctx = llama_n_ctx(ctx);

-add_bos_token = llama_should_add_bos_token(model);
-GGML_ASSERT(llama_add_eos_token(model) != 1);
+add_bos_token = llama_add_bos_token(model);
+has_eos_token = !llama_add_eos_token(model);

 return true;
 }

@@ -753,13 +753,13 @@ struct server_context {
 default_generation_settings_for_props = get_formated_generation(slots.front());
 default_generation_settings_for_props["seed"] = -1;

-// the update_slots() logic will always submit a maximum of n_batch tokens
+// the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens
 // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
 {
     const int32_t n_batch = llama_n_batch(ctx);

     // only a single seq_id per token is needed
-    batch = llama_batch_init(n_batch, 0, 1);
+    batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1);
 }

 metrics.init();

@@ -1031,7 +1031,7 @@ struct server_context {
 {
 slot.sparams.logit_bias.clear();

-if (json_value(data, "ignore_eos", false)) {
+if (json_value(data, "ignore_eos", false) && has_eos_token) {
     slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
 }

@@ -1136,28 +1136,19 @@ struct server_context {
 if (!system_prompt.empty()) {
 system_tokens = ::llama_tokenize(ctx, system_prompt, true);

-llama_batch_clear(batch);
-
-for (int i = 0; i < (int)system_tokens.size(); ++i) {
-    llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
-}
-
 const int32_t n_batch = llama_n_batch(ctx);
+const int32_t n_tokens_prompt = system_tokens.size();

-for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
-    const int32_t n_tokens = std::min(params.n_batch, batch.n_tokens - i);
-    llama_batch batch_view = {
-        n_tokens,
-        batch.token + i,
-        nullptr,
-        batch.pos + i,
-        batch.n_seq_id + i,
-        batch.seq_id + i,
-        batch.logits + i,
-        0, 0, 0, // unused
-    };
-
-    if (llama_decode(ctx, batch_view) != 0) {
+for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
+    const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
+
+    llama_batch_clear(batch);
+
+    for (int32_t j = 0; j < n_tokens; ++j) {
+        llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
+    }
+
+    if (llama_decode(ctx, batch) != 0) {
     LOG_ERROR("llama_decode() failed", {});
     return;
 }

@@ -1330,7 +1321,7 @@ struct server_context {

 return json {
     {"n_ctx", slot.n_ctx},
-    {"n_predict", slot.n_predict},
+    {"n_predict", slot.n_predict}, // Server configured n_predict
     {"model", params.model_alias},
     {"seed", slot.sparams.seed},
     {"temperature", slot.sparams.temp},

@@ -1352,7 +1343,7 @@ struct server_context {
     {"mirostat_eta", slot.sparams.mirostat_eta},
     {"penalize_nl", slot.sparams.penalize_nl},
     {"stop", slot.params.antiprompt},
-    {"n_predict", slot.params.n_predict}, // TODO: fix duplicate key n_predict
+    {"max_tokens", slot.params.n_predict}, // User configured n_predict
     {"n_keep", slot.params.n_keep},
     {"n_discard", slot.params.n_discard},
     {"ignore_eos", ignore_eos},

@@ -1860,6 +1851,8 @@ struct server_context {
 llama_lora_adapters_apply(ctx, lora_adapters);
 server_task_result result;
 result.id = task.id;
+result.stop = true;
+result.error = false;
 result.data = json{{ "success", true }};
 queue_results.send(result);
 } break;

@@ -2044,7 +2037,7 @@ struct server_context {
 slot.t_start_generation = 0;

 if (slot.infill) {
-    const bool add_bos = llama_should_add_bos_token(model);
+    const bool add_bos = llama_add_bos_token(model);
     bool suff_rm_leading_spc = true;
     if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
         params.input_suffix.erase(0, 1);
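The system-prompt hunk above replaces a single oversized decode with a loop that feeds the prompt in n_batch-sized pieces. A reduced sketch of that control flow, with names taken from the diff (llama_batch_clear/llama_batch_add are the common helpers) and error handling trimmed:

```cpp
// Sketch of the chunked system-prompt decode from the server.cpp hunk above.
#include <algorithm>
#include <vector>
#include "llama.h"

static bool decode_system_prompt(llama_context * ctx, llama_batch & batch,
                                 const std::vector<llama_token> & system_tokens) {
    const int32_t n_batch         = llama_n_batch(ctx);
    const int32_t n_tokens_prompt = (int32_t) system_tokens.size();

    for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
        const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);

        llama_batch_clear(batch);
        for (int32_t j = 0; j < n_tokens; ++j) {
            llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
        }

        if (llama_decode(ctx, batch) != 0) {
            return false; // corresponds to the LOG_ERROR path in the diff
        }
    }
    return true;
}
```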
@@ -362,7 +362,7 @@ int main(int raw_argc, char ** raw_argv) {
 prompt = stdin_buffer.str();
 }

-const bool model_wants_add_bos = llama_should_add_bos_token(model);
+const bool model_wants_add_bos = llama_add_bos_token(model);
 const bool add_bos = model_wants_add_bos && !no_bos;
 const bool parse_special = !no_parse_special;
6  flake.lock  (generated)
@@ -20,11 +20,11 @@
 },
 "nixpkgs": {
   "locked": {
-    "lastModified": 1722421184,
-    "narHash": "sha256-/DJBI6trCeVnasdjUo9pbnodCLZcFqnVZiLUfqLH4jA=",
+    "lastModified": 1723175592,
+    "narHash": "sha256-M0xJ3FbDUc4fRZ84dPGx5VvgFsOzds77KiBMW/mMTnI=",
     "owner": "NixOS",
     "repo": "nixpkgs",
-    "rev": "9f918d616c5321ad374ae6cb5ea89c9e04bf3e58",
+    "rev": "5e0ca22929f3342b19569b21b2f3462f053e497b",
     "type": "github"
   },
   "original": {
@@ -129,13 +129,13 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
 option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
 option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)

-option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
 option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
 option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
 option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
+option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
 option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
 option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
 option(GGML_KOMPUTE "ggml: use Kompute" OFF)
@@ -244,6 +244,8 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1

+#define GGML_ROPE_TYPE_NEOX 2
+
 #define GGUF_MAGIC "GGUF"

 #define GGUF_VERSION 3

@@ -1453,8 +1455,8 @@ extern "C" {
 struct ggml_tensor * b);

 // rotary position embedding
-// if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
-// if mode & 2 == 1, GPT-NeoX style
+// if (mode & 1) - skip n_past elements (NOT SUPPORTED)
+// if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
 //
 // b is an int32 vector with size a->ne[2], it contains the positions
 GGML_API struct ggml_tensor * ggml_rope(
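ggml.h now names the RoPE mode bit, so the backend hunks further down test the flag instead of the magic constant 2. A small illustration of the intended usage:

```cpp
// Illustration of the new named flag from ggml.h: each backend's
// `const bool is_neox = mode & 2;` becomes self-describing.
#include "ggml.h"

static bool rope_is_neox(int mode) {
    // before: return (mode & 2) != 0;
    return (mode & GGML_ROPE_TYPE_NEOX) != 0;
}
```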
@@ -602,6 +602,10 @@ if (GGML_VULKAN)
 add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
 endif()

+if (GGML_VULKAN_PERF)
+    add_compile_definitions(GGML_VULKAN_PERF)
+endif()
+
 if (GGML_VULKAN_VALIDATE)
     add_compile_definitions(GGML_VULKAN_VALIDATE)
 endif()
@@ -2881,7 +2881,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
                          beta_slow, corr_dims);

-const bool is_neox = mode & 2;
+const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;

 // init cos/sin cache
 ggml_cann_pool_alloc sin_allocator(
@@ -226,7 +226,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
 memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

-const bool is_neox = mode & 2;
+const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;

 const int32_t * pos = (const int32_t *) src1_d;
@@ -2313,7 +2313,7 @@ static enum ggml_status ggml_metal_graph_compute(
 memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
 memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

-const bool is_neox = mode & 2;
+const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;

 id<MTLComputePipelineState> pipeline = nil;
@@ -226,7 +226,7 @@ void ggml_sycl_op_rope(
 memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
 memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

-const bool is_neox = mode & 2;
+const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;

 const int32_t * pos = (const int32_t *) src1_dd;

File diff suppressed because it is too large
@@ -14094,7 +14094,7 @@ static void ggml_compute_forward_rope_f32(
 float corr_dims[2];
 ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

-const bool is_neox = mode & 2;
+const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;

 const float * freq_factors = NULL;
 if (src2 != NULL) {

@@ -14219,7 +14219,7 @@ static void ggml_compute_forward_rope_f16(
 float corr_dims[2];
 ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

-const bool is_neox = mode & 2;
+const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;

 const float * freq_factors = NULL;
 if (src2 != NULL) {
@@ -21129,7 +21129,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 (int64_t) info->ne[2] *
 (int64_t) info->ne[3];

-if (ne % ggml_blck_size(info->type) != 0) {
+if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) {
     fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
             __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
     fclose(file);
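The extra `ggml_blck_size(info->type) == 0` test guards the modulo against a division by zero when a GGUF file declares an invalid or unknown tensor type. A reduced sketch of the control flow, with the I/O and error reporting stripped out:

```cpp
// Reduced sketch of the hardened check from gguf_init_from_file: reject the
// tensor if the block size is zero (invalid type) or the element count is not
// a multiple of the block size.
#include <cstdint>

static bool tensor_shape_ok(int64_t ne, int64_t blck_size) {
    if (blck_size == 0 || ne % blck_size != 0) {
        return false; // the real code prints an error, closes the file and bails out
    }
    return true;
}
```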
@@ -11,7 +11,7 @@ void main() {
 const uint i2 = gl_WorkGroupID.y;
 const uint i1 = gl_WorkGroupID.x;

-const bool is_neox = (pcs.mode & 2) != 0;
+const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;

 float corr_dims[2];
 rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
@@ -11,7 +11,7 @@ void main() {
 const uint i2 = gl_WorkGroupID.y;
 const uint i1 = gl_WorkGroupID.x;

-const bool is_neox = (pcs.mode & 2) != 0;
+const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;

 float corr_dims[2];
 rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
@@ -1,5 +1,7 @@
 #include "common.comp"

+#define GGML_ROPE_TYPE_NEOX 2
+
 // TODO: use a local size of 32 or more (Metal uses 1024)
 layout(local_size_x = 1) in;
@@ -30,6 +30,10 @@ void main() {
 #ifndef OPTIMIZATION_ERROR_WORKAROUND
 data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
 #else
-data_d[p.d_offset + dst_idx] = is_src0 ? data_a[src0_idx] : data_b[src1_idx];
+if (is_src0) {
+    data_d[p.d_offset + dst_idx] = data_a[src0_idx];
+} else {
+    data_d[p.d_offset + dst_idx] = data_b[src1_idx];
+}
 #endif
 }
@@ -39,8 +39,7 @@ void main() {
 vec2 v = dequantize(ib, iqs, a_offset / QUANT_K);

 // matrix multiplication
-tmp[tid] += FLOAT_TYPE(v.x) * FLOAT_TYPE(data_b[b_offset + iybs + iqs]) +
-            FLOAT_TYPE(v.y) * FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]);
+tmp[tid] = fma(FLOAT_TYPE(v.x), FLOAT_TYPE(data_b[b_offset + iybs + iqs]), fma(FLOAT_TYPE(v.y), FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]), tmp[tid]));
 }

 // sum up partial sums and write back result
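This and the following Vulkan shader hunks fold the multiply-add chains into explicit fma() calls, which keeps a single rounding step per term and maps directly onto fused multiply-add hardware. The equivalent transformation expressed in C++ terms, for the two-term accumulator update above:

```cpp
// C++ analogue of the GLSL rewrite above: accumulate with fused multiply-adds
// instead of separate multiplies and adds.
#include <cmath>

static float accumulate(float vx, float vy, float bx, float by, float acc) {
    // before: acc += vx * bx + vy * by;
    return std::fma(vx, bx, std::fma(vy, by, acc));
}
```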
@@ -53,7 +53,7 @@ void main() {

 const FLOAT_TYPE xi = FLOAT_TYPE(data_a[ix]);

-tmp[tid] += xi * FLOAT_TYPE(data_b[iy]);
+tmp[tid] = fma(xi, FLOAT_TYPE(data_b[iy]), tmp[tid]);
 }

 // sum up partial sums and write back result
@@ -52,7 +52,7 @@ void main() {
 // y is not transposed but permuted
 const uint iy = channel*nrows_y + row_y;

-tmp[tid] += xi * FLOAT_TYPE(data_b[iy]);
+tmp[tid] = fma(xi, FLOAT_TYPE(data_b[iy]), tmp[tid]);
 }

 // dst is not transposed and not permuted
@@ -39,24 +39,25 @@ void main() {
 FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
 FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
 for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-    sum1 += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 2) & 3)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 4) & 3)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 4) & 3)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 6) & 3)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 6) & 3);
+    sum1 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 2) & 3),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 4) & 3),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 4) & 3),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 6) & 3),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 6) & 3), sum1))))))));
-    sum2 += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 0] >> 4) & 0xF)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 1] >> 4) & 0xF)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 2] >> 4) & 0xF)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 3] >> 4) & 0xF)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 4] >> 4) & 0xF)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 5] >> 4) & 0xF)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 6] >> 4) & 0xF)
-          + FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 7] >> 4) & 0xF);
+    sum2 = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 0] >> 4) & 0xF),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 1] >> 4) & 0xF),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 2] >> 4) & 0xF),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 3] >> 4) & 0xF),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 4] >> 4) & 0xF),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 5] >> 4) & 0xF),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 6] >> 4) & 0xF),
+           fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]), FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 7] >> 4) & 0xF), sum2))))))));
 }
-tmp[16 * ix + tid] += dall * sum1 - dmin * sum2;
+const uint tmp_idx = 16 * ix + tid;
+tmp[tmp_idx] = fma(dall, sum1, fma(-dmin, sum2, tmp[tmp_idx]));
 }

 // sum up partial sums and write back result
@@ -40,16 +40,17 @@ void main() {

 FLOAT_TYPE sum = FLOAT_TYPE(0.0);
 for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-    sum += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4))
-         + FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4))
-         + FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4))
-         + FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[6] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4))
-         + FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[1] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4))
-         + FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[3] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4))
-         + FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[5] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4))
-         + FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[7] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4));
+    sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)),
+          fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)),
+          fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)),
+          fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[6] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)),
+          fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[1] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
+          fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[3] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
+          fma(FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[5] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
+          fma(FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[7] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
 }
-tmp[16 * ix + tid] += d * sum;
+const uint tmp_idx = 16 * ix + tid;
+tmp[tmp_idx] = fma(d, sum, tmp[tmp_idx]);
 }

 // sum up partial sums and write back result
@@ -67,17 +67,17 @@ void main() {
 const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 66] >> 4);
 const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 67] >> 4);

-const FLOAT_TYPE sx = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx]) * q4_0 + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1 + FLOAT_TYPE(data_b[b_offset + y1_idx + 2]) * q4_2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * q4_3);
+const FLOAT_TYPE sx = fma(FLOAT_TYPE(data_b[b_offset + y1_idx]), q4_0, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), q4_1, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 2]), q4_2, FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * q4_3)));
-const FLOAT_TYPE sy = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * q4_4 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_5 + FLOAT_TYPE(data_b[b_offset + y1_idx + 34]) * q4_6 + FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * q4_7);
+const FLOAT_TYPE sy = fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), q4_4, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), q4_5, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 34]), q4_6, FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * q4_7)));
-const FLOAT_TYPE sz = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx]) * q4_8 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_9 + FLOAT_TYPE(data_b[b_offset + y2_idx + 2]) * q4_10 + FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * q4_11);
+const FLOAT_TYPE sz = fma(FLOAT_TYPE(data_b[b_offset + y2_idx]), q4_8, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), q4_9, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 2]), q4_10, FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * q4_11)));
-const FLOAT_TYPE sw = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * q4_12 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_13 + FLOAT_TYPE(data_b[b_offset + y2_idx + 34]) * q4_14 + FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * q4_15);
+const FLOAT_TYPE sw = fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), q4_12, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), q4_13, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 34]), q4_14, FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * q4_15)));
-const FLOAT_TYPE smin = FLOAT_TYPE(
+const FLOAT_TYPE smin =
-    FLOAT_TYPE(data_b[b_offset + y1_idx ]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx ]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * sc7
+    fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), sc7,
-    + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7
+    fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), sc7,
-    + FLOAT_TYPE(data_b[b_offset + y1_idx + 2]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 34]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 2]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 34]) * sc7
+    fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 2]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 34]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 2]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 34]), sc7,
-    + FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * sc7
|
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 3]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 35]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 3]), sc6, FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * sc7)))))))))))))));
|
||||||
);
|
const uint tmp_idx = 16 * ix + tid;
|
||||||
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin);
|
tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx]));
|
||||||
#else
|
#else
|
||||||
const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf);
|
const uint8_t q4_0 = uint8_t(data_a[ib0 + i].qs[q_offset ] & 0xf);
|
||||||
const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf);
|
const uint8_t q4_1 = uint8_t(data_a[ib0 + i].qs[q_offset + 1] & 0xf);
|
||||||
|
@@ -88,16 +88,19 @@ void main() {
|
||||||
const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4);
|
const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4);
|
||||||
const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4);
|
const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4);
|
||||||
|
|
||||||
const FLOAT_TYPE sx = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx ]) * q4_0 + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1);
|
const FLOAT_TYPE sx = fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), q4_0, FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1);
|
||||||
const FLOAT_TYPE sy = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * q4_2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_3);
|
const FLOAT_TYPE sy = fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), q4_2, FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_3);
|
||||||
const FLOAT_TYPE sz = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx ]) * q4_4 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_5);
|
const FLOAT_TYPE sz = fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), q4_4, FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_5);
|
||||||
const FLOAT_TYPE sw = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * q4_6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_7);
|
const FLOAT_TYPE sw = fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), q4_6, FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_7);
|
||||||
const FLOAT_TYPE smin = FLOAT_TYPE(
|
const FLOAT_TYPE smin =
|
||||||
FLOAT_TYPE(data_b[b_offset + y1_idx]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * sc7
|
fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), sc6, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), sc7,
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7
|
+ fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), sc2, fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), sc3, fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), sc6, FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7)))))));
|
||||||
);
|
|
||||||
|
|
||||||
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f) + sy * FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f) + sz * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)) + sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))) - dmin * smin);
|
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f) + sy * FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f) +
|
||||||
|
sz * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)) + sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))) - dmin * smin);
|
||||||
|
const uint tmp_idx = 16 * ix + tid;
|
||||||
|
tmp[tmp_idx] = fma(dall, (fma(sx, FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f), fma(sy, FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f),
|
||||||
|
fma(sz, FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)), fma(sw, FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))))))), fma(-dmin, smin, tmp[tmp_idx]));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@@ -66,35 +66,33 @@ void main() {
|
||||||
const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] >> 4);
|
const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 80] >> 4);
|
||||||
const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] >> 4);
|
const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] >> 4);
|
||||||
|
|
||||||
const FLOAT_TYPE sx = FLOAT_TYPE(
|
const FLOAT_TYPE sx =
|
||||||
FLOAT_TYPE(data_b[b_offset + y1_idx ]) * (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0))
|
fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]), (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0)),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0))
|
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 1]), (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0)),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) * (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0))
|
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 16]), (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0)),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0))
|
FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0)))));
|
||||||
);
|
const FLOAT_TYPE sy =
|
||||||
const FLOAT_TYPE sy = FLOAT_TYPE(
|
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]), (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0)),
|
||||||
FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0))
|
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 33]), (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0)),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0))
|
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 48]), (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0)),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) * (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0))
|
FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0)))));
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0))
|
const FLOAT_TYPE sz =
|
||||||
);
|
fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]), (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0)),
|
||||||
const FLOAT_TYPE sz = FLOAT_TYPE(
|
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 1]), (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0)),
|
||||||
FLOAT_TYPE(data_b[b_offset + y2_idx ]) * (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0))
|
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 16]), (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0)),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0))
|
FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0)))));
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) * (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0))
|
const FLOAT_TYPE sw =
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0))
|
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]), (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0)),
|
||||||
);
|
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 33]), (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0)),
|
||||||
const FLOAT_TYPE sw = FLOAT_TYPE(
|
fma(FLOAT_TYPE(data_b[b_offset + y2_idx + 48]), (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0)),
|
||||||
FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0))
|
FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0)))));
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0))
|
const FLOAT_TYPE smin =
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) * (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0))
|
fma(FLOAT_TYPE(data_b[b_offset + y1_idx ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17]), sc2,
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0))
|
fma(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49]), sc3,
|
||||||
);
|
fma(FLOAT_TYPE(data_b[b_offset + y2_idx ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1 ]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17]), sc6,
|
||||||
const FLOAT_TYPE smin = FLOAT_TYPE(
|
(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7)));
|
||||||
(FLOAT_TYPE(data_b[b_offset + y1_idx]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17])) * sc2 + (FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49])) * sc3
|
const uint tmp_idx = 16 * ix + tid;
|
||||||
+ (FLOAT_TYPE(data_b[b_offset + y2_idx]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17])) * sc6 + (FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7
|
tmp[tmp_idx] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, tmp[tmp_idx]));
|
||||||
);
|
|
||||||
tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
|
|
|
@@ -44,22 +44,22 @@ void main() {
|
||||||
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
|
||||||
|
|
||||||
#if K_QUANTS_PER_ITERATION == 1
|
#if K_QUANTS_PER_ITERATION == 1
|
||||||
FLOAT_TYPE sum = FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32)
|
const uint tmp_idx = 16 * ix + tid;
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32)
|
tmp[tmp_idx] = fma(FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32)
|
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32)
|
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32)
|
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32)
|
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32)
|
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32);
|
fma(FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32),
|
||||||
tmp[16 * ix + tid] += sum;
|
fma(FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32), tmp[tmp_idx]))))))));
|
||||||
#else
|
#else
|
||||||
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||||
[[unroll]] for (int l = 0; l < 4; ++l) {
|
[[unroll]] for (int l = 0; l < 4; ++l) {
|
||||||
sum += FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32)
|
sum = fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32)
|
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32)
|
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32),
|
||||||
+ FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32);
|
fma(FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d, FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32), sum))));
|
||||||
}
|
}
|
||||||
tmp[16 * ix + tid] += sum;
|
tmp[16 * ix + tid] += sum;
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@@ -326,10 +326,10 @@ void main() {
|
||||||
mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4));
|
mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4));
|
||||||
}
|
}
|
||||||
const float d = loadd.x * sc;
|
const float d = loadd.x * sc;
|
||||||
const float m = loadd.y * mbyte;
|
const float m = -loadd.y * mbyte;
|
||||||
|
|
||||||
buf_a[buf_idx ] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) - m);
|
buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF), m));
|
||||||
buf_a[buf_idx + 1] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) - m);
|
buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m));
|
||||||
#elif defined(DATA_A_Q5_K)
|
#elif defined(DATA_A_Q5_K)
|
||||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||||
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
|
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
|
||||||
|
@@ -357,10 +357,10 @@ void main() {
|
||||||
mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4));
|
mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4));
|
||||||
}
|
}
|
||||||
const float d = loadd.x * sc;
|
const float d = loadd.x * sc;
|
||||||
const float m = loadd.y * mbyte;
|
const float m = -loadd.y * mbyte;
|
||||||
|
|
||||||
buf_a[buf_idx ] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0)) - m);
|
buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0), m));
|
||||||
buf_a[buf_idx + 1] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0)) - m);
|
buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m));
|
||||||
#elif defined(DATA_A_Q6_K)
|
#elif defined(DATA_A_Q6_K)
|
||||||
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
||||||
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
|
const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A;
|
||||||
|
@@ -463,7 +463,8 @@ void main() {
|
||||||
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
|
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
|
||||||
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
|
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
|
||||||
[[unroll]] for (uint cr = 0; cr < TM; cr++) {
|
[[unroll]] for (uint cr = 0; cr < TM; cr++) {
|
||||||
sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr] += float(cache_a[wsir * TM + cr]) * float(cache_b[wsic * TN + cc]);
|
const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr;
|
||||||
|
sums[sums_idx] = fma(float(cache_a[wsir * TM + cr]), float(cache_b[wsic * TN + cc]), sums[sums_idx]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
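The shader hunks above all apply the same mechanical rewrite: accumulations of the form acc += a * b (and expressions like dall * (...) - dmin * smin) are re-expressed as nested fma() calls so that each term maps to a single fused multiply-add. A minimal C++ sketch of that pattern follows; it is not the shader code itself, the input values are made up for illustration, and std::fma stands in for GLSL's fma() on FLOAT_TYPE operands.

// Sketch of the rewrite applied in the shader hunks above:
// acc += a * b   becomes   acc = fma(a, b, acc)
#include <cmath>
#include <cstdio>

int main() {
    const float a[4] = {0.5f, 1.5f, -2.0f, 3.0f};
    const float b[4] = {4.0f, -1.0f, 0.25f, 2.0f};

    float acc_plain = 0.0f;
    float acc_fma   = 0.0f;
    for (int i = 0; i < 4; ++i) {
        acc_plain += a[i] * b[i];                   // separate multiply and add
        acc_fma    = std::fma(a[i], b[i], acc_fma); // fused multiply-add
    }
    std::printf("plain: %f  fma: %f\n", acc_plain, acc_fma);
    return 0;
}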
24 ggml/src/vulkan-shaders/repeat.comp Normal file
|
@@ -0,0 +1,24 @@
|
||||||
|
#version 450
|
||||||
|
|
||||||
|
#include "types.comp"
|
||||||
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
|
uint src0_idx_mod(uint idx) {
|
||||||
|
const uint i13 = idx / (p.ne12*p.ne11*p.ne10);
|
||||||
|
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
|
||||||
|
const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10);
|
||||||
|
const uint i12_offset = i12*p.ne11*p.ne10;
|
||||||
|
const uint i11 = (idx - i13_offset - i12_offset) / p.ne10;
|
||||||
|
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
|
||||||
|
return (i13 % p.ne03)*p.nb03 + (i12 % p.ne02)*p.nb02 + (i11 % p.ne01)*p.nb01 + (i10 % p.ne00)*p.nb00;
|
||||||
|
}
|
||||||
|
|
||||||
|
void main() {
|
||||||
|
const uint idx = get_idx();
|
||||||
|
|
||||||
|
if (idx >= p.ne) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx_mod(idx)]);
|
||||||
|
}
|
|
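The new repeat.comp shader maps each destination element back to a source element by decomposing the flat destination index into four coordinates over the destination shape and wrapping each coordinate modulo the corresponding source dimension. Below is a rough C++ mirror of src0_idx_mod(), with made-up shapes and strides; in the shader these come from the push-constant block p.

// Hypothetical C++ mirror of src0_idx_mod() from repeat.comp.
#include <cstdio>

struct Dims {
    unsigned ne00, ne01, ne02, ne03;   // source shape
    unsigned ne10, ne11, ne12, ne13;   // destination shape
    unsigned nb00, nb01, nb02, nb03;   // source strides (in elements here)
};

unsigned src0_idx_mod(const Dims &p, unsigned idx) {
    const unsigned i13        = idx / (p.ne12 * p.ne11 * p.ne10);
    const unsigned i13_offset = i13 * p.ne12 * p.ne11 * p.ne10;
    const unsigned i12        = (idx - i13_offset) / (p.ne11 * p.ne10);
    const unsigned i12_offset = i12 * p.ne11 * p.ne10;
    const unsigned i11        = (idx - i13_offset - i12_offset) / p.ne10;
    const unsigned i10        = idx - i13_offset - i12_offset - i11 * p.ne10;
    return (i13 % p.ne03) * p.nb03 + (i12 % p.ne02) * p.nb02 +
           (i11 % p.ne01) * p.nb01 + (i10 % p.ne00) * p.nb00;
}

int main() {
    // made-up example: a 2x3 source repeated to a 4x6 destination
    Dims p = {3, 2, 1, 1,  6, 4, 1, 1,  1, 3, 6, 6};
    for (unsigned idx = 0; idx < 4 * 6; ++idx) {
        std::printf("dst %2u <- src %u\n", idx, src0_idx_mod(p, idx));
    }
    return 0;
}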
@@ -384,6 +384,10 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
|
||||||
string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
|
string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
|
}));
|
||||||
|
|
||||||
tasks.push_back(std::async(std::launch::async, [] {
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||||
}));
|
}));
|
||||||
|
|
File diff suppressed because it is too large
237 gguf-py/tests/test_quants.py Executable file
|
@@ -0,0 +1,237 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
# Test gguf.quants so that it exactly matches the C implementation of the (de)quantization
|
||||||
|
|
||||||
|
# NOTE: this is kind of a mess, but at least it worked for initially testing the Python implementations.
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from math import prod
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
import ctypes
|
||||||
|
import logging
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Necessary to load the local gguf package
|
||||||
|
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
import gguf
|
||||||
|
from gguf.constants import GGMLQuantizationType
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger("test-quants")
|
||||||
|
|
||||||
|
|
||||||
|
c_float_p = ctypes.POINTER(ctypes.c_float)
|
||||||
|
|
||||||
|
|
||||||
|
class ggml_init_params(ctypes.Structure):
|
||||||
|
_fields_ = [
|
||||||
|
("mem_size", ctypes.c_size_t),
|
||||||
|
("mem_buffer", ctypes.c_void_p),
|
||||||
|
("no_alloc", ctypes.c_bool),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class GGMLQuants:
|
||||||
|
libggml: ctypes.CDLL
|
||||||
|
|
||||||
|
def __init__(self, libggml: Path):
|
||||||
|
self.libggml = ctypes.CDLL(str(libggml))
|
||||||
|
self.libggml.ggml_quantize_chunk.restype = ctypes.c_size_t
|
||||||
|
# enum ggml_type type,
|
||||||
|
# const float * src,
|
||||||
|
# void * dst,
|
||||||
|
# int64_t start,
|
||||||
|
# int64_t nrows,
|
||||||
|
# int64_t n_per_row,
|
||||||
|
# const float * imatrix) {
|
||||||
|
self.libggml.ggml_quantize_chunk.argtypes = (
|
||||||
|
ctypes.c_int,
|
||||||
|
ctypes.POINTER(ctypes.c_float),
|
||||||
|
ctypes.c_void_p,
|
||||||
|
ctypes.c_int64,
|
||||||
|
ctypes.c_int64,
|
||||||
|
ctypes.c_int64,
|
||||||
|
ctypes.POINTER(ctypes.c_float),
|
||||||
|
)
|
||||||
|
|
||||||
|
self.libggml.ggml_quantize_requires_imatrix.restype = ctypes.c_bool
|
||||||
|
self.libggml.ggml_quantize_requires_imatrix.argtypes = (ctypes.c_int,)
|
||||||
|
|
||||||
|
for t in (
|
||||||
|
"q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
|
||||||
|
"q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
|
||||||
|
"iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
|
||||||
|
"iq4_nl", "iq4_xs",
|
||||||
|
):
|
||||||
|
dequant_func: ctypes._NamedFuncPointer = getattr(self.libggml, "dequantize_row_" + t)
|
||||||
|
dequant_func.restype = None
|
||||||
|
dequant_func.argtypes = (ctypes.c_void_p, ctypes.POINTER(ctypes.c_float), ctypes.c_int64)
|
||||||
|
|
||||||
|
self.libggml.ggml_fp16_to_fp32_row.restype = None
|
||||||
|
self.libggml.ggml_fp16_to_fp32_row.argtypes = (ctypes.POINTER(ctypes.c_uint16), ctypes.POINTER(ctypes.c_float), ctypes.c_int64)
|
||||||
|
self.libggml.ggml_bf16_to_fp32_row.restype = None
|
||||||
|
self.libggml.ggml_bf16_to_fp32_row.argtypes = (ctypes.POINTER(ctypes.c_uint16), ctypes.POINTER(ctypes.c_float), ctypes.c_int64)
|
||||||
|
|
||||||
|
self.libggml.ggml_init.argtypes = (ggml_init_params,)
|
||||||
|
|
||||||
|
self.libggml.ggml_init(ggml_init_params(1 * 1024 * 1024, 0, False))
|
||||||
|
|
||||||
|
def dequantize(self, tensor: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
|
||||||
|
result = np.zeros(gguf.quant_shape_from_byte_shape(tensor.shape, qtype), dtype=np.float32, order="C")
|
||||||
|
if qtype == GGMLQuantizationType.F32:
|
||||||
|
# no-op
|
||||||
|
result = tensor.view(np.float32)
|
||||||
|
elif qtype == GGMLQuantizationType.F16:
|
||||||
|
self.libggml.ggml_fp16_to_fp32_row(tensor.ctypes.data_as(ctypes.POINTER(ctypes.c_uint16)), result.ctypes.data_as(c_float_p), result.size)
|
||||||
|
elif qtype == GGMLQuantizationType.BF16:
|
||||||
|
self.libggml.ggml_bf16_to_fp32_row(tensor.ctypes.data_as(ctypes.POINTER(ctypes.c_uint16)), result.ctypes.data_as(c_float_p), result.size)
|
||||||
|
else:
|
||||||
|
lw_qname = qtype.name.lower()
|
||||||
|
if lw_qname[-1] == "k":
|
||||||
|
lw_qname = lw_qname[:-1] + "K"
|
||||||
|
dequant_func: ctypes._NamedFuncPointer = getattr(self.libggml, "dequantize_row_" + lw_qname)
|
||||||
|
dequant_func(tensor.ctypes.data_as(ctypes.c_void_p), result.ctypes.data_as(c_float_p), result.size)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def quantize(self, data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
|
||||||
|
result = np.zeros(gguf.quant_shape_to_byte_shape(data.shape, qtype), dtype=np.uint8, order="C")
|
||||||
|
if self.libggml.ggml_quantize_requires_imatrix(qtype.value):
|
||||||
|
# TODO: is a column-wise sum of squares appropriate?
|
||||||
|
qw = np.sum((data * data).reshape((-1, data.shape[-1])), axis=0).ctypes.data_as(c_float_p)
|
||||||
|
else:
|
||||||
|
qw = ctypes.cast(0, c_float_p)
|
||||||
|
result_size = self.libggml.ggml_quantize_chunk(qtype.value, data.ctypes.data_as(c_float_p), result.ctypes.data_as(ctypes.c_void_p), 0, prod(data.shape[:-1]), data.shape[-1], qw)
|
||||||
|
assert result.size == result_size
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def compare_tensors(t1: np.ndarray, t2: np.ndarray, qtype: GGMLQuantizationType) -> bool:
|
||||||
|
same = np.array_equal(t1, t2)
|
||||||
|
if same:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
block_size, type_size = gguf.GGML_QUANT_SIZES[qtype]
|
||||||
|
if t1.dtype == np.float32:
|
||||||
|
t1 = t1.reshape((-1, block_size))
|
||||||
|
t2 = t2.reshape((-1, block_size))
|
||||||
|
else:
|
||||||
|
t1 = t1.reshape((-1, type_size))
|
||||||
|
t2 = t2.reshape((-1, type_size))
|
||||||
|
x = t1.view(np.uint8) ^ t2.view(np.uint8)
|
||||||
|
diff_bits = np.count_nonzero(np.unpackbits(x, axis=-1), axis=-1)
|
||||||
|
num_bad_blocks = np.count_nonzero(diff_bits, axis=0)
|
||||||
|
if num_bad_blocks == 0 and t1.shape == t2.shape:
|
||||||
|
logger.debug("Bits are equal, but arrays don't match, likely contains NANs")
|
||||||
|
return True
|
||||||
|
logger.debug(f"{num_bad_blocks} bad blocks ({100 * num_bad_blocks / x.shape[0]:.6f}%)")
|
||||||
|
bad_block_id = np.argmax(diff_bits, axis=0)
|
||||||
|
logger.debug(f"Worst block id: {bad_block_id}")
|
||||||
|
logger.debug(f"Sample bad block ({diff_bits[bad_block_id]} differing bits):\n{t1[bad_block_id]}\nReference:\n{t2[bad_block_id]}")
|
||||||
|
|
||||||
|
sum_diff_bits = np.sum(diff_bits)
|
||||||
|
logger.debug(f"{sum_diff_bits} bits differ ({100 * sum_diff_bits/(x.size * 8):.6f}%)")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def do_test(libggml_path: Path, quick: bool = False):
|
||||||
|
ggml_quants = GGMLQuants(libggml_path)
|
||||||
|
|
||||||
|
np.set_printoptions(precision=None, threshold=(4 * 256) + 1, formatter={"int": lambda n: "0x%02X" % n})
|
||||||
|
|
||||||
|
r = np.random.randn(8, 1024, 1024).astype(np.float32, copy=False)
|
||||||
|
|
||||||
|
for qtype in (GGMLQuantizationType.F16, *gguf.quants._type_traits.keys()):
|
||||||
|
has_dequantize = False
|
||||||
|
has_quantize = False
|
||||||
|
|
||||||
|
try:
|
||||||
|
gguf.dequantize(np.zeros((gguf.GGML_QUANT_SIZES[qtype][1]), dtype=np.uint8), qtype)
|
||||||
|
has_dequantize = True
|
||||||
|
except (NotImplementedError, AssertionError) as e:
|
||||||
|
if isinstance(e, AssertionError):
|
||||||
|
logger.error(f"Error with {qtype.name}: {e}")
|
||||||
|
raise e
|
||||||
|
try:
|
||||||
|
gguf.quantize(np.zeros((gguf.GGML_QUANT_SIZES[qtype][0]), dtype=np.float32), qtype)
|
||||||
|
has_quantize = True
|
||||||
|
except (NotImplementedError, AssertionError) as e:
|
||||||
|
if isinstance(e, AssertionError):
|
||||||
|
logger.error(f"Error with {qtype.name}: {e}")
|
||||||
|
raise e
|
||||||
|
|
||||||
|
if not has_dequantize and not has_quantize:
|
||||||
|
continue
|
||||||
|
|
||||||
|
logger.info(f"Testing {qtype.name}")
|
||||||
|
|
||||||
|
rc = r.copy(order="C")
|
||||||
|
|
||||||
|
pyq = None
|
||||||
|
ggq = None
|
||||||
|
|
||||||
|
if has_quantize:
|
||||||
|
logger.debug(f"Quantizing to {qtype.name} with Python")
|
||||||
|
pyq = gguf.quants.quantize(rc, qtype)
|
||||||
|
|
||||||
|
logger.debug(f"Quantizing to {qtype.name} with C")
|
||||||
|
ggq = ggml_quants.quantize(rc, qtype)
|
||||||
|
|
||||||
|
if qtype == GGMLQuantizationType.F16:
|
||||||
|
pyq = pyq.view(np.uint8)
|
||||||
|
quant_equal = compare_tensors(pyq, ggq, qtype)
|
||||||
|
|
||||||
|
if not quant_equal:
|
||||||
|
logger.error(f"Quantization to {qtype.name} does not match ❌")
|
||||||
|
else:
|
||||||
|
logger.info(f"Quantization to {qtype.name} matches exactly ✅")
|
||||||
|
|
||||||
|
if has_dequantize:
|
||||||
|
if ggq is None and not quick:
|
||||||
|
logger.debug(f"Quantizing to {qtype.name} with C")
|
||||||
|
ggq = ggml_quants.quantize(rc, qtype)
|
||||||
|
|
||||||
|
if ggq is not None:
|
||||||
|
logger.debug(f"Dequantizing from {qtype.name} with Python")
|
||||||
|
pydq = gguf.quants.dequantize(ggq, qtype)
|
||||||
|
logger.debug(f"Dequantizing from {qtype.name} with C")
|
||||||
|
ggdq = ggml_quants.dequantize(ggq, qtype)
|
||||||
|
|
||||||
|
dequant_equal = compare_tensors(pydq, ggdq, qtype)
|
||||||
|
|
||||||
|
if not dequant_equal:
|
||||||
|
logger.error(f"Dequantization from {qtype.name} does not match ❌")
|
||||||
|
else:
|
||||||
|
logger.info(f"Dequantization from {qtype.name} matches exactly ✅")
|
||||||
|
|
||||||
|
rq_shape = gguf.quants.quant_shape_to_byte_shape((8, 1024, 1024 // 2), qtype)
|
||||||
|
rq = np.random.random(rq_shape).astype(np.float16).view(np.uint8)
|
||||||
|
|
||||||
|
logger.debug(f"Dequantizing random f16 data as {qtype.name} with Python")
|
||||||
|
pydq = gguf.quants.dequantize(rq, qtype)
|
||||||
|
logger.debug(f"Dequantizing random f16 data as {qtype.name} with C")
|
||||||
|
ggdq = ggml_quants.dequantize(rq, qtype)
|
||||||
|
|
||||||
|
dequant_equal = compare_tensors(pydq, ggdq, qtype)
|
||||||
|
|
||||||
|
if not dequant_equal:
|
||||||
|
logger.error(f"Dequantization from random f16 data as {qtype.name} does not match ❌")
|
||||||
|
else:
|
||||||
|
logger.info(f"Dequantization from random f16 data as {qtype.name} matches exactly ✅")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(description="Test Python (de)quantization against the reference C implementation")
|
||||||
|
parser.add_argument("--libggml", type=Path, default=Path(__file__).parent.parent.parent / "build" / "ggml" / "src" / "libggml.so", help="The path to libggml.so")
|
||||||
|
parser.add_argument("--quick", action="store_true", help="Don't quantize with C when it's not strictly necessary")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
|
||||||
|
do_test(args.libggml, args.quick)
|
|
@@ -93,15 +93,14 @@ extern "C" {
|
||||||
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
||||||
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
||||||
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
||||||
};
|
};
|
||||||
|
|
||||||
// note: these values should be synchronized with ggml_rope
|
|
||||||
// TODO: maybe move this enum to ggml.h (ggml_rope_type)
|
|
||||||
enum llama_rope_type {
|
enum llama_rope_type {
|
||||||
LLAMA_ROPE_TYPE_NONE = -1,
|
LLAMA_ROPE_TYPE_NONE = -1,
|
||||||
LLAMA_ROPE_TYPE_NORM = 0,
|
LLAMA_ROPE_TYPE_NORM = 0,
|
||||||
LLAMA_ROPE_TYPE_NEOX = 2,
|
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
|
||||||
LLAMA_ROPE_TYPE_GLM = 4,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
|
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
|
||||||
|
@@ -915,11 +914,8 @@ extern "C" {
|
||||||
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
||||||
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
|
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
|
||||||
|
|
||||||
// Returns -1 if unknown, 1 for true or 0 for false.
|
LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
|
||||||
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
|
LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
|
||||||
|
|
||||||
// Returns -1 if unknown, 1 for true or 0 for false.
|
|
||||||
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
|
|
||||||
|
|
||||||
// Codellama infill tokens
|
// Codellama infill tokens
|
||||||
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
||||||
|
|
|
@@ -85,14 +85,14 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra
|
||||||
constexpr float bucket_low = -10.0f;
|
constexpr float bucket_low = -10.0f;
|
||||||
constexpr float bucket_high = 10.0f;
|
constexpr float bucket_high = 10.0f;
|
||||||
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
|
constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
|
||||||
constexpr float bucker_inter = -bucket_low * bucket_scale;
|
constexpr float bucket_inter = -bucket_low * bucket_scale;
|
||||||
|
|
||||||
std::vector<int> bucket_idx(candidates->size);
|
std::vector<int> bucket_idx(candidates->size);
|
||||||
std::vector<int> histo(nbuckets, 0);
|
std::vector<int> histo(nbuckets, 0);
|
||||||
|
|
||||||
for (int i = 0; i < (int)candidates->size; ++i) {
|
for (int i = 0; i < (int)candidates->size; ++i) {
|
||||||
const float val = candidates->data[i].logit;
|
const float val = candidates->data[i].logit;
|
||||||
int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
|
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
|
||||||
ib = std::max(0, std::min(nbuckets-1, ib));
|
ib = std::max(0, std::min(nbuckets-1, ib));
|
||||||
bucket_idx[i] = ib;
|
bucket_idx[i] = ib;
|
||||||
++histo[ib];
|
++histo[ib];
|
||||||
|
|
|
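The bucket sort above maps each logit into one of nbuckets equal-width buckets over [-10, 10] via bucket_scale and bucket_inter, then clamps the index. A small standalone C++ sketch of just that mapping follows; nbuckets and the logit values here are made up, and in the sampler they come from the surrounding function.

// Sketch of the bucket-index computation used by llama_sample_top_k_impl.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    constexpr int   nbuckets     = 128;     // assumed value for illustration
    constexpr float bucket_low   = -10.0f;
    constexpr float bucket_high  =  10.0f;
    constexpr float bucket_scale = nbuckets / (bucket_high - bucket_low);
    constexpr float bucket_inter = -bucket_low * bucket_scale;

    const std::vector<float> logits = {-12.3f, -0.5f, 0.0f, 3.7f, 42.0f}; // made-up
    std::vector<int> histo(nbuckets, 0);

    for (float val : logits) {
        int ib = int(bucket_scale * val + bucket_inter); // linear map to bucket
        ib = std::max(0, std::min(nbuckets - 1, ib));    // clamp out-of-range logits
        ++histo[ib];
        std::printf("logit %6.2f -> bucket %d\n", val, ib);
    }
    return 0;
}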
@@ -410,6 +410,8 @@ struct llm_tokenizer_bpe {
|
||||||
};
|
};
|
||||||
break;
|
break;
|
||||||
case LLAMA_VOCAB_PRE_TYPE_PORO:
|
case LLAMA_VOCAB_PRE_TYPE_PORO:
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_BLOOM:
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
|
||||||
regex_exprs = {
|
regex_exprs = {
|
||||||
" ?[^(\\s|.,!?…。,、।۔،)]+",
|
" ?[^(\\s|.,!?…。,、।۔،)]+",
|
||||||
};
|
};
|
||||||
|
@@ -1466,11 +1468,11 @@ llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.special_pad_id;
|
return vocab.special_pad_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab) {
|
bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.tokenizer_add_bos;
|
return vocab.tokenizer_add_bos;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab) {
|
bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.tokenizer_add_eos;
|
return vocab.tokenizer_add_eos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@@ -95,8 +95,8 @@ llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
|
||||||
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
|
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
|
||||||
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
|
||||||
|
|
||||||
int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab);
|
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
|
||||||
int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab);
|
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
|
||||||
|
|
||||||
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
|
||||||
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
|
||||||
|
|
|
@@ -3575,13 +3575,8 @@ namespace GGUFMeta {
|
||||||
|
|
||||||
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
|
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
|
||||||
|
|
||||||
// TODO: update when needed or think of some clever automatic way to do this
|
static size_t llama_model_max_nodes(const llama_model & model) {
|
||||||
static size_t llama_model_max_nodes(const llama_model & /*model*/) {
|
return std::max<size_t>(8192, model.tensors_by_name.size()*5);
|
||||||
//if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
|
|
||||||
// return 32768;
|
|
||||||
//}
|
|
||||||
|
|
||||||
return 8192;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_model_loader {
|
struct llama_model_loader {
|
||||||
|
@@ -5472,6 +5467,12 @@ static void llm_load_vocab(
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "codeshell") {
|
tokenizer_pre == "codeshell") {
|
||||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "bloom") {
|
||||||
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "gpt3-finnish") {
|
||||||
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
|
||||||
} else {
|
} else {
|
||||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||||
}
|
}
|
||||||
|
@@ -18704,11 +18705,11 @@ llama_token llama_token_pad(const struct llama_model * model) {
|
||||||
return llama_token_pad_impl(model->vocab);
|
return llama_token_pad_impl(model->vocab);
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t llama_add_bos_token(const struct llama_model * model) {
|
bool llama_add_bos_token(const struct llama_model * model) {
|
||||||
return llama_add_bos_token_impl(model->vocab);
|
return llama_add_bos_token_impl(model->vocab);
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t llama_add_eos_token(const struct llama_model * model) {
|
bool llama_add_eos_token(const struct llama_model * model) {
|
||||||
return llama_add_eos_token_impl(model->vocab);
|
return llama_add_eos_token_impl(model->vocab);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
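With this change llama_add_bos_token() and llama_add_eos_token() return bool instead of int32_t, so the old "-1 means unknown" convention no longer applies and callers can branch on the result directly. A hypothetical caller-side sketch, assuming a model loaded elsewhere:

// Sketch of querying the updated add-BOS/add-EOS flags.
#include <cstdio>
#include "llama.h"

static void report_special_token_flags(const struct llama_model * model) {
    const bool add_bos = llama_add_bos_token(model);
    const bool add_eos = llama_add_eos_token(model);
    std::printf("add_bos: %d, add_eos: %d\n", add_bos, add_eos);
}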