commit b01ce7dfea
Merge remote-tracking branch 'upstream/master' into t5-clean-3

70 changed files with 2639 additions and 1698 deletions
@@ -17,19 +17,18 @@
   rocmPackages,
   vulkan-headers,
   vulkan-loader,
-  clblast,
+  curl,
   useBlas ? builtins.all (x: !x) [
     useCuda
     useMetalKit
-    useOpenCL
     useRocm
     useVulkan
   ] && blas.meta.available,
   useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
+  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
   useMpi ? false, # Increases the runtime closure size by ~700M
-  useOpenCL ? false,
   useRocm ? config.rocmSupport,
+  enableCurl ? true,
   useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

@@ -56,7 +55,6 @@ let
   ++ lib.optionals useCuda [ "CUDA" ]
   ++ lib.optionals useMetalKit [ "MetalKit" ]
   ++ lib.optionals useMpi [ "MPI" ]
-  ++ lib.optionals useOpenCL [ "OpenCL" ]
   ++ lib.optionals useRocm [ "ROCm" ]
   ++ lib.optionals useVulkan [ "Vulkan" ];

@@ -198,19 +196,19 @@ effectiveStdenv.mkDerivation (
   optionals effectiveStdenv.isDarwin darwinBuildInputs
   ++ optionals useCuda cudaBuildInputs
   ++ optionals useMpi [ mpi ]
-  ++ optionals useOpenCL [ clblast ]
   ++ optionals useRocm rocmBuildInputs
   ++ optionals useBlas [ blas ]
-  ++ optionals useVulkan vulkanBuildInputs;
+  ++ optionals useVulkan vulkanBuildInputs
+  ++ optionals enableCurl [ curl ];

   cmakeFlags =
     [
       (cmakeBool "LLAMA_BUILD_SERVER" true)
       (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
       (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+      (cmakeBool "LLAMA_CURL" enableCurl)
       (cmakeBool "GGML_NATIVE" false)
       (cmakeBool "GGML_BLAS" useBlas)
-      (cmakeBool "GGML_CLBLAST" useOpenCL)
       (cmakeBool "GGML_CUDA" useCuda)
       (cmakeBool "GGML_HIPBLAS" useRocm)
       (cmakeBool "GGML_METAL" useMetalKit)

@@ -254,7 +252,6 @@ effectiveStdenv.mkDerivation (
     useCuda
     useMetalKit
     useMpi
-    useOpenCL
     useRocm
     useVulkan
     ;

@@ -281,7 +278,7 @@ effectiveStdenv.mkDerivation (
   # Configurations we don't want even the CI to evaluate. Results in the
   # "unsupported platform" messages. This is mostly a no-op, because
   # cudaPackages would've refused to evaluate anyway.
-  badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
+  badPlatforms = optionals useCuda lib.platforms.darwin;

   # Configurations that are known to result in build failures. Can be
   # overridden by importing Nixpkgs with `allowBroken = true`.
.github/ISSUE_TEMPLATE/config.yml (vendored, 2 changed lines)
@@ -9,5 +9,3 @@ contact_links:
   - name: Want to contribute?
     url: https://github.com/ggerganov/llama.cpp/wiki/contribute
     about: Head to the contribution guide page of the wiki for areas you can help with
-
-
.github/workflows/build.yml (vendored, 7 changed lines)
@@ -47,7 +47,7 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

       - name: Test
@@ -105,7 +105,7 @@ jobs:
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON
+          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

       - name: Test
@@ -222,7 +222,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build . --config Release -j $(nproc)

       - name: Test
@@ -799,6 +799,7 @@ jobs:
           7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
           $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
           cd build
+          $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
           & $sde -future -- ctest -L main -C Release --verbose --timeout 900

       - name: Determine tag name
@@ -79,14 +79,21 @@ set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
 set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
 set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
-set(GGML_LLAMAFILE ON)
-set(GGML_CUDA_USE_GRAPHS ON)
+# change the default for these ggml options
+if (NOT DEFINED GGML_LLAMAFILE)
+    set(GGML_LLAMAFILE ON)
+endif()
+
+if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
+    set(GGML_CUDA_USE_GRAPHS ON)
+endif()

 # transition helpers
 function (llama_option_depr TYPE OLD NEW)
     if (${OLD})
         message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
-        set(${NEW} ON)
+        set(${NEW} ON PARENT_SCOPE)
     endif()
 endfunction()

@@ -96,7 +103,6 @@ llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
 llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
 llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
 llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
-llama_option_depr(WARNING LLAMA_OPENMP GGML_OPENMP)
 llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
 llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
 llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
@@ -19,6 +19,7 @@
     "cacheVariables": {
       "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
       "CMAKE_CXX_COMPILER": "icx",
+      "CMAKE_C_COMPILER": "cl",
       "GGML_SYCL": "ON",
       "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
     }
Makefile (8 changed lines)
@@ -45,6 +45,7 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
 	tests/test-autorelease \
 	tests/test-backend-ops \
+	tests/test-chat-template \
 	tests/test-double-float \
 	tests/test-grad0 \
 	tests/test-grammar-integration \
@@ -61,6 +62,11 @@ TEST_TARGETS = \
 	tests/test-tokenizer-1-bpe \
 	tests/test-tokenizer-1-spm

+# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
+LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
+	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
+
 # Deprecation aliases
 ifdef LLAMA_CUBLAS
 $(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
@@ -1070,6 +1076,7 @@ clean:
 	rm -rvf src/*.o
 	rm -rvf tests/*.o
 	rm -rvf examples/*.o
+	rm -rvf common/*.o
 	rm -rvf *.a
 	rm -rvf *.dll
 	rm -rvf *.so
@@ -1084,6 +1091,7 @@ clean:
 	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
+	rm -rvf $(LEGACY_TARGETS)
 	find examples pocs -type f -name "*.o" -delete

 #
@@ -108,6 +108,7 @@ Typically finetunes of the base models below are supported as well.
 - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
+- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
@@ -217,6 +218,11 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 **Tools:**

 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
+- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
+
+**Infrastructure:**
+
+- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp

 ---
@@ -1014,16 +1014,23 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params) {
     }
     if (arg == "--in-prefix-bos") {
         params.input_prefix_bos = true;
+        params.enable_chat_template = false;
         return true;
     }
     if (arg == "--in-prefix") {
         CHECK_ARG
         params.input_prefix = argv[i];
+        params.enable_chat_template = false;
         return true;
     }
     if (arg == "--in-suffix") {
         CHECK_ARG
         params.input_suffix = argv[i];
+        params.enable_chat_template = false;
+        return true;
+    }
+    if (arg == "--spm-infill") {
+        params.spm_infill = true;
         return true;
     }
     if (arg == "--grammar") {
@@ -1402,13 +1409,15 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
         "halt generation at PROMPT, return control in interactive mode\n"
         "can be specified more than once for multiple prompts" });
     options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
-    options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
+    options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: %s)", params.conversation ? "true" : "false" });
     options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
     options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
     options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
     options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
     options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
     options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+    options.push_back({ "server infill",
+        " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });

     options.push_back({ "sampling" });
     options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
@@ -2635,6 +2644,7 @@ std::string llama_chat_apply_template(const struct llama_model * model,
         const std::vector<llama_chat_msg> & msgs,
         bool add_ass) {
     int alloc_size = 0;
+    bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
     for (auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
@@ -2647,10 +2657,26 @@ std::string llama_chat_apply_template(const struct llama_model * model,
     // run the first time to get the total output length
     int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

+    // error: chat template is not supported
+    if (res < 0) {
+        if (ptr_tmpl != nullptr) {
+            // if the custom "tmpl" is not supported, we throw an error
+            // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+            throw std::runtime_error("this custom template is not supported");
+        } else {
+            // If the built-in template is not supported, we default to chatml
+            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+            fallback = true;
+        }
+    }
+
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
-        res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        res = llama_chat_apply_template(
+            fallback ? nullptr : model,
+            fallback ? "chatml" : ptr_tmpl,
+            chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }

     std::string formatted_chat(buf.data(), res);
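The fallback logic above changes the wrapper's error behavior, which is easiest to see from the caller's side. A minimal sketch, assuming common/common.h is included, `model` points to a loaded `llama_model`, and the variable names are illustrative rather than taken from the patch:

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

// assumes: common/common.h provides llama_chat_msg and the wrapper below
void demo(const llama_model * model, const std::string & user_tmpl) {
    std::vector<llama_chat_msg> msgs = {
        {"system", "You are a helpful assistant"},
        {"user",   "Hello"},
    };

    // An empty tmpl selects the model's built-in template; if that template is
    // unknown, the wrapper now silently formats with chatml instead of failing.
    std::string prompt = llama_chat_apply_template(model, "", msgs, /*add_ass=*/true);

    // A non-empty custom template that is unsupported throws, so callers that
    // accept user-supplied templates should be prepared to catch it.
    try {
        prompt = llama_chat_apply_template(model, user_tmpl, msgs, /*add_ass=*/true);
    } catch (const std::runtime_error & e) {
        fprintf(stderr, "unsupported template: %s\n", e.what());
    }
}
```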
@@ -2662,12 +2688,19 @@ std::string llama_chat_format_single(const struct llama_model * model,
         const std::vector<llama_chat_msg> & past_msg,
         const llama_chat_msg & new_msg,
         bool add_ass) {
+    std::ostringstream ss;
     auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
     std::vector<llama_chat_msg> chat_new(past_msg);
+    // if the past_msg ends with a newline, we must preserve it in the formatted version
+    if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
+        ss << "\n";
+    };
+    // format chat with new_msg
     chat_new.push_back(new_msg);
     auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
-    auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
-    return formatted;
+    // get the diff part
+    ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+    return ss.str();
 }

 std::string llama_chat_format_example(const struct llama_model * model,
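`llama_chat_format_single` works by rendering the conversation twice and keeping only the suffix the new message contributed. A worked example with invented template strings (not a real llama.cpp template):

```cpp
#include <cassert>
#include <string>

int main() {
    // Hypothetical template output, for illustration only:
    std::string fmt_past = "<|user|>\nhi<|end|>\n";                  // past messages alone
    std::string fmt_all  = "<|user|>\nhi<|end|>\n<|assistant|>\n";   // past + new message
    // the function returns only what the new message added:
    std::string delta = fmt_all.substr(fmt_past.size());
    assert(delta == "<|assistant|>\n");
}
```

The new `std::ostringstream` guard appears to compensate for templates that drop the trailing newline of the history when it is re-rendered with the assistant prompt appended; re-adding the `"\n"` keeps the returned delta aligned with what the caller has already printed.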
@@ -2821,125 +2854,87 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 //

 static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
-    int32_t n_tensors;
-    size_t n_bytes = 0;
-    uint32_t max_direction_layer = 0;
-
     llama_control_vector_data result = { -1, {} };

-    // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
-    {
-        struct ggml_init_params meta_params = {
-            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
-            /* .mem_buffer = */ nullptr,
-            /* .no_alloc   = */ true,
-        };
-        ggml_context * meta_ctx = ggml_init(meta_params);
-        struct gguf_init_params meta_gguf_params = {
-            /* .no_alloc = */ true,
-            /* .ctx      = */ &meta_ctx,
-        };
-        struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
-        if (!meta_ctx_gguf) {
-            fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-            ggml_free(meta_ctx);
-            return result;
-        }
-
-        n_tensors = gguf_get_n_tensors(meta_ctx_gguf);
-        for (int i = 0; i < n_tensors; i++) {
-            std::string name = gguf_get_tensor_name(meta_ctx_gguf, i);
-
-            // split on '.'
-            size_t dotpos = name.find('.');
-            if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
-                try {
-                    uint32_t layer = std::stoi(name.substr(dotpos + 1));
-                    if (layer == 0) {
-                        fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                        ggml_free(meta_ctx);
-                        gguf_free(meta_ctx_gguf);
-                        return result;
-                    }
-                    if (layer > max_direction_layer) {
-                        max_direction_layer = layer;
-                    }
-                } catch (...) {
-                    fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                    ggml_free(meta_ctx);
-                    gguf_free(meta_ctx_gguf);
-                    return result;
-                }
-            }
-
-            struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
-            if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
-                fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            if (result.n_embd == -1) {
-                result.n_embd = ggml_nelements(tensor_meta);
-            } else if (ggml_nelements(tensor_meta) != result.n_embd) {
-                fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
-                gguf_free(meta_ctx_gguf);
-                return result;
-            }
-            n_bytes += ggml_nbytes(tensor_meta);
-        }
-        ggml_free(meta_ctx);
-        gguf_free(meta_ctx_gguf);
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
+    };
+    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
+        return result;
     }

+    int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
     if (n_tensors == 0) {
         fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
-        return result;
     }

-    // load and scale tensors into final control vector context
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
-        /* .mem_buffer = */ nullptr,
-        /* .no_alloc   = */ false,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
-
-    struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx,
-    };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params);
-    if (!ctx_gguf) {
-        fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str());
-        ggml_free(ctx);
-        return result;
-    }
-
-    // do not store data for layer 0 (it's not used)
-    result.data.resize(result.n_embd * max_direction_layer);
-
-    for (uint32_t il = 1; il <= max_direction_layer; il++) {
-        const std::string name = "direction." + std::to_string(il);
-        const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
-
-        float * dst = result.data.data() + result.n_embd * (il - 1);
-
-        if (tensor) {
-            const float * src = (const float *) tensor->data;
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = src[j] * load_info.strength;
-            }
-        } else {
-            for (int j = 0; j < result.n_embd; j++) {
-                dst[j] = 0.0f;
-            }
-        }
-    }
+    for (int i = 0; i < n_tensors; i++) {
+        std::string name = gguf_get_tensor_name(ctx_gguf, i);
+
+        int layer_idx = -1;
+
+        // split on '.'
+        size_t dotpos = name.find('.');
+        if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") {
+            try {
+                layer_idx = std::stoi(name.substr(dotpos + 1));
+            } catch (...) {
+                layer_idx = -1;
+            }
+        }
+        if (layer_idx < 0) {
+            fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        } else if (layer_idx == 0) {
+            fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
+        if (tensor->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+        if (ggml_n_dims(tensor) != 1) {
+            fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        if (result.n_embd == -1) {
+            result.n_embd = ggml_nelements(tensor);
+        } else if (ggml_nelements(tensor) != result.n_embd) {
+            fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
+            result.n_embd = -1;
+            break;
+        }
+
+        // extend if necessary - do not store data for layer 0 (it's not used)
+        result.data.resize(std::max(result.data.size(), static_cast<size_t>(result.n_embd * layer_idx)), 0.0f);
+
+        const float * src = (const float *) tensor->data;
+        float * dst = result.data.data() + result.n_embd * (layer_idx - 1); // layer 1 at [0]
+        for (int j = 0; j < result.n_embd; j++) {
+            dst[j] += src[j] * load_info.strength; // allows multiple directions for same layer in same file
+        }
+    }

+    if (result.n_embd == -1) {
+        fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
+        result.data.clear();
+    }
+
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+
     return result;
 }
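Besides collapsing two GGUF passes into one, the rewrite changes assignment into accumulation: `dst[j] += src[j] * strength` means a file may legitimately contain several `direction.<k>` tensors for the same layer, and their contributions sum. A self-contained toy model of that bookkeeping (all values invented):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    int n_embd = 2;
    std::vector<float> data;  // stands in for result.data
    // two "direction.1" tensors in the same file, applied with strength 2.0
    std::vector<std::vector<float>> tensors = {{0.5f, 0.5f}, {0.25f, -0.5f}};
    float strength  = 2.0f;
    int   layer_idx = 1;
    for (const auto & src : tensors) {
        // extend on demand, zero-filling; layer k is stored at [n_embd * (k - 1)]
        data.resize(std::max(data.size(), (size_t)(n_embd * layer_idx)), 0.0f);
        float * dst = data.data() + n_embd * (layer_idx - 1);
        for (int j = 0; j < n_embd; j++) {
            dst[j] += src[j] * strength;  // repeated layers now sum
        }
    }
    printf("%g %g\n", data[0], data[1]);  // prints: 1.5 0
}
```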
@@ -2950,16 +2945,19 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
         auto cur = llama_control_vector_load_one(info);

         if (cur.n_embd == -1) {
-            return result;
+            result.n_embd = -1;
+            break;
         }
-        if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
-            fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
-            return result;
+        if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
+            fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
+            result.n_embd = -1;
+            break;
         }

         if (result.n_embd == -1) {
             result = std::move(cur);
         } else {
+            result.data.resize(std::max(result.data.size(), cur.data.size()), 0.0f); // extend if necessary
             for (size_t i = 0; i < cur.data.size(); i++) {
                 result.data[i] += cur.data[i];
             }
@@ -2967,7 +2965,8 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
     }

     if (result.n_embd == -1) {
-        fprintf(stderr, "%s: no vectors passed\n", __func__);
+        fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
+        result.data.clear();
     }

     return result;
@@ -200,6 +200,7 @@ struct gpt_params {
     std::string public_path = "";
     std::string chat_template = "";
     std::string system_prompt = "";
+    bool enable_chat_template = true;

     std::vector<std::string> api_keys;
@@ -250,6 +251,8 @@ struct gpt_params {
     std::string cvector_outfile       = "control_vector.gguf";
     std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
     std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+
+    bool spm_infill = false; // suffix/prefix/middle pattern for infill
 };

 void gpt_params_handle_model_default(gpt_params & params);
@@ -380,6 +383,8 @@ struct llama_chat_msg {
 bool llama_chat_verify_template(const std::string & tmpl);

 // CPP wrapper for llama_chat_apply_template
+// If the built-in template is not supported, we default to chatml
+// If the custom "tmpl" is not supported, we throw an error
 std::string llama_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
         const std::vector<llama_chat_msg> & chat,
@@ -454,4 +459,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
 void yaml_dump_non_result_info(
     FILE * stream, const gpt_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
@@ -316,7 +316,7 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
 };

 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
-std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
+std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

 template <typename Iterator>
 std::string join(Iterator begin, Iterator end, const std::string & separator) {
@@ -720,7 +720,7 @@ private:
             }
             prop_names.push_back(prop_name);
         }
-        if (!(additional_properties.is_boolean() && !additional_properties.get<bool>())) {
+        if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
            std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
            std::string value_rule =
                additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
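The rewritten condition flips the gate from a blacklist to a whitelist: previously anything other than an explicit `false` (including `null`, i.e. `additionalProperties` absent) produced the extra `-additional` rule; now only an explicit `true` or an object schema does. A minimal restatement of the predicate, assuming the `nlohmann::json` type the converter already uses:

```cpp
#include <nlohmann/json.hpp>
using json = nlohmann::json;

bool emits_additional_rule(const json & additional_properties) {
    // old: !(is_boolean && value == false)  -- also fired for null/absent
    // new: only an explicit `true` or an object sub-schema emits the rule
    return (additional_properties.is_boolean() && additional_properties.get<bool>())
        || additional_properties.is_object();
}
```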
@@ -86,6 +86,8 @@ models = [
     {"name": "smaug-bpe",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
     {"name": "poro-chat",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
     {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
+    {"name": "viking",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
+    {"name": "jais",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
     {"name": "t5",           "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
 ]
@@ -487,6 +487,12 @@ class Model:
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
+        if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
+            # ref: https://huggingface.co/LumiOpen/Viking-7B
+            res = "viking"
+        if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
+            # ref: https://huggingface.co/core42/jais-13b
+            res = "jais"

         if res is None:
             logger.warning("\n")
@@ -573,7 +579,19 @@ class Model:
             special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)

-    def _set_vocab_sentencepiece(self):
+    def _set_vocab_sentencepiece(self, add_to_gguf=True):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def _create_vocab_sentencepiece(self):
         from sentencepiece import SentencePieceProcessor

         tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -635,14 +653,7 @@ class Model:
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.UNUSED)

-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        return tokens, scores, toktypes

     def _set_vocab_llama_hf(self):
         vocab = gguf.LlamaHfVocab(self.dir_model)
@@ -2337,6 +2348,70 @@ class GemmaModel(Model):
         return [(self.map_tensor_name(name), data_torch)]


+@Model.register("Gemma2ForCausalLM")
+class Gemma2Model(Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA2
+
+    def set_vocab(self):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+        # hack: This is required so that we can properly use start/end-of-turn for chat template
+        for i in range(108):
+            # including <unusedX>, <start_of_turn>, <end_of_turn>
+            toktypes[i] = SentencePieceTokenTypes.CONTROL
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+        self.gguf_writer.add_add_space_prefix(False)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_key_length(hparams["head_dim"])
+        self.gguf_writer.add_value_length(hparams["head_dim"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_attn_logit_softcapping(
+            self.hparams["attn_logit_softcapping"]
+        )
+        self.gguf_writer.add_final_logit_softcapping(
+            self.hparams["final_logit_softcapping"]
+        )
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+
+        # sanity check
+        attn_scalar = self.hparams["query_pre_attn_scalar"]
+        if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]:
+            raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
+        # To prevent errors, skip loading lm_head.weight.
+        if name == "lm_head.weight":
+            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
+        if name.endswith("norm.weight"):
+            data_torch = data_torch + 1
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @Model.register("Starcoder2ForCausalLM")
 class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
@@ -2919,6 +2994,96 @@ class T5Model(Model):
         return [(self.map_tensor_name(name), data_torch)]


+@Model.register("JAISLMHeadModel")
+class JaisModel(Model):
+    model_arch = gguf.MODEL_ARCH.JAIS
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # SwiGLU activation
+        assert self.hparams["activation_function"] == "swiglu"
+        # ALiBi position embedding
+        assert self.hparams["position_embedding_type"] == "alibi"
+
+        # Embeddings scale
+        self.embeddings_scale = 1.0
+        # note: For some JAIS flavors, output is tied to (same as) wte in original model
+        self.output_is_wte = False
+        if 'mup_embeddings_scale' in self.hparams:
+            self.output_is_wte = True  # Hack (?)
+            self.embeddings_scale = self.hparams['mup_embeddings_scale']
+        elif 'embeddings_scale' in self.hparams:
+            self.embeddings_scale = self.hparams['embeddings_scale']
+        else:
+            assert False
+
+        self.width_scale = 1.0
+        if 'mup_output_alpha' in self.hparams:
+            assert 'mup_width_scale' in self.hparams
+            self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
+        elif 'width_scale' in self.hparams:
+            self.width_scale = self.hparams['width_scale']
+        else:
+            assert False
+
+        self.max_alibi_bias = 8.0
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        # we don't need these
+        if name.endswith((".attn.bias")):
+            return tensors
+
+        if name.endswith(("relative_pe.slopes")):
+            # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
+            # Some other models have max_alibi_bias spelled out explicitly in the hyperparams,
+            # but Jais's PyTorch model simply precalculates the slope values and places them
+            # in relative_pes.slopes
+            n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
+            first_val = float(data_torch._data[0])
+            self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
+
+            return tensors
+
+        if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
+            data_torch = data_torch.transpose(1, 0)
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
+            tensors.append((new_name, data_torch * self.embeddings_scale))
+            if self.output_is_wte:
+                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
+        elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            assert not self.output_is_wte
+            tensors.append((new_name, data_torch * self.width_scale))
+        else:
+            tensors.append((new_name, data_torch))
+
+        return tensors
+
+    def write_tensors(self):
+        super().write_tensors()
+        self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
+
+
 ###### CONVERSION LOGIC ######
@@ -3074,7 +3239,8 @@ def main() -> None:
         "auto": gguf.LlamaFileType.GUESSED,
     }

-    if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
+    is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
+    if args.use_temp_file and is_split:
         logger.error("Error: Cannot use temp file when splitting")
         sys.exit(1)
@@ -3111,11 +3277,12 @@ def main() -> None:
         if args.vocab_only:
             logger.info("Exporting model vocab...")
             model_instance.write_vocab()
-            logger.info("Model vocab successfully exported.")
+            logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
         else:
             logger.info("Exporting model...")
             model_instance.write()
-            logger.info("Model successfully exported.")
+            out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
+            logger.info(f"Model successfully exported to {out_path}")


 if __name__ == '__main__':
@@ -58,4 +58,3 @@ The above command will output space-separated float values.
 ```powershell
 embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
 ```
-
@@ -15,6 +15,7 @@ In this section, we cover the most commonly used options for running the `infill` program
 - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
 - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+- `--spm-infill`: Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.

 ## Input Prompts
@@ -210,6 +210,7 @@ int main(int argc, char ** argv) {
         suff_rm_leading_spc = false;
     }
     std::vector<llama_token> embd_inp;
+    std::vector<llama_token> embd_end;
     std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
     std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
     const int space_token = 29871;
|
||||||
inp_sfx.erase(inp_sfx.begin());
|
inp_sfx.erase(inp_sfx.begin());
|
||||||
}
|
}
|
||||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
|
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
|
||||||
if (add_bos) {
|
|
||||||
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
|
|
||||||
}
|
|
||||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
||||||
embd_inp = inp_pfx;
|
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||||
|
if (add_bos) {
|
||||||
|
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
|
||||||
|
}
|
||||||
|
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||||
|
|
||||||
const llama_token middle_token = llama_token_middle(model);
|
const llama_token middle_token = llama_token_middle(model);
|
||||||
if (middle_token >= 0) {
|
if (middle_token >= 0) {
|
||||||
|
@@ -526,14 +528,14 @@ int main(int argc, char ** argv) {
                 inp_sfx.erase(inp_sfx.begin());
             }
             inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-            if (add_bos) {
-                inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
-            }
             inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
-            embd_inp = inp_pfx;
-            embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+            embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
+            embd_end = params.spm_infill ? inp_pfx : inp_sfx;
+            if (add_bos) {
+                embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+            }
+            embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

-            const llama_token middle_token = llama_token_middle(model);
             if (middle_token >= 0) {
                 embd_inp.push_back(middle_token);
             }
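Both hunks implement the same reordering, so a compact sketch of the resulting prompt layout may help. Strings stand in for llama tokens, `spm` mirrors `params.spm_infill`, and the names here are illustrative, not from the patch:

```cpp
#include <string>
#include <vector>

// Toy illustration of the order in which the reworked code assembles the
// infill prompt. "<PRE>", "<SUF>", "<MID>", "<BOS>" stand for the FIM and
// begin-of-sequence special tokens.
std::vector<std::string> build_infill(bool spm, bool add_bos) {
    std::vector<std::string> pfx = {"<PRE>", "int", "main("};  // prefix side
    std::vector<std::string> sfx = {"<SUF>", "}"};             // suffix side
    std::vector<std::string> inp = spm ? sfx : pfx;            // embd_inp
    std::vector<std::string> end = spm ? pfx : sfx;            // embd_end
    if (add_bos) {
        inp.insert(inp.begin(), "<BOS>");                      // BOS once, at the very front
    }
    inp.insert(inp.end(), end.begin(), end.end());
    inp.push_back("<MID>");                                    // middle token, if the model has one
    return inp;
}
```

With `spm = false` this yields `<BOS> <PRE> prefix <SUF> suffix <MID>` (prefix/suffix/middle, the old order, but with BOS moved to the front of the whole prompt); with `spm = true` it yields `<BOS> <SUF> suffix <PRE> prefix <MID>` for models that prefer the suffix-first pattern.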
@@ -657,4 +659,3 @@ int main(int argc, char ** argv) {

     return 0;
 }
-
@@ -231,7 +231,7 @@ GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
 GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}

 NON_LITERAL_SET = set('|.()[]{}*+?')
-ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
+ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('^$.[]()|{}*+?')


 class SchemaConverter:
@@ -602,7 +602,7 @@ class SchemaConverter:
             else:
                 add_component(t, is_required=True)

-            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[]))
+            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))

         elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
             items = schema.get('items') or schema['prefixItems']
@ -691,7 +691,7 @@ class SchemaConverter:
|
||||||
required_props = [k for k in sorted_props if k in required]
|
required_props = [k for k in sorted_props if k in required]
|
||||||
optional_props = [k for k in sorted_props if k not in required]
|
optional_props = [k for k in sorted_props if k not in required]
|
||||||
|
|
||||||
if additional_properties != False:
|
if additional_properties is not None and additional_properties != False:
|
||||||
sub_name = f'{name}{"-" if name else ""}additional'
|
sub_name = f'{name}{"-" if name else ""}additional'
|
||||||
value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
|
value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
|
||||||
self._add_primitive('value', PRIMITIVE_RULES['value'])
|
self._add_primitive('value', PRIMITIVE_RULES['value'])
|
||||||
|
|
|
@@ -1,55 +0,0 @@
-
-# For more information about using CMake with Android Studio, read the
-# documentation: https://d.android.com/studio/projects/add-native-code.html.
-# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
-
-# Sets the minimum CMake version required for this project.
-cmake_minimum_required(VERSION 3.22.1)
-
-# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
-# Since this is the top level CMakeLists.txt, the project name is also accessible
-# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
-# build script scope).
-project("llama-android")
-
-## Fetch latest llama.cpp from GitHub
-#include(FetchContent)
-#FetchContent_Declare(
-#        llama
-#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-#        GIT_TAG        master
-#)
-#
-## Also provides "common"
-#FetchContent_MakeAvailable(llama)
-
-# llama.cpp CI uses the code from the current branch
-# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
-add_subdirectory(../../../../../../ build-llama)
-
-# Creates and names a library, sets it as either STATIC
-# or SHARED, and provides the relative paths to its source code.
-# You can define multiple libraries, and CMake builds them for you.
-# Gradle automatically packages shared libraries with your APK.
-#
-# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
-# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
-# is preferred for the same purpose.
-#
-# In order to load a library into your app from Java/Kotlin, you must call
-# System.loadLibrary() and pass the name of the library defined here;
-# for GameActivity/NativeActivity derived applications, the same library name must be
-# used in the AndroidManifest.xml file.
-add_library(${CMAKE_PROJECT_NAME} SHARED
-        # List C/C++ source files with relative paths to this CMakeLists.txt.
-        llama-android.cpp)
-
-# Specifies libraries CMake should link to your target library. You
-# can link libraries from various origins, such as libraries defined in this
-# build script, prebuilt third-party libraries, or Android system libraries.
-target_link_libraries(${CMAKE_PROJECT_NAME}
-        # List libraries link to the target library
-        llama
-        common
-        android
-        log)
@@ -11,15 +11,15 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")
 
-include(FetchContent)
-FetchContent_Declare(
-        llama
-        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-        GIT_TAG        master
-)
+#include(FetchContent)
+#FetchContent_Declare(
+#        llama
+#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_TAG        master
+#)
 
 # Also provides "common"
-FetchContent_MakeAvailable(llama)
+#FetchContent_MakeAvailable(llama)
 
 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
@@ -30,6 +30,10 @@ FetchContent_MakeAvailable(llama)
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
+
+#load local llama.cpp
+add_subdirectory(../../../../../../ build-llama)
+
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be
@@ -10,4 +10,3 @@ More info:
 
 https://github.com/ggerganov/llama.cpp/pull/4484
 https://github.com/ggerganov/llama.cpp/issues/4226
-
examples/main-cmake-pkg/.gitignore (vendored, 1 change)
@@ -48,4 +48,3 @@
 build*/
 out/
 tmp/
-
@@ -30,4 +30,3 @@ target_include_directories(${TARGET} PRIVATE ${_common_path})
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-
@@ -263,7 +263,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd_inp;
 
     {
-        auto prompt = params.conversation
+        auto prompt = (params.conversation && params.enable_chat_template)
             ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
             : params.prompt;
         if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
@@ -829,7 +829,9 @@ int main(int argc, char ** argv) {
                     is_antiprompt = true;
                 }
 
-                chat_add_and_format(model, chat_msgs, "system", assistant_ss.str());
+                if (params.enable_chat_template) {
+                    chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
+                }
                 is_interacting = true;
                 printf("\n");
             }
@@ -891,12 +893,13 @@ int main(int argc, char ** argv) {
                     string_process_escapes(buffer);
                 }
 
-                std::string user_inp = params.conversation
+                bool format_chat = params.conversation && params.enable_chat_template;
+                std::string user_inp = format_chat
                     ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
                     : std::move(buffer);
                 // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                 const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation);
+                const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
                 const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
 
                 LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
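Note: the main.cpp hunks above gate chat-template formatting on a single flag. A hedged sketch of the pattern (prepare_input and the "<|user|>" markup are illustrative stand-ins, not the diff's chat_add_and_format or any real template):

// Only apply the template when both conversation mode and templating are on;
// otherwise the raw user text passes through and is tokenized without special parsing.
#include <string>
std::string prepare_input(std::string buffer, bool conversation, bool enable_chat_template) {
    const bool format_chat = conversation && enable_chat_template;
    if (!format_chat) {
        return buffer;                    // raw text path
    }
    return "<|user|>\n" + buffer;         // assumed, model-specific template markup
}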
@@ -31,4 +31,3 @@ for i in range(n-1):
         embedding2 = np.array(result[j])
         similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
         print(f"Similarity between {i} and {j}: {similarity:.2f}")
-
@@ -73,6 +73,7 @@ The project is under active development, and we are [looking for feedback and co
 - `-fa`, `--flash-attn` : enable flash attention (default: disabled).
 - `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
 - `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
+- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
 
 **If compiled with `LLAMA_SERVER_SSL=ON`**
 - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
@@ -259,7 +259,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
 const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
 
 const NON_LITERAL_SET = new Set('|.()[]{}*+?');
-const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');
+const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?');
 
 export class SchemaConverter {
   constructor(options) {
@@ -751,7 +751,7 @@ export class SchemaConverter {
     const requiredProps = sortedProps.filter(k => required.has(k));
     const optionalProps = sortedProps.filter(k => !required.has(k));
 
-    if (additionalProperties !== false) {
+    if (additionalProperties) {
       const subName = `${name ?? ''}${name ? '-' : ''}additional`;
       const valueRule =
         additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`)
@@ -2020,6 +2020,7 @@ struct server_context {
                 slot.t_start_generation = 0;
 
                 if (slot.infill) {
+                    const bool add_bos = llama_should_add_bos_token(model);
                     bool suff_rm_leading_spc = true;
                     if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
                         params.input_suffix.erase(0, 1);
@@ -2035,16 +2036,21 @@ struct server_context {
                     }
 
                     prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                    prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
-                    prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
-                    prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
+                    suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
+
+                    auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
+                    auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+                    if (add_bos) {
+                        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+                    }
+                    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
 
                     const llama_token middle_token = llama_token_middle(model);
                     if (middle_token >= 0) {
-                        prefix_tokens.push_back(middle_token);
+                        embd_inp.push_back(middle_token);
                     }
 
-                    prompt_tokens = prefix_tokens;
+                    prompt_tokens = embd_inp;
                 } else {
                     prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
                 }
@@ -52,4 +52,3 @@ Feature: Passkey / Self-extend with context shift
       #| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 |
       #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0
       # 987 |
-
@@ -1054,4 +1054,3 @@
   </body>
 
 </html>
-
@@ -1058,4 +1058,3 @@
   </body>
 
 </html>
-
@@ -34,4 +34,3 @@ fi
 
 #use multiple GPUs with same max compute units
 #ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
-
@@ -31,4 +31,3 @@ exit /B 0
 :ERROR
 echo comomand error: %errorlevel%
 exit /B %errorlevel%
-
@@ -7,5 +7,3 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 
-
 .\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
 
-
flake.lock (generated, 6 changes)
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1718318537,
-        "narHash": "sha256-4Zu0RYRcAY/VWuu6awwq4opuiD//ahpc2aFHg2CWqFY=",
+        "lastModified": 1719506693,
+        "narHash": "sha256-C8e9S7RzshSdHB7L+v9I51af1gDM5unhJ2xO1ywxNH8=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "e9ee548d90ff586a6471b4ae80ae9cfcbceb3420",
+        "rev": "b2852eb9365c6de48ffb0dc2c9562591f652242a",
         "type": "github"
       },
       "original": {
@@ -63,4 +63,3 @@ GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
 #ifdef __cplusplus
 }
 #endif
-
@@ -486,9 +486,11 @@ if (GGML_SYCL)
     add_compile_options(-I./) #include DPCT
 
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
     if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
+        add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+    else()
+        add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
     endif()
 
     file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
@@ -1166,7 +1168,9 @@ target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})
 
 find_library(MATH_LIBRARY m)
 if (MATH_LIBRARY)
-    target_link_libraries(ggml PRIVATE ${MATH_LIBRARY})
+    if (NOT WIN32 OR NOT GGML_SYCL)
+        target_link_libraries(ggml PRIVATE ${MATH_LIBRARY})
+    endif()
 endif()
 
 if (BUILD_SHARED_LIBS)
@@ -106,19 +106,19 @@ typedef sycl::half2 ggml_half2;
 #define QR6_K 2
 
 #define QI2_XXS (QK_K / (4*QR2_XXS))
-#define QR2_XXS 8
+#define QR2_XXS 4
 
 #define QI2_XS (QK_K / (4*QR2_XS))
-#define QR2_XS 8
+#define QR2_XS 4
 
 #define QI2_S (QK_K / (4*QR2_S))
-#define QR2_S 8
+#define QR2_S 4
 
 #define QI3_XXS (QK_K / (4*QR3_XXS))
-#define QR3_XXS 8
+#define QR3_XXS 4
 
 #define QI3_XS (QK_K / (4*QR3_XS))
-#define QR3_XS 8
+#define QR3_XS 4
 
 #define QI1_S (QK_K / (4*QR1_S))
 #define QR1_S 8
@@ -130,10 +130,10 @@ typedef sycl::half2 ggml_half2;
 #define QR4_NL 2
 
 #define QI4_XS (QK_K / (4*QR4_XS))
-#define QR4_XS 8
+#define QR4_XS 2
 
 #define QI3_S (QK_K / (4*QR3_S))
-#define QR3_S 8
+#define QR3_S 4
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
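Note: the QR edits above matter because each QI macro is derived from its QR. A minimal sketch of the arithmetic, assuming QK_K = 256 and reading QR as "quantized values packed per byte" (our interpretation, not stated in the diff):

// One IQ2_XXS superblock: 2-bit values, 4 per byte, so the quant payload is
// QK_K / QR bytes = QK_K / (4*QR) 32-bit ints, which is exactly QI.
#include <cstdio>
int main() {
    const int QK_K    = 256;
    const int QR2_XXS = 4;                       // 2-bit values: 4 per byte
    const int QI2_XXS = QK_K / (4 * QR2_XXS);    // 256 / 16 = 16 ints
    printf("IQ2_XXS: %d ints = %d bytes of quants per superblock\n",
           QI2_XXS, 4 * QI2_XXS);                // 16 ints = 64 bytes
    return 0;
}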
@@ -1882,6 +1882,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     bool use_mul_mat_q = ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 
+    // if mmvq is available it's a better choice than dmmv:
+#ifndef GGML_CUDA_FORCE_DMMV
+    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+#endif // GGML_CUDA_FORCE_DMMV
+
     bool any_gpus_with_slow_fp16 = false;
 
     if (split) {
@@ -1894,22 +1899,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             }
 
             const int cc = ggml_cuda_info().devices[id].cc;
-            use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
             use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
         }
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
-        use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
     }
 
-    // if mmvq is available it's a better choice than dmmv:
-#ifndef GGML_CUDA_FORCE_DMMV
-    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
-#endif // GGML_CUDA_FORCE_DMMV
-
     // debug helpers
     //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
     //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -2713,27 +2711,40 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
             {
-                struct ggml_tensor * a;
-                struct ggml_tensor * b;
+                struct ggml_tensor * a = op->src[0];
                 if (op->op == GGML_OP_MUL_MAT) {
-                    a = op->src[0];
-                    b = op->src[1];
-                } else {
-                    a = op->src[2];
-                    b = op->src[1];
-                }
-                if (a->ne[3] != b->ne[3]) {
-                    return false;
-                }
-                ggml_type a_type = a->type;
-                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
-                    a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
-                    a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
-                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+                    struct ggml_tensor * b = op->src[1];
+                    if (a->ne[3] != b->ne[3]) {
                         return false;
                     }
                 }
-                return true;
+                switch (a->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_Q2_K:
+                    case GGML_TYPE_Q3_K:
+                    case GGML_TYPE_Q4_K:
+                    case GGML_TYPE_Q5_K:
+                    case GGML_TYPE_Q6_K:
+                    case GGML_TYPE_Q8_K:
+                    case GGML_TYPE_IQ1_M:
+                    case GGML_TYPE_IQ1_S:
+                    case GGML_TYPE_IQ2_S:
+                    case GGML_TYPE_IQ2_XS:
+                    case GGML_TYPE_IQ2_XXS:
+                    case GGML_TYPE_IQ3_S:
+                    case GGML_TYPE_IQ3_XXS:
+                    case GGML_TYPE_IQ4_NL:
+                    case GGML_TYPE_IQ4_XS:
+                        return true;
+                    default:
+                        return false;
+                }
             } break;
         case GGML_OP_GET_ROWS:
            {
@@ -3,6 +3,7 @@
 #include "ggml.h"
 #include "ggml-cuda.h"
 
+#include <cstdint>
 #include <memory>
 
 #if defined(GGML_USE_HIPBLAS)
@@ -268,30 +269,15 @@ static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigne
     return c;
 }
 
-static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
-    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
-    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
-#elif defined(__gfx1010__) || defined(__gfx900__)
-    int tmp1;
-    int tmp2;
-    asm("\n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        "
-        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
-        : "v"(a), "v"(b)
-    );
-#else
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
-#endif
+static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
+    unsigned int c;
+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
+#pragma unroll
+    for (int i = 0; i < 4; ++i) {
+        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
+    }
     return c;
 }
@@ -467,8 +453,48 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
 }
 #endif // CUDART_VERSION < 12000
 
+static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(RDNA3)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+
+#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A
+    return __dp4a(a, b, c);
+#else // __CUDA_ARCH__ >= MIN_CC_DP4A
+    const int8_t * a8 = (const int8_t *) &a;
+    const int8_t * b8 = (const int8_t *) &b;
+    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+}
+
 // TODO: move to ggml-common.h
-static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
+static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
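Note: ggml_cuda_dp4a above wraps hardware dp4a with a portable fallback. For reference, a plain host-side C++ sketch of the same semantics (dp4a_ref is illustrative; it assumes 32-bit int holding four signed bytes):

// Host-side reference: c + sum_{i=0..3} a8[i]*b8[i], bytes taken from the int lanes.
#include <cstdint>
#include <cstring>
int dp4a_ref(int a, int b, int c) {
    int8_t a8[4], b8[4];
    std::memcpy(a8, &a, 4);   // reinterpret the 32-bit value as 4 signed bytes
    std::memcpy(b8, &b, 4);
    for (int i = 0; i < 4; ++i) {
        c += int(a8[i]) * int(b8[i]);
    }
    return c;
}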
@@ -487,4 +487,3 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         GGML_ASSERT(false);
     }
 }
-
@@ -54,12 +54,11 @@ typedef float (*vec_dot_KQ_f32_t)(
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
 
     const block_q4_0 * K_q4_0 = (const block_q4_0 *) K_c;
     GGML_UNUSED(Q_v);
 
-    half sum = 0.0f;
+    T sum = 0.0f;
 
 #pragma unroll
     for (int k_KQ_0 = 0; k_KQ_0 < D/sizeof(int); k_KQ_0 += WARP_SIZE) {
@@ -72,7 +71,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
         const int v = (get_int_from_uint8(K_q4_0[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
         const int u = Q_q8[k_KQ_0/WARP_SIZE];
 
-        const int sumi = __dp4a(v, u, 0);
+        const int sumi = ggml_cuda_dp4a(v, u, 0);
 
 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -90,19 +89,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_0(
     }
 
     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
 
     const block_q4_1 * K_q4_1 = (const block_q4_1 *) K_c;
     GGML_UNUSED(Q_v);
@@ -120,7 +111,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
         const int v = (get_int_from_uint8_aligned(K_q4_1[ib].qs, iqs4) >> shift) & 0x0F0F0F0F;
         const int u = Q_q8[k_KQ_0/WARP_SIZE];
 
-        const int sumi = __dp4a(v, u, 0);
+        const int sumi = ggml_cuda_dp4a(v, u, 0);
 
 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -142,19 +133,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q4_1(
     }
 
     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
 
     const block_q5_0 * K_q5_0 = (const block_q5_0 *) K_c;
     GGML_UNUSED(Q_v);
@@ -179,7 +162,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
 
         const int u = Q_q8[k_KQ_0/WARP_SIZE];
 
-        const int sumi = __dp4a(v, u, 0);
+        const int sumi = ggml_cuda_dp4a(v, u, 0);
 
 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -197,19 +180,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_0(
     }
 
     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template<typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
 
     const block_q5_1 * K_q5_1 = (const block_q5_1 *) K_c;
     GGML_UNUSED(Q_v);
@@ -234,7 +209,7 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
 
         const int u = Q_q8[k_KQ_0/WARP_SIZE];
 
-        const int sumi = __dp4a(v, u, 0);
+        const int sumi = ggml_cuda_dp4a(v, u, 0);
 
 #ifdef FP16_AVAILABLE
         if (std::is_same<T, half>::value) {
@@ -256,19 +231,11 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q5_1(
     }
 
     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template <typename T, int D>
 static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
     const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8, const void * __restrict__ Q_ds_v) {
-#if __CUDA_ARCH__ >= MIN_CC_DP4A
 
     const block_q8_0 * K_q8_0 = (const block_q8_0 *) K_c;
     GGML_UNUSED(Q_v);
@@ -297,13 +264,6 @@ static __device__ __forceinline__ T vec_dot_fattn_vec_KQ_q8_0(
     }
 
     return sum;
-#else
-    GGML_UNUSED(K_c);
-    GGML_UNUSED(Q_v);
-    GGML_UNUSED(Q_q8);
-    GGML_UNUSED(Q_ds_v);
-    NO_DEVICE_CODE;
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template <typename T, int D>
@@ -2475,7 +2475,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 
         const dim3 block_nums_mmq(nsm, 1, 1);
 
-        ggml_cuda_pool & pool = ctx.pool();
+        ggml_cuda_pool & pool = ctx.pool(id);
         ggml_cuda_pool_alloc<float> tmp_fixup(pool, block_nums_mmq.x * mmq_x*mmq_y);
 
         if (args.ne01 % mmq_y == 0) {
@@ -28,16 +28,22 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type)
 
 static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
     return type == GGML_TYPE_Q4_0    ? VDR_Q4_0_Q8_1_MMVQ :
        type == GGML_TYPE_Q4_1        ? VDR_Q4_1_Q8_1_MMVQ :
        type == GGML_TYPE_Q5_0        ? VDR_Q5_0_Q8_1_MMVQ :
        type == GGML_TYPE_Q5_1        ? VDR_Q5_1_Q8_1_MMVQ :
        type == GGML_TYPE_Q8_0        ? VDR_Q8_0_Q8_1_MMVQ :
        type == GGML_TYPE_Q2_K        ? VDR_Q2_K_Q8_1_MMVQ :
        type == GGML_TYPE_Q3_K        ? VDR_Q3_K_Q8_1_MMVQ :
        type == GGML_TYPE_Q4_K        ? VDR_Q4_K_Q8_1_MMVQ :
        type == GGML_TYPE_Q5_K        ? VDR_Q5_K_Q8_1_MMVQ :
        type == GGML_TYPE_Q6_K        ? VDR_Q6_K_Q8_1_MMVQ :
-       type == GGML_TYPE_IQ4_NL      ? VDR_Q4_K_Q8_1_MMVQ :
+       type == GGML_TYPE_IQ2_XXS     ? VDR_IQ2_XXS_Q8_1_MMVQ :
+       type == GGML_TYPE_IQ2_XS      ? VDR_IQ2_XS_Q8_1_MMVQ :
+       type == GGML_TYPE_IQ2_S       ? VDR_IQ2_S_Q8_1_MMVQ :
+       type == GGML_TYPE_IQ3_XXS     ? VDR_IQ3_XXS_Q8_1_MMVQ :
+       type == GGML_TYPE_IQ3_S       ? VDR_IQ3_S_Q8_1_MMVQ :
+       type == GGML_TYPE_IQ4_NL      ? VDR_IQ4_NL_Q8_1_MMVQ :
+       type == GGML_TYPE_IQ4_XS      ? VDR_IQ4_XS_Q8_1_MMVQ :
        1;
 }
 
File diff suppressed because it is too large
@@ -6537,4 +6537,3 @@ template [[host_name("kernel_mul_mv_id_iq3_s_f32")]] kernel kernel_mul_mv_id_t
 template [[host_name("kernel_mul_mv_id_iq2_s_f32")]]  kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
 template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
 template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
-
@@ -130,4 +130,3 @@ void iq3xs_free_impl(int grid_size);
 #ifdef __cplusplus
 }
 #endif
-
@@ -74,51 +74,6 @@ typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const gg
                                        const float *src1_dd, float *dst_dd,
                                        const queue_ptr &main_stream);
 
-static __dpct_inline__ float warp_reduce_sum(float x,
-                                             const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1096:98: The right-most dimension of the work-group used in the SYCL
-        kernel that calls this function may be less than "32". The function
-        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
-        CPU device. Modify the size of the work-group to ensure that the value
-        of the right-most dimension is a multiple of "32".
-        */
-        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
-    }
-    return x;
-}
-
-static __dpct_inline__ sycl::float2
-warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
-                                                mask);
-        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
-                                                mask);
-    }
-    return a;
-}
-
-static __dpct_inline__ float warp_reduce_max(float x,
-                                             const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1096:97: The right-most dimension of the work-group used in the SYCL
-        kernel that calls this function may be less than "32". The function
-        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
-        CPU device. Modify the size of the work-group to ensure that the value
-        of the right-most dimension is a multiple of "32".
-        */
-        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
-                              item_ct1.get_sub_group(), x, mask));
-    }
-    return x;
-}
-
 static __dpct_inline__ float op_repeat(const float a, const float b) {
     return b;
     GGML_UNUSED(a);
@@ -336,47 +291,6 @@ static void sqr_f32(const float * x, float * dst, const int k,
     dst[i] = x[i] * x[i];
 }
 
-static void norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                     const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum, int block_size) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    const int tid = item_ct1.get_local_id(2);
-
-    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        mean_var.x() += xi;
-        mean_var.y() += xi * xi;
-    }
-
-    // sum up partial sums
-    mean_var = warp_reduce_sum(mean_var, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = mean_var;
-        }
-        /*
-        DPCT1118:0: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        mean_var = s_sum[lane_id];
-        mean_var = warp_reduce_sum(mean_var, item_ct1);
-    }
-
-    const float mean = mean_var.x() / ncols;
-    const float var = mean_var.y() / ncols - mean * mean;
-    const float inv_std = sycl::rsqrt(var + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-    }
-}
-
 static void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02,
                        const sycl::nd_item<3> &item_ct1) {
     int nidx = item_ct1.get_local_id(2) +
@@ -444,126 +358,11 @@ static void pad_f32(const float *x, float *dst, const int ne0, const int ne00,
     }
 }
 
-static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps,
-                           const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
-    int start = item_ct1.get_group(2) * group_size;
-    int end = start + group_size;
-
-    start += item_ct1.get_local_id(2);
-
-    if (end >= ne_elements) {
-        end = ne_elements;
-    }
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int j = start; j < end; j += block_size) {
-        tmp += x[j];
-    }
-
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:1: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    float mean = tmp / group_size;
-    tmp = 0.0f;
-
-    for (int j = start; j < end; j += block_size) {
-        float xi = x[j] - mean;
-        dst[j] = xi;
-        tmp += xi * xi;
-    }
-
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:2: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    float variance = tmp / group_size;
-    float scale = sycl::rsqrt(variance + eps);
-    for (int j = start; j < end; j += block_size) {
-        dst[j] *= scale;
-    }
-}
-
-static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                         const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    const int tid = item_ct1.get_local_id(2);
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        tmp += xi * xi;
-    }
-
-    // sum up partial sums
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:3: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    const float mean = tmp / ncols;
-    const float scale = sycl::rsqrt(mean + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = scale * x[row*ncols + col];
-    }
-}
-
+template<int QUANT_BLOCK_TILE>
 static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
                           const sycl::nd_item<3> &item_ct1) {
-    const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                   item_ct1.get_local_id(2);
+    const int ix = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2)) * QUANT_BLOCK_TILE;
 
     if (ix >= kx_padded) {
         return;
@@ -578,23 +377,39 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
 
     const int ib = i_padded / QK8_1; // block index
     const int iqs = i_padded % QK8_1; // quant index
-    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
-    float amax = sycl::fabs((float)xi);
-    float sum = xi;
-
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor(
-                                    item_ct1.get_sub_group(), amax, mask));
-        sum +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask);
-    }
+    typedef sycl::vec<float, QUANT_BLOCK_TILE> TC;
+    typedef sycl::vec<int8_t, QUANT_BLOCK_TILE> TQ;
+    TC zeros;
+    TQ qzeros;
+#pragma unroll
+    for (int i = 0; i < QUANT_BLOCK_TILE; i++)
+    {
+        zeros[i] = 0.f;
+        qzeros[i] = 0;
+    }
+    const TC xi = ix < kx ? *(TC *)&x[iy * kx + ix] : zeros;
+    float sum = xi[0];
+    float amax = sycl::fabs(xi[0]);
+#pragma unroll
+    for (int i = 1; i < QUANT_BLOCK_TILE; i++)
+    {
+        sum += xi[i];
+        amax = sycl::fmax(sycl::fabs(xi[i]), amax);
+    }
+    sum = warp_reduce_sum(sum, item_ct1);
+    amax = warp_reduce_max(amax, item_ct1);
 
     const float d = amax / 127;
-    const int8_t q = amax == 0.0f ? 0 : sycl::round(xi / d);
+    TQ q = qzeros;
+    if (amax != 0.0f)
+    {
+#pragma unroll
+        for (int i = 0; i < QUANT_BLOCK_TILE; i++) {
+            q[i] = sycl::round(xi[i] / d);
+        }
+    }
 
-    y[ib].qs[iqs] = q;
+    *(TQ *)&y[ib].qs[iqs] = q;
 
     if (iqs > 0) {
         return;
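Note: the vectorized quantize_q8_1 above applies the same per-value rule as the scalar original. A host-side C++ reference of that rule (a sketch; QK8_1 block bookkeeping and the sum field are omitted):

// Q8_1-style quantization of one block: scale by amax/127, round each value.
#include <cmath>
#include <cstdint>
void quantize_block_q8_1_ref(const float * x, int n, int8_t * q, float & d) {
    float amax = 0.0f;
    for (int i = 0; i < n; ++i) {
        amax = std::fmax(amax, std::fabs(x[i]));   // absolute max of the block
    }
    d = amax / 127.0f;                             // per-block scale
    for (int i = 0; i < n; ++i) {
        q[i] = amax == 0.0f ? 0 : (int8_t) std::round(x[i] / d);
    }
}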
@@ -728,7 +543,7 @@ static void mul_mat_p021_f16_f32(
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -781,7 +596,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
    }
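Note: starting the XOR butterfly at WARP_SIZE / 2 instead of a hard-coded 16 keeps the reduction correct for any power-of-two sub-group width. A host-side C++ sketch of the same butterfly, with an array standing in for the lanes of one warp:

// Simulated XOR-shuffle reduction: after log2(W) rounds every lane holds the total.
#include <cstdio>
int main() {
    const int W = 32;                                 // stand-in for WARP_SIZE
    float lane[W];
    for (int i = 0; i < W; ++i) lane[i] = 1.0f;       // each lane contributes 1
    for (int mask = W / 2; mask > 0; mask >>= 1) {
        float next[W];
        for (int i = 0; i < W; ++i) {
            next[i] = lane[i] + lane[i ^ mask];       // partner lane chosen via XOR
        }
        for (int i = 0; i < W; ++i) lane[i] = next[i];
    }
    printf("lane 0 sum = %.1f\n", lane[0]);           // prints 32.0
    return 0;
}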
@@ -978,114 +793,6 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne,
     cpy_blck(cx + x_offset, cdst + dst_offset);
 }

-static float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low);
-    return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y));
-}
-
-struct rope_corr_dims {
-    float v[4];
-};
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static void rope_yarn(
-    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale);
-    }
-    *cos_theta = sycl::cos(theta) * mscale;
-    *sin_theta = sycl::sin(theta) * mscale;
-}
-
-// rope == RoPE == rotary positional embedding
-template<typename T, bool has_pos>
-static void rope(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
-    ,
-    const sycl::nd_item<3> &item_ct1) {
-    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                         item_ct1.get_local_id(1));
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + 1];
-
-    dst[i + 0] = x0*cos_theta - x1*sin_theta;
-    dst[i + 1] = x0*sin_theta + x1*cos_theta;
-}
-
-template<typename T, bool has_pos, bool has_freq_facs>
-static void rope_neox(
-    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims,
-    const float * freq_factors, const sycl::nd_item<3> &item_ct1) {
-    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                         item_ct1.get_local_id(1));
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int ib = col / n_dims;
-    const int ic = col % n_dims;
-
-    if (ib > 0) {
-        const int i = row*ncols + ib*n_dims + ic;
-
-        dst[i + 0] = x[i + 0];
-        dst[i + 1] = x[i + 1];
-
-        return;
-    }
-
-    const int i = row*ncols + ib*n_dims + ic/2;
-    const int i2 = row/p_delta_rows;
-
-    float cur_rot = inv_ndims * ic - ib;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;
-
-    const float theta_base =
-        p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor;
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + n_dims/2];
-
-    dst[i + 0] = x0*cos_theta - x1*sin_theta;
-    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
-}
-
 static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
                            const sycl::nd_item<3> &item_ct1) {
     const int row = item_ct1.get_group(1);
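For the record, the removed rope_yarn helper (the RoPE kernels move to their own translation unit in this commit; see the rope.hpp include added further down) evaluates, with s = freq_scale, eps = ext_factor, and m = mscale:

$$y = \frac{i_0/2 - \text{low}}{\max(0.001,\ \text{high} - \text{low})}, \qquad r = 1 - \min(1, \max(0, y))$$

$$\theta_{\text{interp}} = s\,\theta_{\text{extrap}}, \qquad \theta = (1 - r\,\varepsilon)\,\theta_{\text{interp}} + r\,\varepsilon\,\theta_{\text{extrap}}$$

$$m' = \begin{cases} m\,\bigl(1 + 0.1\,\ln(1/s)\bigr) & \varepsilon \neq 0 \\ m & \text{otherwise} \end{cases}$$

and returns the rotation pair $(\cos\theta \cdot m',\ \sin\theta \cdot m')$.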
@@ -1751,99 +1458,6 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
         });
 }

-static void norm_f32_sycl(const float *x, float *dst, const int ncols,
-                          const int nrows, const float eps,
-                          queue_ptr stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
-                sycl::range<1>(32), cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        norm_f32(x, dst, ncols, eps, item_ct1,
-                                 s_sum_acc_ct1.get_pointer(), WARP_SIZE);
-                    });
-        });
-    } else {
-        const int work_group_size = get_work_group_size(stream->get_device());
-        const sycl::range<3> block_dims(1, 1, work_group_size);
-        /*
-        DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
-                sycl::range<1>(32), cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        norm_f32(x, dst, ncols, eps, item_ct1,
-                                 s_sum_acc_ct1.get_pointer(), work_group_size);
-                    });
-        });
-    }
-}
-
-static void group_norm_f32_sycl(const float *x, float *dst,
-                                const int num_groups, const int group_size,
-                                const int ne_elements, queue_ptr stream) {
-    static const float eps = 1e-6f;
-    if (group_size < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            const float eps_ct4 = eps;
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        group_norm_f32(
-                            x, dst, group_size, ne_elements, eps_ct4, item_ct1,
-                            s_sum_acc_ct1.get_pointer(), WARP_SIZE);
-                    });
-        });
-    } else {
-        const int work_group_size = get_work_group_size(stream->get_device());
-        const sycl::range<3> block_dims(1, 1, work_group_size);
-        /*
-        DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            const float eps_ct4 = eps;
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        group_norm_f32(x, dst, group_size, ne_elements,
-                                       eps_ct4, item_ct1,
-                                       s_sum_acc_ct1.get_pointer(), work_group_size);
-                    });
-        });
-    }
-}
-
 static void concat_f32_sycl(const float *x, const float *y, float *dst,
                             const int ne0, int ne1, int ne2, int ne02,
                             queue_ptr stream) {
@@ -1885,64 +1499,22 @@ static void pad_f32_sycl(const float *x, float *dst, const int ne00,
         });
 }

-static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
-                              const int nrows, const float eps,
-                              queue_ptr stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
-    if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        rms_norm_f32(x, dst, ncols, eps, item_ct1,
-                                     s_sum_acc_ct1.get_pointer(), WARP_SIZE);
-                    });
-        });
-    } else {
-        const int work_group_size = get_work_group_size(stream->get_device());
-        const sycl::range<3> block_dims(1, 1, work_group_size);
-        /*
-        DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        rms_norm_f32(x, dst, ncols, eps, item_ct1,
-                                     s_sum_acc_ct1.get_pointer(), work_group_size);
-                    });
-        });
-    }
-}
-
 static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                    const int ky, const int kx_padded,
                                    queue_ptr stream) {
     const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
     const sycl::range<3> num_blocks(1, ky, block_num_x);
-    const sycl::range<3> block_size(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE);
+    int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
+    static_assert(QK8_1 % WARP_SIZE == 0);
+    const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});

         stream->parallel_for(
             sycl::nd_range<3>(num_blocks * block_size, block_size),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                quantize_q8_1(x, vy, kx, kx_padded, item_ct1);
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
             });
     }
 }
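Worked sizing example for the new launch, assuming the usual ggml constants QK8_1 = 32 and SYCL_QUANTIZE_BLOCK_SIZE = 256 (check the tree for the authoritative values):

$$\text{QUANT\_BLOCK\_TILE} = \frac{QK8\_1}{\text{WARP\_SIZE}} = \frac{32}{16} = 2 \quad (\text{WARP\_SIZE} = 16)$$

$$\text{threads per work-group} = \frac{256}{2} = 128, \quad \text{each thread quantizing a 2-wide tile}$$

whereas with WARP_SIZE = 32 the tile is 1 and the launch shape is unchanged from before.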
@@ -1962,7 +1534,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,

         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                 mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
                                      nchannels_y, item_ct1);
             });

@@ -1982,7 +1554,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(

         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                 mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
                                        row_stride_x, channel_stride_x,
                                        nchannels_y / nchannels_x, item_ct1);
@@ -2241,117 +1813,13 @@ static void clamp_f32_sycl(const float *x, float *dst, const float min,
         });
 }

-template <typename T>
-static void rope_sycl(const T *x, T *dst, int ncols, int nrows,
-                      const int32_t *pos, float freq_scale, int p_delta_rows,
-                      float freq_base, float ext_factor, float attn_factor,
-                      rope_corr_dims corr_dims, queue_ptr stream) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
-    if (pos == nullptr) {
-        /*
-        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope<T, false>(x, dst, ncols, pos, freq_scale, p_delta_rows,
-                               freq_base, ext_factor, attn_factor, corr_dims,
-                               item_ct1);
-            });
-    } else {
-        /*
-        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope<T, true>(x, dst, ncols, pos, freq_scale, p_delta_rows,
-                              freq_base, ext_factor, attn_factor, corr_dims,
-                              item_ct1);
-            });
-    }
-}
-
-template <typename T>
-static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
-                           const int32_t *pos, float freq_scale,
-                           int p_delta_rows, float freq_base, float ext_factor,
-                           float attn_factor, rope_corr_dims corr_dims,
-                           const float * freq_factors, queue_ptr stream) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.0f / n_dims;
-
-    if (pos == nullptr) {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        if (freq_factors == nullptr) {
-            stream->parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    rope_neox<T, false, false>(x, dst, ncols, n_dims, pos, freq_scale,
-                                               p_delta_rows, ext_factor, attn_factor,
-                                               corr_dims, theta_scale, inv_ndims, freq_factors,
-                                               item_ct1);
-                });
-        } else {
-            stream->parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    rope_neox<T, false, true>(x, dst, ncols, n_dims, pos, freq_scale,
-                                              p_delta_rows, ext_factor, attn_factor,
-                                              corr_dims, theta_scale, inv_ndims, freq_factors,
-                                              item_ct1);
-                });
-        }
-    } else {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        if (freq_factors == nullptr) {
-            stream->parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    rope_neox<T, true, false>(x, dst, ncols, n_dims, pos, freq_scale,
-                                              p_delta_rows, ext_factor, attn_factor,
-                                              corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
-                });
-        } else {
-            stream->parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    rope_neox<T, true, true>(x, dst, ncols, n_dims, pos, freq_scale,
-                                             p_delta_rows, ext_factor, attn_factor,
-                                             corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
-                });
-        }
-    }
-}
-
 static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
                               const int nrows, queue_ptr stream) {
     const sycl::range<3> block_dims(1, 1, WARP_SIZE);
     const sycl::range<3> block_nums(1, nrows, 1);
     stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                          [=](sycl::nd_item<3> item_ct1)
-                             [[intel::reqd_sub_group_size(32)]] {
+                             [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                                  k_sum_rows_f32(x, dst, ncols, item_ct1);
                              });
 }
@@ -2432,7 +1900,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, float *

             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                     soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
                                                                                  nrows_y, scale, max_bias, m0,
                                                                                  m1, n_head_log2, item_ct1,
@@ -2612,12 +2080,6 @@ static inline int get_sycl_env(const char *env_name, int default_val) {
     return user_number;
 }

-static inline int get_work_group_size(const sycl::device& device) {
-    dpct::device_info prop;
-    dpct::get_device_info(prop, device);
-    return prop.get_max_work_group_size();
-}
-
 static void ggml_check_sycl() try {
     static bool initialized = false;
@@ -3176,45 +2638,6 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor
     (void) src1_dd;
 }

-inline void ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_sycl_op_group_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int num_groups = dst->op_params[0];
-    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
 inline void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_dd, const float *src1_dd,
@@ -3278,28 +2701,6 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor
     (void) src1_dd;
 }

-inline void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
 static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
     int64_t min_compute_capability = INT_MAX;
     int64_t max_compute_capability = INT_MIN;
@@ -3461,97 +2862,6 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const queue_ptr &main_stream) {
-    const ggml_tensor * src2 = dst->src[2];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode = ((int32_t *) dst->op_params)[2];
-    //const int n_ctx = ((int32_t *) dst->op_params)[3];
-    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
-
-    // RoPE alteration for extended context
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-    const float * freq_factors = nullptr;
-    const int32_t * pos = nullptr;
-    if ((mode & 1) == 0) {
-        GGML_ASSERT(src1->type == GGML_TYPE_I32);
-        GGML_ASSERT(src1->ne[0] == ne2);
-        pos = (const int32_t *) src1_dd;
-    }
-
-    const bool is_neox = mode & 2;
-
-#pragma message("TODO: update rope NORM mode to match NEOX mode")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7634")
-
-    if (is_neox) {
-        pos = (const int32_t *) src1_dd;
-
-        if (src2 != nullptr) {
-            freq_factors = (const float *) src2->data;
-        }
-    } else {
-        GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
-    }
-
-    rope_corr_dims corr_dims;
-    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
-
-    // compute
-    if (is_neox) {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_neox_sycl(
-                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, freq_factors, main_stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
-                           ne00, n_dims, nrows, pos, freq_scale, ne01,
-                           freq_base, ext_factor, attn_factor, corr_dims,
-                           freq_factors, main_stream);
-        } else {
-            GGML_ASSERT(false);
-        }
-    } else {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_sycl(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
-                      nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                      attn_factor, corr_dims, main_stream);
-        } else {
-            GGML_ASSERT(false);
-        }
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
 static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_dd, const float *src1_dd,
@@ -4576,7 +3886,6 @@ bool ggml_sycl_supports_dmmv(enum ggml_type type) {

 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
-
     int64_t min_compute_capability = INT_MAX;

     if (split) {
@@ -6241,7 +5550,9 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
         case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
+            return true;
         case GGML_OP_ROPE:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
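The ROPE case is now gated on a contiguous src0. As a rough sketch of what contiguity amounts to for a plain float tensor (illustrative only; the real ggml_is_contiguous also accounts for block-quantized types):

#include <cstdint>
#include <cstddef>

// Packed row-major stride check: each stride must equal the previous
// stride times the previous dimension's extent.
struct tensor4d {
    int64_t ne[4];      // elements per dimension
    size_t  nb[4];      // byte strides
    size_t  type_size;  // bytes per element
};

static bool is_contiguous(const tensor4d & t) {
    return t.nb[0] == t.type_size &&
           t.nb[1] == t.nb[0] * (size_t) t.ne[0] &&
           t.nb[2] == t.nb[1] * (size_t) t.ne[1] &&
           t.nb[3] == t.nb[2] * (size_t) t.ne[2];
}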
@@ -19,5 +19,7 @@
 #include "dmmv.hpp"
 #include "mmq.hpp"
 #include "mmvq.hpp"
+#include "rope.hpp"
+#include "norm.hpp"

 #endif // GGML_SYCL_BACKEND_HPP
@@ -295,5 +295,60 @@ struct ggml_backend_sycl_context {
     }
 };

+// common host functions
+
+static inline int get_work_group_size(const sycl::device& device) {
+    dpct::device_info prop;
+    dpct::get_device_info(prop, device);
+    return prop.get_max_work_group_size();
+}
+
+
+// common device functions
+
+static __dpct_inline__ float warp_reduce_sum(float x,
+                                             const sycl::nd_item<3>& item_ct1) {
+#pragma unroll
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        /*
+        DPCT1096:98: The right-most dimension of the work-group used in the SYCL
+        kernel that calls this function may be less than "32". The function
+        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
+        CPU device. Modify the size of the work-group to ensure that the value
+        of the right-most dimension is a multiple of "32".
+        */
+        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
+    }
+    return x;
+}
+
+static __dpct_inline__ sycl::float2
+warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) {
+#pragma unroll
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
+                                                mask);
+        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
+                                                mask);
+    }
+    return a;
+}
+
+static __dpct_inline__ float warp_reduce_max(float x,
+                                             const sycl::nd_item<3>& item_ct1) {
+#pragma unroll
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+        /*
+        DPCT1096:97: The right-most dimension of the work-group used in the SYCL
+        kernel that calls this function may be less than "32". The function
+        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
+        CPU device. Modify the size of the work-group to ensure that the value
+        of the right-most dimension is a multiple of "32".
+        */
+        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
+                              item_ct1.get_sub_group(), x, mask));
+    }
+    return x;
+}
+
 #endif // GGML_SYCL_COMMON_HPP
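A hypothetical launcher skeleton (launch_rowwise is illustrative, not part of the tree) showing how the shared helpers above are typically combined, mirroring the norm launchers elsewhere in the backend: narrow rows run one sub-group per row, wider rows fall back to the device's maximum work-group size:

static void launch_rowwise(queue_ptr stream, const float * x, float * dst,
                           const int ncols, const int nrows) {
    // One sub-group per row when a single sweep covers the row cheaply,
    // otherwise let the device decide how wide a work-group can be.
    const int wg = ncols < 1024 ? WARP_SIZE
                                : get_work_group_size(stream->get_device());
    const sycl::range<3> block_dims(1, 1, wg);
    stream->parallel_for(
        sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) {
            // row-wise kernel body would go here, ending in a
            // warp_reduce_sum / warp_reduce_max pass over the partials
            (void) x; (void) dst; (void) ncols;
        });
}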
@@ -76,7 +76,7 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -104,7 +104,7 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,

     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
            dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
                                                      nrows, item_ct1);
        });
@@ -227,7 +227,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -346,7 +346,7 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -499,7 +499,7 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -633,7 +633,7 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -748,7 +748,7 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -774,7 +774,7 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,

         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                 dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
                     vx, y, dst, ncols, nrows, item_ct1);
             });

@@ -795,7 +795,7 @@ static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,

         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                 dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
                     vx, y, dst, ncols, nrows, item_ct1);
             });

@@ -816,7 +816,7 @@ static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y,

         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                 dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
                     vx, y, dst, ncols, nrows, item_ct1);
             });

@@ -837,7 +837,7 @@ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,

         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                 dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
                     vx, y, dst, ncols, nrows, item_ct1);
             });

@@ -858,7 +858,7 @@ static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,

         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                 dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
                     vx, y, dst, ncols, nrows, item_ct1);
             });
@@ -873,10 +873,10 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
     const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
     const int block_num_y = (nrows + ny - 1) / ny;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
+    const sycl::range<3> block_dims(1, ny, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
             dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
         });
 }

@@ -889,10 +889,10 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
+    const sycl::range<3> block_dims(1, ny, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
             dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
         });
 }

@@ -905,10 +905,10 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
+    const sycl::range<3> block_dims(1, ny, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
             dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
         });
 }

@@ -918,10 +918,10 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
                                              const int nrows,
                                              dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const sycl::range<3> block_dims(1, 1, 32);
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
             dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
         });
 }

@@ -934,10 +934,10 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
+    const sycl::range<3> block_dims(1, ny, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
             dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
         });
 }
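For context on the launch shapes above: the k-quant launchers derive their rows-per-work-group from the compile-time K_QUANTS_PER_ITERATION, which, if memory serves, ggml fixes at 1 or 2:

$$n_y = \frac{2}{\text{K\_QUANTS\_PER\_ITERATION}} \in \{1, 2\}, \qquad \text{blocks}_y = \frac{n_{\text{rows}} + n_y - 1}{n_y}$$

so each work-group of shape $(1,\ n_y,\ \text{WARP\_SIZE})$ covers $n_y$ rows with one sub-group per row, and the hard-coded 32 in the third dimension had silently assumed a 32-wide sub-group.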
@@ -37,7 +37,7 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -85,7 +85,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx,

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -133,7 +133,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx,

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -181,7 +181,7 @@ static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx,

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -229,7 +229,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx,

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -277,7 +277,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx,

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -325,7 +325,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx,

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -373,7 +373,7 @@ static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx,

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -421,7 +421,7 @@ static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx,

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }

@@ -470,7 +470,7 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,

     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -495,7 +495,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
                                       VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);

@@ -519,7 +519,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
                                       VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);

@@ -543,7 +543,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
                                       VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);

@@ -567,7 +567,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
                                       VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);

@@ -591,7 +591,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
                                       VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);

@@ -615,7 +615,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
                                       VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);

@@ -639,7 +639,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
                                       VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);

@@ -663,7 +663,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
                                       VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);

@@ -687,7 +687,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
                                       VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);

@@ -711,7 +711,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
+                    [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                         mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
                                       VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
                             vx, vy, dst, ncols, nrows, item_ct1);
@ -734,8 +734,8 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
|
||||||
cgh.parallel_for(
|
cgh.parallel_for(
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS, block_iq2_xxs, 1>(
|
mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -759,8 +759,8 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
|
||||||
cgh.parallel_for(
|
cgh.parallel_for(
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS, block_iq2_xs, 1>(
|
mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -784,8 +784,8 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
|
||||||
cgh.parallel_for(
|
cgh.parallel_for(
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S, block_iq2_s, 1>(
|
mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -809,8 +809,8 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
|
||||||
cgh.parallel_for(
|
cgh.parallel_for(
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS, block_iq3_xxs, 1>(
|
mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -833,8 +833,8 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
|
||||||
cgh.parallel_for(
|
cgh.parallel_for(
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_XS, block_iq3_s, 1>(
|
mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -858,7 +858,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
|
||||||
cgh.parallel_for(
|
cgh.parallel_for(
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
|
mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
|
@ -879,7 +879,7 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
|
||||||
cgh.parallel_for(
|
cgh.parallel_for(
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
|
mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
|
@ -901,7 +901,7 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
|
||||||
cgh.parallel_for(
|
cgh.parallel_for(
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>(
|
mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
|
@ -923,8 +923,8 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
|
||||||
cgh.parallel_for(
|
cgh.parallel_for(
|
||||||
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||||
[=](sycl::nd_item<3> item_ct1)
|
[=](sycl::nd_item<3> item_ct1)
|
||||||
[[intel::reqd_sub_group_size(32)]] {
|
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
|
||||||
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS, block_iq4_xs, 1>(
|
mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
|
||||||
vx, vy, dst, ncols, nrows, item_ct1);
|
vx, vy, dst, ncols, nrows, item_ct1);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
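Every hunk above makes the same substitution: the hard-coded sub-group size of 32 becomes the WARP_SIZE macro, so one #define can retarget all kernels at once (see the `#define WARP_SIZE GGML_SYCL_WARP_SIZE` hunk further down). A minimal, self-contained sketch of how the attribute constrains a SYCL kernel, assuming a standalone queue and a locally defined WARP_SIZE (both illustrative, not part of this patch):

    // sketch: [[intel::reqd_sub_group_size(N)]] requires a compile-time constant,
    // which is why a macro rather than a runtime value is used in the kernels above
    #include <sycl/sycl.hpp>

    #ifndef WARP_SIZE
    #define WARP_SIZE 32 // assumed default; the real value comes from the backend headers
    #endif

    int main() {
        sycl::queue q;
        q.parallel_for(
            sycl::nd_range<1>(sycl::range<1>(256), sycl::range<1>(WARP_SIZE)),
            [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
                // each sub-group now has exactly WARP_SIZE work-items
                (void) it;
            });
        q.wait();
        return 0;
    }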
@@ -936,7 +936,7 @@ void ggml_sycl_op_mul_mat_vec_q(
     const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
     const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
     float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const int64_t src1_ncols, const int64_t src1_padded_col_size,
     const dpct::queue_ptr &stream) {
 
     const int64_t ne10 = src1->ne[0];
@@ -948,77 +948,80 @@ void ggml_sycl_op_mul_mat_vec_q(
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
+    const size_t q8_1_ts = sizeof(block_q8_1);
+    const size_t q8_1_bs = QK8_1;
     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the kernel writes into
     const int64_t nrows_dst = id == ctx.device ? ne00 : row_diff;
-    switch (src0->type) {
+    for (int i = 0; i < src1_ncols; i++)
+    {
+        const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
+        const char* src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset;
+        float* dst_dd_i_bs = dst_dd_i + i * dst->ne[0];
+        switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q4_1:
-            mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q5_0:
-            mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q5_1:
-            mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_Q8_0:
-            mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_Q2_K:
-            mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_Q3_K:
-            mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_Q5_K:
-            mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_Q6_K:
-            mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_IQ1_S:
-            mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_IQ1_M:
-            mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_IQ2_XXS:
-            mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_IQ2_XS:
-            mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_IQ2_S:
-            mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_IQ3_XXS:
-            mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_IQ3_S:
-            mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_IQ4_NL:
-            mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
        case GGML_TYPE_IQ4_XS:
-            mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
             break;
         default:
             GGML_ASSERT(false);
             break;
+        }
     }
 
     (void) src1;
     (void) dst;
     (void) src1_ddf_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
 }
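The rewritten ggml_sycl_op_mul_mat_vec_q now launches one matrix-vector product per column of src1 instead of assuming a single column; the byte offset of column i inside the quantized buffer follows from the q8_1 block layout. A small arithmetic sketch with assumed values (QK8_1 = 32 values per block, sizeof(block_q8_1) = 36 bytes, i.e. two fp16 fields plus 32 int8 quants):

    // sketch of the offset computation used in the loop above (values assumed)
    #include <cstdio>
    #include <cstddef>

    int main() {
        const size_t q8_1_ts = 36;                 // sizeof(block_q8_1), assumed layout
        const size_t q8_1_bs = 32;                 // QK8_1: values per block
        const size_t src1_padded_col_size = 4096;  // example padded column length
        for (int i = 0; i < 3; i++) {
            // column i starts this many bytes into src1_ddq_i
            const size_t off = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
            printf("col %d -> %zu bytes\n", i, off); // prints 0, 4608, 9216
        }
        return 0;
    }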
diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp
new file mode 100644
--- /dev/null
+++ b/ggml/src/ggml-sycl/norm.cpp
@@ -0,0 +1,370 @@
+#include "norm.hpp"
+
+static void norm_f32(const float* x, float* dst, const int ncols, const float eps,
+    const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+        item_ct1.get_local_id(1);
+    const int tid = item_ct1.get_local_id(2);
+
+    const int nthreads = item_ct1.get_local_range(2);
+    const int nwarps = nthreads / WARP_SIZE;
+    assert(nwarps % WARP_SIZE == 0);
+    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row * ncols + col];
+        mean_var.x() += xi;
+        mean_var.y() += xi * xi;
+    }
+
+    // sum up partial sums
+    mean_var = warp_reduce_sum(mean_var, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        /*
+        DPCT1118:0: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        mean_var = 0.f;
+        int nreduce = nwarps / WARP_SIZE;
+        for (size_t i = 0; i < nreduce; i += 1)
+        {
+            mean_var += s_sum[lane_id + i * WARP_SIZE];
+        }
+        mean_var = warp_reduce_sum(mean_var, item_ct1);
+    }
+
+    const float mean = mean_var.x() / ncols;
+    const float var = mean_var.y() / ncols - mean * mean;
+    const float inv_std = sycl::rsqrt(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std;
+    }
+}
+
+static void group_norm_f32(const float* x, float* dst, const int group_size, const int ne_elements, const float eps,
+    const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
+    int start = item_ct1.get_group(2) * group_size;
+    int end = start + group_size;
+    const int nthreads = item_ct1.get_local_range(2);
+    const int nwarps = nthreads / WARP_SIZE;
+    assert(nwarps % WARP_SIZE == 0);
+    start += item_ct1.get_local_id(2);
+
+    if (end >= ne_elements) {
+        end = ne_elements;
+    }
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += block_size) {
+        tmp += x[j];
+    }
+
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:1: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+        tmp = 0.f;
+        int nreduce = nwarps / WARP_SIZE;
+        for (size_t i = 0; i < nreduce; i += 1)
+        {
+            tmp += s_sum[lane_id + i * WARP_SIZE];
+        }
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += block_size) {
+        float xi = x[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:2: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    float variance = tmp / group_size;
+    float scale = sycl::rsqrt(variance + eps);
+    for (int j = start; j < end; j += block_size) {
+        dst[j] *= scale;
+    }
+}
+
+static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps,
+    const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+        item_ct1.get_local_id(1);
+    const int tid = item_ct1.get_local_id(2);
+    const int nthreads = item_ct1.get_local_range(2);
+    const int nwarps = nthreads / WARP_SIZE;
+    assert(nwarps % WARP_SIZE == 0);
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row * ncols + col];
+        tmp += xi * xi;
+    }
+
+    // sum up partial sums
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:3: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        int nreduce = nwarps / WARP_SIZE;
+        tmp = 0.f;
+        for (size_t i = 0; i < nreduce; i += 1)
+        {
+            tmp += s_sum[lane_id + i * WARP_SIZE];
+        }
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    const float mean = tmp / ncols;
+    const float scale = sycl::rsqrt(mean + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row * ncols + col] = scale * x[row * ncols + col];
+    }
+}
+
+static void norm_f32_sycl(const float* x, float* dst, const int ncols,
+    const int nrows, const float eps,
+    queue_ptr stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    norm_f32(x, dst, ncols, eps, item_ct1,
+                        nullptr, WARP_SIZE);
+                });
+        });
+    }
+    else {
+        const int work_group_size = get_work_group_size(stream->get_device());
+        const sycl::range<3> block_dims(1, 1, work_group_size);
+        /*
+        DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler& cgh) {
+            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
+                sycl::range<1>(work_group_size / WARP_SIZE), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    norm_f32(x, dst, ncols, eps, item_ct1,
+                        s_sum_acc_ct1.get_pointer(), work_group_size);
+                });
+        });
+    }
+}
+
+static void group_norm_f32_sycl(const float* x, float* dst,
+    const int num_groups, const int group_size,
+    const int ne_elements, queue_ptr stream) {
+    static const float eps = 1e-6f;
+    if (group_size < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler& cgh) {
+            const float eps_ct4 = eps;
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    group_norm_f32(
+                        x, dst, group_size, ne_elements, eps_ct4, item_ct1,
+                        nullptr, WARP_SIZE);
+                });
+        });
+    }
+    else {
+        const int work_group_size = get_work_group_size(stream->get_device());
+        const sycl::range<3> block_dims(1, 1, work_group_size);
+        /*
+        DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+
+        stream->submit([&](sycl::handler& cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
+                cgh);
+
+            const float eps_ct4 = eps;
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    group_norm_f32(x, dst, group_size, ne_elements,
+                        eps_ct4, item_ct1,
+                        s_sum_acc_ct1.get_pointer(), work_group_size);
+                });
+        });
+    }
+}
+
+static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
+    const int nrows, const float eps,
+    queue_ptr stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
+    if (ncols < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler& cgh) {
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    rms_norm_f32(x, dst, ncols, eps, item_ct1,
+                        nullptr, WARP_SIZE);
+                });
+        });
+    }
+    else {
+        const int work_group_size = get_work_group_size(stream->get_device());
+        const sycl::range<3> block_dims(1, 1, work_group_size);
+        /*
+        DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler& cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
+                cgh);
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                    block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                    rms_norm_f32(x, dst, ncols, eps, item_ct1,
+                        s_sum_acc_ct1.get_pointer(), work_group_size);
+                });
+        });
+    }
+}
+
+void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
+    ggml_tensor* dst, const float* src0_dd,
+    const float* src1_dd, float* dst_dd,
+    const queue_ptr& main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
+
+    (void)src1;
+    (void)dst;
+    (void)src1_dd;
+}
+
+void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst,
+    const float* src0_dd, const float* src1_dd,
+    float* dst_dd,
+    const queue_ptr& main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    int num_groups = dst->op_params[0];
+    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
+
+    (void)src1;
+    (void)dst;
+    (void)src1_dd;
+}
+
+void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst,
+    const float* src0_dd, const float* src1_dd,
+    float* dst_dd,
+    const queue_ptr& main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
+
+    (void)src1;
+    (void)dst;
+    (void)src1_dd;
+}
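For reference, the kernels in norm.cpp implement the textbook normalizations, each reduced per row (or per group, with a fixed eps of 1e-6 for group_norm):

    \mu = \frac{1}{n}\sum_i x_i, \qquad
    \sigma^2 = \frac{1}{n}\sum_i x_i^2 - \mu^2, \qquad
    \text{norm: } y_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \varepsilon}}

    \text{rms\_norm: } y_i = \frac{x_i}{\sqrt{\tfrac{1}{n}\sum_j x_j^2 + \varepsilon}}

Rows shorter than 1024 elements are reduced by a single sub-group; longer rows use the full work-group plus the s_sum scratch buffer for the cross-warp reduction.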
diff --git a/ggml/src/ggml-sycl/norm.hpp b/ggml/src/ggml-sycl/norm.hpp
new file mode 100644
--- /dev/null
+++ b/ggml/src/ggml-sycl/norm.hpp
@@ -0,0 +1,35 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_NORM_HPP
+#define GGML_SYCL_NORM_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1,
+    ggml_tensor* dst, const float* src0_dd,
+    const float* src1_dd, float* dst_dd,
+    const queue_ptr& main_stream);
+
+void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst,
+    const float* src0_dd, const float* src1_dd,
+    float* dst_dd,
+    const queue_ptr& main_stream);
+
+void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
+    const ggml_tensor* src1, ggml_tensor* dst,
+    const float* src0_dd, const float* src1_dd,
+    float* dst_dd,
+    const queue_ptr& main_stream);
+
+#endif // GGML_SYCL_NORM_HPP
@@ -16,7 +16,7 @@
 #define GGML_SYCL_MAX_STREAMS 8
 #define GGML_SYCL_MAX_BUFFERS 256
 
-#define WARP_SIZE 32
+#define WARP_SIZE GGML_SYCL_WARP_SIZE
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #define SYCL_GELU_BLOCK_SIZE 256
diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp
new file mode 100644
--- /dev/null
+++ b/ggml/src/ggml-sycl/rope.cpp
@@ -0,0 +1,275 @@
+#include "rope.hpp"
+
+struct rope_corr_dims {
+    float v[2];
+};
+
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low);
+    return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static void rope_yarn(
+    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale);
+    }
+    *cos_theta = sycl::cos(theta) * mscale;
+    *sin_theta = sycl::sin(theta) * mscale;
+}
+
+template<typename T, bool has_ff>
+static void rope_norm(
+    const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors,
+    const sycl::nd_item<3> &item_ct1) {
+    const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                        item_ct1.get_local_id(1));
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+
+    if (i0 >= n_dims) {
+        const int i = row*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i = row*ne0 + i0;
+    const int i2 = row/p_delta_rows;
+
+    const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + 1];
+
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + 1] = x0*sin_theta + x1*cos_theta;
+}
+
+template<typename T, bool has_ff>
+static void rope_neox(
+    const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors,
+    const sycl::nd_item<3> &item_ct1) {
+    const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                        item_ct1.get_local_id(1));
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+
+    if (i0 >= n_dims) {
+        const int i = row*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i = row*ne0 + i0/2;
+    const int i2 = row/p_delta_rows;
+
+    const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims/2];
+
+    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
+
+template <typename T>
+static void rope_norm_sycl(
+    const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, num_blocks_x, nr);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    dpct::has_capability_or_fail(stream->get_device(),
+                                 {sycl::aspect::fp16});
+
+    if (freq_factors == nullptr) {
+        /*
+        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_norm<T, false>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
+                                    ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
+                                    item_ct1);
+            });
+    } else {
+        /*
+        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_norm<T, true>(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows,
+                                   ext_factor, attn_factor, corr_dims, theta_scale, freq_factors,
+                                   item_ct1);
+            });
+    }
+}
+
+template <typename T>
+static void rope_neox_sycl(
+    const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, num_blocks_x, nr);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    dpct::has_capability_or_fail(stream->get_device(),
+                                 {sycl::aspect::fp16});
+
+    if (freq_factors == nullptr) {
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_neox<T, false>(x, dst, ne0, n_dims, pos, freq_scale,
+                                    p_delta_rows, ext_factor, attn_factor,
+                                    corr_dims, theta_scale, freq_factors,
+                                    item_ct1);
+            });
+    } else {
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_neox<T, true>(x, dst, ne0, n_dims, pos, freq_scale,
+                                   p_delta_rows, ext_factor, attn_factor,
+                                   corr_dims, theta_scale, freq_factors,
+                                   item_ct1);
+            });
+    }
+}
+
+void ggml_sycl_op_rope(
+    ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream) {
+    const ggml_tensor * src2 = dst->src[2];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t nr = ggml_nrows(src0);
+
+    //const int n_past     = ((int32_t *) dst->op_params)[0];
+    const int n_dims     = ((int32_t *) dst->op_params)[1];
+    const int mode       = ((int32_t *) dst->op_params)[2];
+    //const int n_ctx      = ((int32_t *) dst->op_params)[3];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+
+    // RoPE alteration for extended context
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
+
+    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+
+    const bool is_neox = mode & 2;
+
+    const int32_t * pos = (const int32_t *) src1_dd;
+
+    const float * freq_factors = nullptr;
+    if (src2 != nullptr) {
+        freq_factors = (const float *) src2->data;
+    }
+
+    rope_corr_dims corr_dims;
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
+
+    // compute
+    if (is_neox) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_neox_sycl(
+                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, main_stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_neox_sycl(
+                (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, main_stream
+            );
+        } else {
+            GGML_ASSERT(false);
+        }
+    } else {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_norm_sycl(
+                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, main_stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_norm_sycl(
+                (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, main_stream
+            );
+        } else {
+            GGML_ASSERT(false);
+        }
+    }
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
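The YaRN helper at the top of rope.cpp mirrors the reference implementation: with scale factor s = freq_scale, extrapolation mix e = ext_factor and correction range [l, h] = corr_dims, it blends interpolated and extrapolated angles per rotary dimension:

    r(i_0) = 1 - \operatorname{clamp}\!\Big(\frac{i_0/2 - l}{\max(10^{-3},\, h - l)},\, 0,\, 1\Big), \qquad
    \theta = (1 - r e)\, s\,\theta_{\text{extrap}} + r e\,\theta_{\text{extrap}}

and, when e is nonzero, rescales the rotation magnitude by m \leftarrow m\,(1 + 0.1\ln(1/s)).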
diff --git a/ggml/src/ggml-sycl/rope.hpp b/ggml/src/ggml-sycl/rope.hpp
new file mode 100644
--- /dev/null
+++ b/ggml/src/ggml-sycl/rope.hpp
@@ -0,0 +1,22 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_ROPE_HPP
+#define GGML_SYCL_ROPE_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_op_rope(
+    ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream);
+
+#endif // GGML_SYCL_ROPE_HPP
@@ -820,7 +820,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
 #if QK_K == 256
     const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;
 
-#if QR2_XXS == 8
     const int ib32 = iqs;
     const uint16_t * q2 = bq2->qs + 4*ib32;
     const uint8_t  * aux8 = (const uint8_t *)q2;
@@ -838,26 +837,6 @@ vec_dot_iq2_xxs_q8_1(const void *__restrict__ vbq,
     }
     const float d = (float)bq2->d * (0.5f + aux32) * bq8_1[ib32].ds[0] * 0.25f;
     return d * sumi;
-#else
-    // iqs is 0...15
-    const int ib32 = iqs/2;
-    const int il = iqs%2;
-    const uint16_t * q2 = bq2->qs + 4*ib32;
-    const uint8_t  * aux8 = (const uint8_t *)q2;
-    const uint8_t  * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
-    const uint8_t  * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
-    const uint32_t aux32 = q2[2] | (q2[3] << 16);
-    const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * bq8_1[ib32].ds[0] * 0.25f;
-    const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
-    const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
-    const int8_t * q8 = bq8_1[ib32].qs + 16*il;
-    int sumi1 = 0, sumi2 = 0;
-    for (int j = 0; j < 8; ++j) {
-        sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1);
-        sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1);
-    }
-    return d * (sumi1 + sumi2);
-#endif
 #else
     assert(false);
     return 0.f;
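The surviving branch of vec_dot_iq2_xxs_q8_1 (QR2_XXS == 8) and the deleted general branch appear to compute the same quantity, an integer dot product against the sign-flipped grid values scaled by

    d_{\text{eff}} = d\,(0.5 + s)\,d_8 \cdot \tfrac{1}{4}

where d is the fp16 block scale, s the 4-bit sub-block scale (aux32 >> 28 in the removed code) and d_8 the q8_1 scale; only the per-call iteration pattern differed, so the dead branch is dropped.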
@@ -144954,4 +144954,3 @@ unsigned char sum_rows_f32_data[] = {
 
 };
 const uint64_t sum_rows_f32_len = 2112;
-
@@ -50,6 +50,8 @@ class Keys:
         POOLING_TYPE             = "{arch}.pooling_type"
         LOGIT_SCALE              = "{arch}.logit_scale"
         DECODER_START_TOKEN_ID   = "{arch}.decoder_start_token_id"
+        ATTN_LOGIT_SOFTCAPPING   = "{arch}.attn_logit_softcapping"
+        FINAL_LOGIT_SOFTCAPPING  = "{arch}.final_logit_softcapping"
 
     class Attention:
         HEAD_COUNT        = "{arch}.attention.head_count"
@@ -64,6 +66,7 @@ class Keys:
         Q_LORA_RANK       = "{arch}.attention.q_lora_rank"
         KV_LORA_RANK      = "{arch}.attention.kv_lora_rank"
        REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
+        SLIDING_WINDOW    = "{arch}.attention.sliding_window"
 
     class Rope:
         DIMENSION_COUNT  = "{arch}.rope.dimension_count"
@@ -150,6 +153,7 @@ class MODEL_ARCH(IntEnum):
     INTERNLM2    = auto()
     MINICPM      = auto()
     GEMMA        = auto()
+    GEMMA2       = auto()
     STARCODER2   = auto()
     MAMBA        = auto()
     XVERSE       = auto()
@@ -160,6 +164,7 @@ class MODEL_ARCH(IntEnum):
     DEEPSEEK2    = auto()
     BITNET       = auto()
     T5           = auto()
+    JAIS         = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -180,10 +185,13 @@ class MODEL_TENSOR(IntEnum):
     ATTN_NORM      = auto()
     ATTN_NORM_2    = auto()
     ATTN_OUT_NORM  = auto()
+    ATTN_POST_NORM = auto()
     ATTN_ROT_EMBD  = auto()
     FFN_GATE_INP   = auto()
     FFN_GATE_INP_SHEXP = auto()
     FFN_NORM       = auto()
+    FFN_PRE_NORM   = auto()
+    FFN_POST_NORM  = auto()
     FFN_GATE       = auto()
     FFN_DOWN       = auto()
     FFN_UP         = auto()
@@ -270,6 +278,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.INTERNLM2:    "internlm2",
     MODEL_ARCH.MINICPM:      "minicpm",
     MODEL_ARCH.GEMMA:        "gemma",
+    MODEL_ARCH.GEMMA2:       "gemma2",
     MODEL_ARCH.STARCODER2:   "starcoder2",
     MODEL_ARCH.MAMBA:        "mamba",
     MODEL_ARCH.XVERSE:       "xverse",
@@ -280,6 +289,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.DEEPSEEK2:    "deepseek2",
     MODEL_ARCH.BITNET:       "bitnet",
     MODEL_ARCH.T5:           "t5",
+    MODEL_ARCH.JAIS:         "jais",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -303,9 +313,12 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ATTN_Q_NORM:        "blk.{bid}.attn_q_norm",
     MODEL_TENSOR.ATTN_K_NORM:        "blk.{bid}.attn_k_norm",
     MODEL_TENSOR.ATTN_OUT_NORM:      "blk.{bid}.attn_output_norm",
+    MODEL_TENSOR.ATTN_POST_NORM:     "blk.{bid}.post_attention_norm",
     MODEL_TENSOR.FFN_GATE_INP:       "blk.{bid}.ffn_gate_inp",
     MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
     MODEL_TENSOR.FFN_NORM:           "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_PRE_NORM:       "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_POST_NORM:      "blk.{bid}.post_ffw_norm",
     MODEL_TENSOR.FFN_GATE:           "blk.{bid}.ffn_gate",
     MODEL_TENSOR.FFN_DOWN:           "blk.{bid}.ffn_down",
     MODEL_TENSOR.FFN_UP:             "blk.{bid}.ffn_up",
@@ -751,6 +764,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.FFN_NORM,
     ],
+    MODEL_ARCH.GEMMA2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.FFN_PRE_NORM,
+        MODEL_TENSOR.FFN_POST_NORM,
+    ],
     MODEL_ARCH.STARCODER2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -928,6 +956,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ENC_FFN_UP,
         MODEL_TENSOR.ENC_OUTPUT_NORM,
     ],
+    MODEL_ARCH.JAIS: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }
 
@@ -516,6 +516,12 @@ class GGUFWriter:
     def add_logit_scale(self, value: float) -> None:
         self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
 
+    def add_attn_logit_softcapping(self, value: float) -> None:
+        self.add_float32(Keys.LLM.ATTN_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
+
+    def add_final_logit_softcapping(self, value: float) -> None:
+        self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
+
     def add_expert_count(self, count: int) -> None:
         self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
 
@@ -546,6 +552,9 @@ class GGUFWriter:
     def add_relative_attn_buckets_count(self, value: int) -> None:
         self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value)
 
+    def add_sliding_window(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
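The two new writers carry Gemma 2's logit soft-capping parameters. Soft-capping squashes a logit smoothly into (-c, c) before the softmax:

    \operatorname{softcap}_c(x) = c \cdot \tanh\!\left(\frac{x}{c}\right)

with separate cap values c for the attention logits and the final output logits, which is why two keys are added.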
@@ -10,7 +10,7 @@ class TensorNameMap:
         # Token embeddings
         MODEL_TENSOR.TOKEN_EMBD: (
             "gpt_neox.embed_in",            # gptneox
-            "transformer.wte",              # gpt2 gpt-j mpt refact qwen dbrx
+            "transformer.wte",              # gpt2 gpt-j mpt refact qwen dbrx jais
             "transformer.word_embeddings",  # falcon
             "word_embeddings",              # bloom
             "model.embed_tokens",           # llama-hf
@@ -49,7 +49,7 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
@@ -58,7 +58,7 @@ class TensorNameMap:
         # Output norm
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm",  # gptneox
-            "transformer.ln_f",           # gpt2 gpt-j falcon
+            "transformer.ln_f",           # gpt2 gpt-j falcon jais
             "model.norm",                 # llama-hf baichuan internlm2
             "norm",                       # llama-pth
             "transformer.norm_f",         # mpt dbrx
@@ -81,7 +81,7 @@ class TensorNameMap:
         # Attention norm
         MODEL_TENSOR.ATTN_NORM: (
             "gpt_neox.layers.{bid}.input_layernorm",  # gptneox
-            "transformer.h.{bid}.ln_1",               # gpt2 gpt-j refact qwen
+            "transformer.h.{bid}.ln_1",               # gpt2 gpt-j refact qwen jais
             "transformer.blocks.{bid}.norm_1",        # mpt
             "transformer.h.{bid}.input_layernorm",    # falcon7b
             "h.{bid}.input_layernorm",                # bloom
@@ -109,7 +109,7 @@ class TensorNameMap:
         # Attention query-key-value
         MODEL_TENSOR.ATTN_QKV: (
             "gpt_neox.layers.{bid}.attention.query_key_value",     # gptneox
-            "transformer.h.{bid}.attn.c_attn",                     # gpt2 qwen
+            "transformer.h.{bid}.attn.c_attn",                     # gpt2 qwen jais
             "transformer.blocks.{bid}.attn.Wqkv",                  # mpt
             "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv",   # dbrx
             "transformer.h.{bid}.self_attention.query_key_value",  # falcon
@@ -160,7 +160,7 @@ class TensorNameMap:
         # Attention output
         MODEL_TENSOR.ATTN_OUT: (
             "gpt_neox.layers.{bid}.attention.dense",    # gptneox
-            "transformer.h.{bid}.attn.c_proj",          # gpt2 refact qwen
+            "transformer.h.{bid}.attn.c_proj",          # gpt2 refact qwen jais
             "transformer.blocks.{bid}.attn.out_proj",   # mpt
             "transformer.h.{bid}.self_attention.dense", # falcon
             "h.{bid}.self_attention.dense",             # bloom
@@ -187,6 +187,10 @@ class TensorNameMap:
             "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
         ),

+        MODEL_TENSOR.ATTN_POST_NORM: (
+            "model.layers.{bid}.post_attention_layernorm",  # gemma2
+        ),
+
         # Rotary embeddings
         MODEL_TENSOR.ATTN_ROT_EMBD: (
             "model.layers.{bid}.self_attn.rotary_emb.inv_freq",  # llama-hf
@@ -198,7 +202,7 @@ class TensorNameMap:
         # Feed-forward norm
         MODEL_TENSOR.FFN_NORM: (
             "gpt_neox.layers.{bid}.post_attention_layernorm",  # gptneox
-            "transformer.h.{bid}.ln_2",                        # gpt2 refact qwen
+            "transformer.h.{bid}.ln_2",                        # gpt2 refact qwen jais
             "h.{bid}.post_attention_layernorm",                # bloom
             "transformer.blocks.{bid}.norm_2",                 # mpt
             "model.layers.{bid}.post_attention_layernorm",     # llama-hf
@@ -210,6 +214,16 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm_2",  # Grok
         ),

+        # Pre feed-forward norm
+        MODEL_TENSOR.FFN_PRE_NORM: (
+            "model.layers.{bid}.pre_feedforward_layernorm",  # gemma2
+        ),
+
+        # Post feed-forward norm
+        MODEL_TENSOR.FFN_POST_NORM: (
+            "model.layers.{bid}.post_feedforward_layernorm",  # gemma2
+        ),
+
         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate",            # mixtral
             "model.layers.{bid}.block_sparse_moe.gate",  # mixtral
@@ -225,7 +239,7 @@ class TensorNameMap:
         # Feed-forward up
         MODEL_TENSOR.FFN_UP: (
             "gpt_neox.layers.{bid}.mlp.dense_h_to_4h",  # gptneox
-            "transformer.h.{bid}.mlp.c_fc",             # gpt2
+            "transformer.h.{bid}.mlp.c_fc",             # gpt2 jais
             "transformer.blocks.{bid}.ffn.up_proj",     # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h",    # falcon
             "h.{bid}.mlp.dense_h_to_4h",                # bloom
@@ -271,6 +285,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.gate_proj",         # llama-hf refact
             "layers.{bid}.feed_forward.w1",             # llama-pth
             "transformer.h.{bid}.mlp.w2",               # qwen
+            "transformer.h.{bid}.mlp.c_fc2",            # jais
             "model.layers.layers.{bid}.mlp.gate_proj",  # plamo
             "model.layers.{bid}.feed_forward.w1",       # internlm2
             "encoder.layers.{bid}.mlp.fc12",            # nomic-bert
@@ -294,7 +309,7 @@ class TensorNameMap:
         # Feed-forward down
         MODEL_TENSOR.FFN_DOWN: (
             "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",  # gptneox
-            "transformer.h.{bid}.mlp.c_proj",           # gpt2 refact qwen
+            "transformer.h.{bid}.mlp.c_proj",           # gpt2 refact qwen jais
             "transformer.blocks.{bid}.ffn.down_proj",   # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h",    # falcon
             "h.{bid}.mlp.dense_4h_to_h",                # bloom
@@ -126,19 +126,257 @@ You can use GBNF grammars:
 - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
 - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)

-Take a look at [tests](../../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
+Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).

-Here is also a non-exhaustive list of **unsupported** features:
-
-- `additionalProperties`: to be fixed in https://github.com/ggerganov/llama.cpp/pull/7840
-- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`
-- `integer` constraints to be implemented in https://github.com/ggerganov/llama.cpp/pull/7797
-- Remote `$ref`s in the C++ version (Python & JavaScript versions fetch https refs)
-- Mixing `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
-- `string` formats `uri`, `email`
+```bash
+llama-cli \
+  -hfr bartowski/Phi-3-medium-128k-instruct-GGUF \
+  -hff Phi-3-medium-128k-instruct-Q8_0.gguf \
+  -j '{
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "name": {
+                "type": "string",
+                "minLength": 1,
+                "maxLength": 100
+            },
+            "age": {
+                "type": "integer",
+                "minimum": 0,
+                "maximum": 150
+            }
+        },
+        "required": ["name", "age"],
+        "additionalProperties": false
+    },
+    "minItems": 10,
+    "maxItems": 100
+}' \
+-p 'Generate a {name, age}[] JSON array with famous actors of all ages.'
+```
+
+<details>
+
+<summary>Show grammar</summary>
+
+You can convert any schema in command-line with:
+
+```bash
+examples/json_schema_to_grammar.py name-age-schema.json
+```
+
+```
+char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+item ::= "{" space item-name-kv "," space item-age-kv "}" space
+item-age ::= ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-4] [0-9] | [5] "0")) space
+item-age-kv ::= "\"age\"" space ":" space item-age
+item-name ::= "\"" char{1,100} "\"" space
+item-name-kv ::= "\"name\"" space ":" space item-name
+root ::= "[" space item ("," space item){9,99} "]" space
+space ::= | " " | "\n" [ \t]{0,20}
+```
+
+</details>
+
+Here is also a list of known limitations (contributions welcome):
+
+- `additionalProperties` defaults to `false` (produces faster grammars + reduces hallucinations).
+- `"additionalProperties": true` may produce keys that contain unescaped newlines.
+- Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp).
+- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
+- [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works)
+- `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number`
+- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073)
+- [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$`
+- Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs)
+- `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email`
+- No [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)
+
+And a non-exhaustive list of other unsupported features that are unlikely to be implemented (hard and/or too slow to support w/ stateless grammars):

+- [`uniqueItems`](https://json-schema.org/draft/2020-12/json-schema-validation#name-uniqueitems)
 - [`contains`](https://json-schema.org/draft/2020-12/json-schema-core#name-contains) / `minContains`
-- `uniqueItems`
 - `$anchor` (cf. [dereferencing](https://json-schema.org/draft/2020-12/json-schema-core#name-dereferencing))
 - [`not`](https://json-schema.org/draft/2020-12/json-schema-core#name-not)
 - [Conditionals](https://json-schema.org/draft/2020-12/json-schema-core#name-keywords-for-applying-subsche) `if` / `then` / `else` / `dependentSchemas`
-- [`patternProperties`](https://json-schema.org/draft/2020-12/json-schema-core#name-patternproperties)
+
+### A word about additionalProperties
+
+> [!WARNING]
+> The JSON schemas spec states `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties) by default.
+> Since this is slow and seems prone to hallucinations, we default to no additional properties.
+> You can set `"additionalProperties": true` in the schema of any object to explicitly allow additional properties.
+
+If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can enable additional properties with the `extra` config on each model class:
+
+```python
+# pip install pydantic
+import json
+from typing import Annotated, List
+from pydantic import BaseModel, Extra, Field
+class QAPair(BaseModel):
+    class Config:
+        extra = 'allow'  # triggers additionalProperties: true in the JSON schema
+    question: str
+    concise_answer: str
+    justification: str
+
+class Summary(BaseModel):
+    class Config:
+        extra = 'allow'
+    key_facts: List[Annotated[str, Field(pattern='- .{5,}')]]
+    question_answers: List[Annotated[List[QAPair], Field(min_items=5)]]
+
+print(json.dumps(Summary.model_json_schema(), indent=2))
+```
+
+<details>
+<summary>Show JSON schema & grammar</summary>
+
+```json
+{
+  "$defs": {
+    "QAPair": {
+      "additionalProperties": true,
+      "properties": {
+        "question": {
+          "title": "Question",
+          "type": "string"
+        },
+        "concise_answer": {
+          "title": "Concise Answer",
+          "type": "string"
+        },
+        "justification": {
+          "title": "Justification",
+          "type": "string"
+        }
+      },
+      "required": [
+        "question",
+        "concise_answer",
+        "justification"
+      ],
+      "title": "QAPair",
+      "type": "object"
+    }
+  },
+  "additionalProperties": true,
+  "properties": {
+    "key_facts": {
+      "items": {
+        "pattern": "^- .{5,}$",
+        "type": "string"
+      },
+      "title": "Key Facts",
+      "type": "array"
+    },
+    "question_answers": {
+      "items": {
+        "items": {
+          "$ref": "#/$defs/QAPair"
+        },
+        "minItems": 5,
+        "type": "array"
+      },
+      "title": "Question Answers",
+      "type": "array"
+    }
+  },
+  "required": [
+    "key_facts",
+    "question_answers"
+  ],
+  "title": "Summary",
+  "type": "object"
+}
+```
+
+```
+QAPair ::= "{" space QAPair-question-kv "," space QAPair-concise-answer-kv "," space QAPair-justification-kv ( "," space ( QAPair-additional-kv ( "," space QAPair-additional-kv )* ) )? "}" space
+QAPair-additional-k ::= ["] ( [c] ([o] ([n] ([c] ([i] ([s] ([e] ([_] ([a] ([n] ([s] ([w] ([e] ([r] char+ | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"e] char*) | [^"s] char*) | [^"i] char*) | [^"c] char*) | [^"n] char*) | [^"o] char*) | [j] ([u] ([s] ([t] ([i] ([f] ([i] ([c] ([a] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"a] char*) | [^"c] char*) | [^"i] char*) | [^"f] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"u] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] char+ | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"cjq] char* )? ["] space
+QAPair-additional-kv ::= QAPair-additional-k ":" space value
+QAPair-concise-answer-kv ::= "\"concise_answer\"" space ":" space string
+QAPair-justification-kv ::= "\"justification\"" space ":" space string
+QAPair-question-kv ::= "\"question\"" space ":" space string
+additional-k ::= ["] ( [k] ([e] ([y] ([_] ([f] ([a] ([c] ([t] ([s] char+ | [^"s] char*) | [^"t] char*) | [^"c] char*) | [^"a] char*) | [^"f] char*) | [^"_] char*) | [^"y] char*) | [^"e] char*) | [q] ([u] ([e] ([s] ([t] ([i] ([o] ([n] ([_] ([a] ([n] ([s] ([w] ([e] ([r] ([s] char+ | [^"s] char*) | [^"r] char*) | [^"e] char*) | [^"w] char*) | [^"s] char*) | [^"n] char*) | [^"a] char*) | [^"_] char*) | [^"n] char*) | [^"o] char*) | [^"i] char*) | [^"t] char*) | [^"s] char*) | [^"e] char*) | [^"u] char*) | [^"kq] char* )? ["] space
+additional-kv ::= additional-k ":" space value
+array ::= "[" space ( value ("," space value)* )? "]" space
+boolean ::= ("true" | "false") space
+char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+decimal-part ::= [0-9]{1,16}
+dot ::= [^\x0A\x0D]
+integral-part ::= [0] | [1-9] [0-9]{0,15}
+key-facts ::= "[" space (key-facts-item ("," space key-facts-item)*)? "]" space
+key-facts-item ::= "\"" "- " key-facts-item-1{5,} "\"" space
+key-facts-item-1 ::= dot
+key-facts-kv ::= "\"key_facts\"" space ":" space key-facts
+null ::= "null" space
+number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
+question-answers ::= "[" space (question-answers-item ("," space question-answers-item)*)? "]" space
+question-answers-item ::= "[" space question-answers-item-item ("," space question-answers-item-item){4,} "]" space
+question-answers-item-item ::= QAPair
+question-answers-kv ::= "\"question_answers\"" space ":" space question-answers
+root ::= "{" space key-facts-kv "," space question-answers-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space
+space ::= | " " | "\n" [ \t]{0,20}
+string ::= "\"" char* "\"" space
+value ::= object | array | string | number | boolean | null
+```
+
+</details>
+
+If you're using [Zod](https://zod.dev/), you can make your objects explicitly allow extra properties w/ `nonstrict()` / `passthrough()` (or explicitly no extra props w/ `z.object(...).strict()` or `z.strictObject(...)`) but note that [zod-to-json-schema](https://github.com/StefanTerdell/zod-to-json-schema) currently always sets `"additionalProperties": false` anyway.
+
+```js
+import { z } from 'zod';
+import { zodToJsonSchema } from 'zod-to-json-schema';
+
+const Foo = z.object({
+  age: z.number().positive(),
+  email: z.string().email(),
+}).strict();
+
+console.log(zodToJsonSchema(Foo));
+```
+
+<details>
+<summary>Show JSON schema & grammar</summary>
+
+```json
+{
+  "type": "object",
+  "properties": {
+    "age": {
+      "type": "number",
+      "exclusiveMinimum": 0
+    },
+    "email": {
+      "type": "string",
+      "format": "email"
+    }
+  },
+  "required": [
+    "age",
+    "email"
+  ],
+  "additionalProperties": false,
+  "$schema": "http://json-schema.org/draft-07/schema#"
+}
+```
+
+```
+age-kv ::= "\"age\"" space ":" space number
+char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
+decimal-part ::= [0-9]{1,16}
+email-kv ::= "\"email\"" space ":" space string
+integral-part ::= [0] | [1-9] [0-9]{0,15}
+number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
+root ::= "{" space age-kv "," space email-kv "}" space
+space ::= | " " | "\n" [ \t]{0,20}
+string ::= "\"" char* "\"" space
+```
+
+</details>
@@ -88,6 +88,8 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_DBRX       = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG      = 14,
         LLAMA_VOCAB_PRE_TYPE_PORO       = 15,
+        LLAMA_VOCAB_PRE_TYPE_VIKING     = 16,
+        LLAMA_VOCAB_PRE_TYPE_JAIS       = 17,
     };

     // note: these values should be synchronized with ggml_rope
@@ -210,4 +210,3 @@ fi
 # more benches
 #GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1
 #GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1

577 src/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
     LLM_ARCH_GEMMA,
+    LLM_ARCH_GEMMA2,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
@@ -226,6 +227,7 @@ enum llm_arch {
     LLM_ARCH_ARCTIC,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_BITNET,
+    LLM_ARCH_JAIS,
     LLM_ARCH_T5,
     LLM_ARCH_UNKNOWN,
 };
@@ -257,6 +259,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_INTERNLM2,       "internlm2"  },
     { LLM_ARCH_MINICPM,         "minicpm"    },
     { LLM_ARCH_GEMMA,           "gemma"      },
+    { LLM_ARCH_GEMMA2,          "gemma2"     },
     { LLM_ARCH_STARCODER2,      "starcoder2" },
     { LLM_ARCH_MAMBA,           "mamba"      },
     { LLM_ARCH_XVERSE,          "xverse"     },
@@ -266,6 +269,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ARCTIC,          "arctic"     },
     { LLM_ARCH_DEEPSEEK2,       "deepseek2"  },
     { LLM_ARCH_BITNET,          "bitnet"     },
+    { LLM_ARCH_JAIS,            "jais"       },
     { LLM_ARCH_T5,              "t5"         },
     { LLM_ARCH_UNKNOWN,         "(unknown)"  },
 };
@@ -300,6 +304,8 @@ enum llm_kv {
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
+    LLM_KV_ATTN_LOGIT_SOFTCAPPING,
+    LLM_KV_FINAL_LOGIT_SOFTCAPPING,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -313,6 +319,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_Q_LORA_RANK,
     LLM_KV_ATTENTION_KV_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
+    LLM_KV_ATTENTION_SLIDING_WINDOW,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_FREQ_BASE,
@@ -390,6 +397,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_POOLING_TYPE,                  "%s.pooling_type"            },
     { LLM_KV_LOGIT_SCALE,                   "%s.logit_scale"             },
     { LLM_KV_DECODER_START_TOKEN_ID,        "%s.decoder_start_token_id"  },
+    { LLM_KV_ATTN_LOGIT_SOFTCAPPING,        "%s.attn_logit_softcapping"  },
+    { LLM_KV_FINAL_LOGIT_SOFTCAPPING,       "%s.final_logit_softcapping" },

     { LLM_KV_ATTENTION_HEAD_COUNT,          "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,       "%s.attention.head_count_kv" },
@@ -403,6 +412,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_Q_LORA_RANK,            "%s.attention.q_lora_rank"            },
     { LLM_KV_ATTENTION_KV_LORA_RANK,           "%s.attention.kv_lora_rank"           },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+    { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },

     { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count"      },
     { LLM_KV_ROPE_FREQ_BASE,            "%s.rope.freq_base"            },
@@ -478,10 +488,12 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
     LLM_TENSOR_ATTN_OUT_NORM,
+    LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_FFN_POST_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
@@ -1004,6 +1016,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GEMMA2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,     "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,    "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,      "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM,       "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM,  "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {
@@ -1173,6 +1203,21 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" },
         },
     },
+    {
+        LLM_ARCH_JAIS,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_T5,
         {
@@ -2013,6 +2058,7 @@ enum e_model {
     MODEL_780M,
     MODEL_0_5B,
     MODEL_1B,
+    MODEL_1_3B,
     MODEL_1_4B,
     MODEL_2B,
     MODEL_2_8B,
@@ -2045,6 +2091,9 @@ enum e_model {
     MODEL_8x22B,
     MODEL_16x12B,
     MODEL_10B_128x3_66B,
+    MODEL_57B_A14B,
+    MODEL_9B,
+    MODEL_27B,
 };

 static const size_t kiB = 1024;
@@ -2063,6 +2112,7 @@ struct llama_hparams {
     uint32_t n_head_kv;
     uint32_t n_layer;
     uint32_t n_rot;
+    uint32_t n_swa = 0; // sliding window attention (SWA)
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
     uint32_t n_ff;
@@ -2083,6 +2133,9 @@ struct llama_hparams {
     float f_norm_eps;
     float f_norm_rms_eps;

+    float f_attn_logit_softcapping  = 50.0f;
+    float f_final_logit_softcapping = 30.0f;
+
     float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
     float rope_freq_scale_train;
@@ -2099,8 +2152,9 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale    = 0.0f;

     bool causal_attn   = true;
     bool use_alibi     = false;
+    bool attn_soft_cap = false;

     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
@@ -2115,6 +2169,7 @@ struct llama_hparams {
     if (this->n_head_kv     != other.n_head_kv)     return true;
     if (this->n_layer       != other.n_layer)       return true;
     if (this->n_rot         != other.n_rot)         return true;
+    if (this->n_swa         != other.n_swa)         return true;
     if (this->n_embd_head_k != other.n_embd_head_k) return true;
     if (this->n_embd_head_v != other.n_embd_head_v) return true;
     if (this->n_ff          != other.n_ff)          return true;
@@ -2225,6 +2280,7 @@ struct llama_layer {
     struct ggml_tensor * attn_q_a_norm;
     struct ggml_tensor * attn_kv_a_norm;
     struct ggml_tensor * attn_sub_norm;
+    struct ggml_tensor * attn_post_norm;
     struct ggml_tensor * ffn_sub_norm;
     struct ggml_tensor * cross_attn_norm;
     struct ggml_tensor * enc_attn_norm;
@@ -2263,6 +2319,7 @@ struct llama_layer {
     // normalization
     struct ggml_tensor * ffn_norm;
     struct ggml_tensor * ffn_norm_b;
+    struct ggml_tensor * ffn_post_norm;
     struct ggml_tensor * layer_out_norm;
     struct ggml_tensor * layer_out_norm_b;
     struct ggml_tensor * ffn_norm_exps;
@@ -2651,17 +2708,18 @@ struct llama_context {
     void * abort_callback_data = nullptr;

     // input tensors
     struct ggml_tensor * inp_tokens;      // I32 [n_batch]
     struct ggml_tensor * inp_embd;        // F32 [n_embd, n_batch]
     struct ggml_tensor * inp_pos;         // I32 [n_batch]
     struct ggml_tensor * inp_out_ids;     // I32 [n_outputs]
     struct ggml_tensor * inp_KQ_mask;     // F32 [kv_size, n_batch]
+    struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
     struct ggml_tensor * inp_K_shift;     // I32 [kv_size]
     struct ggml_tensor * inp_mean;        // F32 [n_batch, n_batch]
     struct ggml_tensor * inp_cls;         // I32 [n_batch]
     struct ggml_tensor * inp_s_copy;      // I32 [kv_size]
     struct ggml_tensor * inp_s_mask;      // F32 [1, n_kv]
     struct ggml_tensor * inp_s_seq;       // I32 [n_kv, n_batch]
     struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
     struct ggml_tensor * inp_enc_output;    // F32 [n_embd, n_enc_outputs]
     struct ggml_tensor * inp_cross_KQ_mask; // F32 [n_enc_outputs, n_batch]
@@ -4282,6 +4340,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_780M:  return "780M";
         case MODEL_0_5B:  return "0.5B";
         case MODEL_1B:    return "1B";
+        case MODEL_1_3B:  return "1.3B";
         case MODEL_1_4B:  return "1.4B";
         case MODEL_2B:    return "2B";
         case MODEL_2_8B:  return "2.8B";
@@ -4314,6 +4373,9 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_8x22B:         return "8x22B";
         case MODEL_16x12B:        return "16x12B";
         case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        case MODEL_57B_A14B:      return "57B.A14B";
+        case MODEL_9B:            return "9B";
+        case MODEL_27B:           return "27B";
         default:                  return "?B";
     }
 }
@@ -4635,6 +4697,7 @@ static void llm_load_hparams(
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
             switch (hparams.n_layer) {
                 case 24: model.type = e_model::MODEL_A2_7B; break;
+                case 28: model.type = e_model::MODEL_57B_A14B; break;
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
@@ -4715,6 +4778,21 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                hparams.n_swa = 4096; // default value of gemma 2
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+                hparams.attn_soft_cap = true;
+
+                switch (hparams.n_layer) {
+                    case 42: model.type = e_model::MODEL_9B; break;
+                    case 46: model.type = e_model::MODEL_27B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4886,6 +4964,18 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+        case LLM_ARCH_JAIS:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1_3B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    /* TODO: add variants */
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_T5:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5145,6 +5235,12 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "poro-chat") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
+            } else if (
+                tokenizer_pre == "viking") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
+            } else if (
+                tokenizer_pre == "jais") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -5230,10 +5326,10 @@ static void llm_load_vocab(
         if (gen_name.find("code") != std::string::npos) {
             if (model.arch == LLM_ARCH_LLAMA
                   && 32010 < vocab.id_to_token.size()
-                  && vocab.id_to_token[32007].text == "<PRE>"
-                  && vocab.id_to_token[32008].text == "<SUF>"
-                  && vocab.id_to_token[32009].text == "<MID>"
-                  && vocab.id_to_token[32010].text == "<EOT>") {
+                  && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
+                  && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
+                  && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
+                  && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
                 vocab.special_prefix_id = 32007;
                 vocab.special_suffix_id = 32008;
                 vocab.special_middle_id = 32009;
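The equality checks above became substring checks, presumably because some conversions store the infill tokens with extra surrounding characters (a leading SentencePiece `▁` marker is one plausible case; the exact variants are not spelled out in this diff). A minimal sketch of the behavioral difference, with an assumed token text:

```cpp
// Sketch only: why find() != npos is more permissive than operator==.
// The "\u2581<PRE>" variant is an assumed example, not taken from the diff.
#include <cassert>
#include <string>

int main() {
    const std::string tok = "\u2581<PRE>"; // hypothetical stored token text
    assert(!(tok == "<PRE>"));                       // exact match fails
    assert(tok.find("<PRE>") != std::string::npos);  // substring match succeeds
    return 0;
}
```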
@@ -5442,6 +5538,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_head_kv        = %u\n", __func__, hparams.n_head_kv);
     LLAMA_LOG_INFO("%s: n_layer          = %u\n", __func__, hparams.n_layer);
     LLAMA_LOG_INFO("%s: n_rot            = %u\n", __func__, hparams.n_rot);
+    LLAMA_LOG_INFO("%s: n_swa            = %u\n", __func__, hparams.n_swa);
     LLAMA_LOG_INFO("%s: n_embd_head_k    = %u\n", __func__, hparams.n_embd_head_k);
     LLAMA_LOG_INFO("%s: n_embd_head_v    = %u\n", __func__, hparams.n_embd_head_v);
     LLAMA_LOG_INFO("%s: n_gqa            = %u\n", __func__, hparams.n_gqa());
@@ -6584,6 +6681,40 @@ static bool llm_load_tensors(
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                 }
             } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+                const int64_t n_ff          = hparams.n_ff;
+                const int64_t n_embd_head_k = hparams.n_embd_head_k;
+                const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
+                const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+                    layer.attn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+                }
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -6943,6 +7074,44 @@ static bool llm_load_tensors(
                     layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
                 }
             } break;
+        case LLM_ARCH_JAIS:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // Output
+                {
+                    model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
+                    model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                }
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+
+                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
+
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
+
+                    layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+
+                    layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+
+                    layer.ffn_gate   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i),   {n_ff});
+
+                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff});
+                }
+            } break;
         case LLM_ARCH_T5:
            {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -7636,6 +7805,12 @@ static struct ggml_tensor * llm_build_kqv(
         kq = ggml_scale(ctx, kq, 30);
     }

+    if (hparams.attn_soft_cap) {
+        kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping);
+        kq = ggml_tanh(ctx, kq);
+        kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping);
+    }
+
     kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
     cb(kq, "kq_soft_max_ext", il);
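The scale/tanh/scale sequence above implements logit soft-capping: each attention logit is squashed smoothly into (-cap, +cap) rather than hard-clipped, i.e. cap * tanh(x / cap). A scalar sketch of the same computation (illustrative; the defaults of 50.0f for attention logits and 30.0f for final logits come from the `llama_hparams` fields added earlier):

```cpp
// Scalar sketch of the soft-capping applied by the graph ops above:
// values far from zero approach +/-cap, values near zero pass through
// almost unchanged (tanh(x/cap) ~ x/cap for small x).
#include <cmath>
#include <cstdio>

static float soft_cap(float x, float cap) {
    return cap * std::tanh(x / cap);
}

int main() {
    std::printf("%f\n", soft_cap(  5.0f, 50.0f)); // ~4.98 (nearly linear)
    std::printf("%f\n", soft_cap(500.0f, 50.0f)); // ~50.0 (saturated)
    return 0;
}
```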
@@ -7817,17 +7992,18 @@ struct llm_build_context {

         ctx0 = ggml_init(params);

         lctx.inp_tokens      = nullptr;
         lctx.inp_embd        = nullptr;
         lctx.inp_pos         = nullptr;
         lctx.inp_out_ids     = nullptr;
         lctx.inp_KQ_mask     = nullptr;
+        lctx.inp_KQ_mask_swa = nullptr;
         lctx.inp_K_shift     = nullptr;
         lctx.inp_mean        = nullptr;
         lctx.inp_cls         = nullptr;
         lctx.inp_s_copy      = nullptr;
         lctx.inp_s_mask      = nullptr;
         lctx.inp_s_seq       = nullptr;
         lctx.inp_pos_bucket    = nullptr;
         lctx.inp_enc_output    = nullptr;
         lctx.inp_cross_KQ_mask = nullptr;
@@ -7849,7 +8025,6 @@ struct llm_build_context {
         cb(lctx.inp_K_shift, "K_shift", -1);
         ggml_set_input(lctx.inp_K_shift);
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * rope_factors = build_rope_factors(il);
             struct ggml_tensor * tmp =
@@ -7984,16 +8159,27 @@ struct llm_build_context {
     }

     struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
-        if (causal) {
-            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,     GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
-        } else {
-            lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
-        }
+        lctx.inp_KQ_mask = causal
+            ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,     GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
+            : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
         cb(lctx.inp_KQ_mask, "KQ_mask", -1);
         ggml_set_input(lctx.inp_KQ_mask);

         return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
     }

+    struct ggml_tensor * build_inp_KQ_mask_swa(bool causal = true) {
+        GGML_ASSERT(hparams.n_swa > 0);
+
+        lctx.inp_KQ_mask_swa = causal
+            ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv,     GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
+            : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        cb(lctx.inp_KQ_mask_swa, "KQ_mask_swa", -1);
+        ggml_set_input(lctx.inp_KQ_mask_swa);
+
+        return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask_swa, GGML_TYPE_F16) : lctx.inp_KQ_mask_swa;
+    }
+
     struct ggml_tensor * build_inp_mean() {
         lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
         cb(lctx.inp_mean, "inp_mean", -1);
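`build_inp_KQ_mask_swa()` allocates a second mask with the same shape as the regular causal mask; the difference is in how it is later filled: positions that are causal but fall outside the sliding window also get -INFINITY. A standalone sketch of that fill logic (illustrative only; the real mask is populated batch-wise elsewhere in llama.cpp):

```cpp
// Illustrative sketch: causal mask vs. causal sliding-window (SWA) mask.
// 0.0f marks an attendable position, -inf a masked one.
#include <limits>
#include <vector>

std::vector<float> make_causal_swa_mask(int n_tokens, int n_swa) {
    const float neg_inf = -std::numeric_limits<float>::infinity();
    std::vector<float> mask(n_tokens * n_tokens, neg_inf);
    for (int i = 0; i < n_tokens; ++i) {   // query position
        for (int j = 0; j <= i; ++j) {     // causal half...
            if (i - j < n_swa) {           // ...restricted to the window
                mask[i * n_tokens + j] = 0.0f;
            }
        }
    }
    return mask; // with n_swa >= n_tokens this equals the plain causal mask
}
```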
@ -11104,6 +11290,136 @@ struct llm_build_context {
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph * build_gemma2() {
|
||||||
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
|
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||||
|
|
||||||
|
struct ggml_tensor * cur;
|
||||||
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
|
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||||
|
|
||||||
|
inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
|
||||||
|
cb(inpL, "inp_scaled", -1);
|
||||||
|
|
||||||
|
// inp_pos - contains the positions
|
||||||
|
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||||
|
|
||||||
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
|
// gemma 2 requires different mask for layers using sliding window (SWA)
|
||||||
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true);
|
||||||
|
+        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true);
+
+        for (int il = 0; il < n_layer; ++il) {
+            // (il % 2) layers use SWA
+            struct ggml_tensor * KQ_mask_l = (il % 2 == 0) ? KQ_mask_swa : KQ_mask;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
+                        n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head)));
+                cb(Qcur, "Qcur_scaled", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+                        n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+
+        // final logit soft-capping
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+        cur = ggml_tanh(ctx0, cur);
+        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
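Note: the three ops at the end of build_gemma2 implement final logit soft-capping, logits = c * tanh(logits / c) with c = hparams.f_final_logit_softcapping, which squashes logits into (-c, c) while staying close to the identity near zero. A minimal scalar sketch of the same transform (standalone example; the softcap name and the test values are illustrative, and 30.0f is the cap Gemma2 checkpoints are expected to carry in their hparams):

    #include <cmath>
    #include <cstdio>

    // Same effect as scale(1/c) -> tanh -> scale(c) in the graph above.
    static float softcap(float x, float c) {
        return c * tanhf(x / c);
    }

    int main() {
        const float c    = 30.0f;                 // assumed f_final_logit_softcapping
        const float xs[] = { 0.5f, 10.0f, 100.0f };
        for (float x : xs) {
            printf("softcap(%6.1f) = %8.4f\n", x, softcap(x, c));  // near-identity, then clamped toward c
        }
        return 0;
    }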
 
     struct ggml_cgraph * build_starcoder2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -12297,6 +12613,97 @@ struct llm_build_context {
         return gf;
     }
+
+    struct ggml_cgraph * build_jais() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // add the input
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // FF
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                        model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            inpL = ggml_add(ctx0, cur, ffn_inp);
+            cb(inpL, "l_out", il);
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
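Note on the fused-QKV slicing above: wqkv projects each token to n_embd + 2*n_embd_gqa channels, and the three ggml_view_2d calls carve Q, K and V out of that buffer by element offset into the row (0, n_embd, and n_embd + n_embd_gqa respectively). A standalone sketch of the same arithmetic with plain arrays instead of ggml tensors (toy sizes, all names local to the example):

    #include <cstddef>
    #include <cstdio>

    // Fused QKV row layout per token: [ Q (n_embd) | K (n_embd_gqa) | V (n_embd_gqa) ].
    int main() {
        const size_t n_embd = 8, n_embd_gqa = 4;        // toy dimensions
        const size_t row    = n_embd + 2*n_embd_gqa;    // fused row length

        float qkv[row];
        for (size_t i = 0; i < row; ++i) qkv[i] = (float) i;

        float * Q = qkv;                                // offset 0
        float * K = qkv + n_embd;                       // offset n_embd
        float * V = qkv + n_embd + n_embd_gqa;          // offset n_embd + n_embd_gqa
        printf("Q[0]=%g K[0]=%g V[0]=%g\n", Q[0], K[0], V[0]);  // 0, 8, 12
        return 0;
    }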
 
     struct ggml_cgraph * build_t5() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
@@ -12798,6 +13205,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gemma();
             } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                result = llm.build_gemma2();
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 result = llm.build_starcoder2();
@@ -12838,6 +13249,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_bitnet();
             } break;
+        case LLM_ARCH_JAIS:
+            {
+                result = llm.build_jais();
+            } break;
         case LLM_ARCH_T5:
             {
                 result = llm.build_t5();
@@ -12975,7 +13390,12 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
 
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
 
         float * data = (float *) lctx.inp_KQ_mask->data;
+        float * data_swa = nullptr;
+
+        if (lctx.inp_KQ_mask_swa) {
+            data_swa = (float *) lctx.inp_KQ_mask_swa->data;
+        }
 
         // For causal attention, use only the previous KV cells
         // of the correct sequence for each token of the batch.
@@ -12997,6 +13417,14 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                     }
                 }
                 data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+
+                // may need to cut off old tokens for sliding window
+                if (data_swa) {
+                    if (pos - lctx.kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
+                        f = -INFINITY;
+                    }
+                    data_swa[h*(n_kv*n_tokens) + j*n_kv + i] = f;
+                }
             }
         }
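Note: the sliding-window branch re-uses the causal mask value f and additionally forces it to -INFINITY once a cached position falls n_swa or more tokens behind the current one. A toy sketch of the visibility rule (positions invented for the example; 4096 is the window Gemma2 is expected to configure):

    #include <cstdio>

    // A KV cell at cell_pos is visible to the token at pos under sliding-window
    // attention iff it is causal AND strictly inside the window of n_swa tokens.
    static bool swa_visible(int pos, int cell_pos, int n_swa) {
        return cell_pos <= pos && pos - cell_pos < n_swa;
    }

    int main() {
        const int n_swa = 4096;                           // assumed hparams.n_swa
        printf("%d\n", swa_visible(5000, 900, n_swa));    // 0: fell out of the window
        printf("%d\n", swa_visible(5000, 904, n_swa));    // 0: pos - cell_pos == n_swa
        printf("%d\n", swa_visible(5000, 905, n_swa));    // 1: still inside the window
        return 0;
    }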
@@ -14451,6 +14879,7 @@ struct llm_tokenizer_bpe {
                 break;
             case LLAMA_VOCAB_PRE_TYPE_GPT2:
             case LLAMA_VOCAB_PRE_TYPE_OLMO:
+            case LLAMA_VOCAB_PRE_TYPE_JAIS:
                 regex_exprs = {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 };
@@ -14468,6 +14897,12 @@ struct llm_tokenizer_bpe {
                     " ?[^(\\s|.,!?…。,、।۔،)]+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_VIKING:
+                regex_exprs = {
+                    "\\p{N}",
+                    " ?[^(\\s|.,!?…。,、।۔،)]+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
|
||||||
params.flash_attn = false;
|
params.flash_attn = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (params.flash_attn && model->hparams.attn_soft_cap) {
|
||||||
|
LLAMA_LOG_WARN("%s: flash_attn is not compatible with attn_soft_cap - forcing off\n", __func__);
|
||||||
|
params.flash_attn = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
|
if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
|
||||||
LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
|
LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
|
||||||
params.flash_attn = false;
|
params.flash_attn = false;
|
||||||
|
@@ -18325,6 +18766,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_JINA_BERT_V2:
         case LLM_ARCH_T5:
+        case LLM_ARCH_JAIS:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -18357,6 +18799,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
+        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_STARCODER2:
         case LLM_ARCH_GPTNEOX:
             return LLAMA_ROPE_TYPE_NEOX;
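The distinction behind these case labels: "normal" RoPE rotates consecutive value pairs (x[2i], x[2i+1]) within each head, while the NEOX variant that Gemma2 uses pairs each element with its counterpart half a head away, (x[i], x[i + d/2]). A toy sketch of the two pairings (fixed single angle, no per-position frequency schedule, purely illustrative):

    #include <cmath>
    #include <cstdio>

    // Rotate one 2D pair by angle theta.
    static void rot(float & a, float & b, float theta) {
        const float c = cosf(theta), s = sinf(theta);
        const float a2 = a*c - b*s, b2 = a*s + b*c;
        a = a2; b = b2;
    }

    int main() {
        const int d = 8;                  // head dimension (toy)
        float norm[d], neox[d];
        for (int i = 0; i < d; ++i) { norm[i] = (float) i; neox[i] = (float) i; }

        const float theta = 0.1f;         // single fixed angle just for illustration
        for (int i = 0; i < d/2; ++i) {
            rot(norm[2*i], norm[2*i + 1], theta);   // LLAMA_ROPE_TYPE_NORM pairing
            rot(neox[i],   neox[i + d/2], theta);   // LLAMA_ROPE_TYPE_NEOX pairing
        }
        printf("norm[0]=%f neox[0]=%f\n", norm[0], neox[0]);
        return 0;
    }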
@@ -20199,7 +20642,10 @@ static int32_t llama_chat_apply_template_internal(
     std::string & dest, bool add_ass) {
     // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
     std::stringstream ss;
-    if (tmpl == "chatml" || tmpl.find("<|im_start|>") != std::string::npos) {
+    auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
+        return tmpl.find(haystack) != std::string::npos;
+    };
+    if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
         // chatml template
         for (auto message : chat) {
             ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
@@ -20207,16 +20653,16 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl.find("[INST]") != std::string::npos) {
+    } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) {
         // llama2 template and its variants
         // [variant] support system message
-        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos || tmpl == "mistral";
+        bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
         // [variant] space before + after response
-        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
+        bool space_around_response = tmpl_contains("' ' + eos_token");
         // [variant] add BOS inside history
-        bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
+        bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
         // [variant] trim spaces from the input message
-        bool strip_message = tmpl.find("content.strip()") != std::string::npos;
+        bool strip_message = tmpl_contains("content.strip()");
         // construct the prompt
         bool is_inside_turn = true; // skip BOS at the beginning
         ss << "[INST] ";
@@ -20242,7 +20688,7 @@ static int32_t llama_chat_apply_template_internal(
             }
         }
         // llama2 templates seem to not care about "add_generation_prompt"
-    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+    } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
         // Phi 3
         for (auto message : chat) {
             std::string role(message->role);
@@ -20251,7 +20697,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
-    } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
+    } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
         // zephyr template
         for (auto message : chat) {
             ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
@@ -20259,7 +20705,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>\n";
         }
-    } else if (tmpl == "monarch" || tmpl.find("bos_token + message['role']") != std::string::npos) {
+    } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) {
         // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
         for (auto message : chat) {
             std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
@@ -20268,7 +20714,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<s>assistant\n";
         }
-    } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
+    } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("<start_of_turn>")) {
         // google/gemma-7b-it
         std::string system_prompt = "";
         for (auto message : chat) {
@@ -20290,7 +20736,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<start_of_turn>model\n";
         }
-    } else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
+    } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
         // OrionStarAI/Orion-14B-Chat
         std::string system_prompt = "";
         for (auto message : chat) {
@@ -20310,7 +20756,7 @@ static int32_t llama_chat_apply_template_internal(
             ss << message->content << "</s>";
         }
     }
-    } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) {
+    } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) {
         // openchat/openchat-3.5-0106,
         for (auto message : chat) {
             std::string role(message->role);
@@ -20324,13 +20770,13 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "GPT4 Correct Assistant:";
         }
-    } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) {
+    } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) {
         // eachadea/vicuna-13b-1.1 (and Orca variant)
         for (auto message : chat) {
             std::string role(message->role);
             if (role == "system") {
                 // Orca-Vicuna variant uses a system prefix
-                if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) {
+                if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) {
                     ss << "SYSTEM: " << message->content << "\n";
                 } else {
                     ss << message->content << "\n\n";
@@ -20344,7 +20790,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "ASSISTANT:";
         }
-    } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) {
+    } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) {
         // deepseek-ai/deepseek-coder-33b-instruct
         for (auto message : chat) {
             std::string role(message->role);
@@ -20359,7 +20805,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "### Response:\n";
         }
-    } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) {
+    } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) {
         // CohereForAI/c4ai-command-r-plus
         for (auto message : chat) {
             std::string role(message->role);
@@ -20374,7 +20820,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
         }
-    } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) {
+    } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) {
         // Llama 3
         for (auto message : chat) {
             std::string role(message->role);
@@ -20383,6 +20829,33 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
+    } else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) {
+        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "user") {
+                ss << u8"<用户>";
+                ss << trim(message->content);
+                ss << "<AI>";
+            } else {
+                ss << trim(message->content);
+            }
+        }
+    } else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
+        // DeepSeek-V2
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << message->content << "\n\n";
+            } else if (role == "assistant") {
+                ss << "Assistant: " << message->content << u8"<|end▁of▁sentence|>";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
     } else {
         // template not supported
         return -1;
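To exercise these detection heuristics end to end, the public wrapper llama_chat_apply_template can be called with a template name directly; passing a null model plus an explicit template string is the same pattern test-chat-template uses. A minimal sketch (assuming llama.h from this tree; model loading deliberately elided):

    #include "llama.h"
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<llama_chat_message> chat = {
            { "system", "You are a helpful assistant" },
            { "user",   "Hello" },
        };
        std::vector<char> buf(1024);
        // null model + explicit template name selects the branch by heuristic
        int32_t n = llama_chat_apply_template(nullptr, "deepseek2",
                                              chat.data(), chat.size(),
                                              /*add_ass=*/true,
                                              buf.data(), (int32_t) buf.size());
        if (n > 0 && n <= (int32_t) buf.size()) {
            printf("%.*s\n", n, buf.data());
        }
        return 0;
    }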
@@ -7030,4 +7030,3 @@ const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd
     {0x02FA1C, 0x02FA1C, 0x009F3B},
     {0x02FA1D, 0x02FA1D, 0x02A600},
 };
-
@@ -2052,6 +2052,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S,
         GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M,
         GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS,
+        GGML_TYPE_BF16,
     };
 
     // unary ops
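GGML_TYPE_BF16 joins the set of types the backend tests exercise. For reference, bf16 is simply fp32 with the low 16 bits dropped (same 8-bit exponent, 7-bit mantissa), so range is preserved at reduced precision. A self-contained sketch of the round-trip conversion (round-to-nearest-even; NaN handling omitted for brevity, and this is an illustration, not ggml's own converter):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Keep the top 16 bits of an IEEE-754 float, rounding the dropped half.
    static uint16_t fp32_to_bf16(float f) {
        uint32_t u; memcpy(&u, &f, sizeof(u));
        u += 0x7FFF + ((u >> 16) & 1);   // round to nearest, ties to even
        return (uint16_t)(u >> 16);
    }

    static float bf16_to_fp32(uint16_t h) {
        uint32_t u = (uint32_t)h << 16;
        float f; memcpy(&f, &u, sizeof(f));
        return f;
    }

    int main() {
        float x = 3.14159265f;
        printf("%.6f -> %.6f\n", x, bf16_to_fp32(fp32_to_bf16(x)));
        return 0;
    }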
@@ -57,7 +57,11 @@ int main(void) {
     //Phi-3-medium
     "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
     //Phi-3-vision
-    "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}"
+    "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
+    // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+    u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
+    // DeepSeek-V2
+    "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
 };
 std::vector<std::string> expected_output = {
     // teknium/OpenHermes-2.5-Mistral-7B
@@ -94,6 +98,10 @@ int main(void) {
     "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
     //Phi-3-vision
     "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+    // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
+    u8"You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>",
+    // DeepSeek-V2
+    u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<|end▁of▁sentence|>User: Who are you\n\nAssistant: I am an assistant <|end▁of▁sentence|>User: Another question\n\nAssistant:",
 };
 std::vector<char> formatted_chat(1024);
 int32_t res;
@@ -134,9 +142,9 @@ int main(void) {
     std::cout << "fmt_single(" << tmpl << ")\n" << output << "\n-------------------------\n";
     return output;
 };
-assert(fmt_single("chatml") == "<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
+assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
 assert(fmt_single("llama2") == "[INST] How are you [/INST]");
-assert(fmt_single("gemma") == "<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
+assert(fmt_single("gemma") == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
 assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
 
 return 0;
@@ -993,6 +993,40 @@ static void test_json_schema() {
     }
 );
 
+test_schema(
+    "simple pattern",
+    // Schema
+    R"""({
+        "pattern": "^[a-zA-Z0-9_-]*$"
+    })""",
+    // Passing strings
+    {
+        R"""("")""",
+        R"""("He_llo-12")""",
+    },
+    // Failing strings
+    {
+        R"""("!")""",
+        R"""("Hello World")""",
+    }
+);
+
+test_schema(
+    "pattern with escapes",
+    // Schema
+    R"""({
+        "pattern": "^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$"
+    })""",
+    // Passing strings
+    {
+        R"""("a^$.[]()|{}*+?b")""",
+    },
+    // Failing strings
+    {
+        R"""("ab")""",
+    }
+);
+
 test_schema(
     "",
     // Schema
@@ -1062,8 +1096,6 @@ static void test_json_schema() {
     R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
     // "By extension, even an empty object is valid"
     R"""({})""",
-    // "By default, providing additional properties is valid"
-    R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
     R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
 },
 // Failing strings
@@ -1074,6 +1106,9 @@ static void test_json_schema() {
     R"""({ "street_name": "Pennsylvania", "number": 1600 })""",
     // Reorder properties
     R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
+    // "Additional properties default to false for generation, even though the spec says true.
+    R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
 }
 );
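The two pattern tests drive the regex-to-GBNF path of the schema converter: the pattern constrains the contents of a JSON string literal, so the generated grammar wraps the translated character class in quotes. A rough illustration of the expected shape (assumed, not the converter's verbatim output):

    // Schema:  { "pattern": "^[a-zA-Z0-9_-]*$" }
    // should map to a grammar along the lines of
    //   root  ::= "\"" [a-zA-Z0-9_-]* "\"" space
    //   space ::= | " " | "\n" [ \t]{0,20}
    // i.e. the regex restricts what may appear between the string's quotes.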
@@ -1120,28 +1120,15 @@ static void test_all(const std::string & lang, std::function<void(const TestCase &)> runner) {
     R"""(
     alternative-0 ::= foo
     alternative-1 ::= bar
-    array ::= "[" space ( value ("," space value)* )? "]" space
-    bar ::= "{" space (bar-b-kv bar-b-rest | bar-additional-kv ( "," space bar-additional-kv )* )? "}" space
-    bar-additional-k ::= ["] ( [b] char+ | [^"b] char* )? ["] space
-    bar-additional-kv ::= bar-additional-k ":" space value
+    bar ::= "{" space (bar-b-kv )? "}" space
     bar-b-kv ::= "\"b\"" space ":" space number
-    bar-b-rest ::= ( "," space bar-additional-kv )*
-    boolean ::= ("true" | "false") space
-    char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
     decimal-part ::= [0-9]{1,16}
-    foo ::= "{" space (foo-a-kv foo-a-rest | foo-additional-kv ( "," space foo-additional-kv )* )? "}" space
+    foo ::= "{" space (foo-a-kv )? "}" space
     foo-a-kv ::= "\"a\"" space ":" space number
-    foo-a-rest ::= ( "," space foo-additional-kv )*
-    foo-additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space
-    foo-additional-kv ::= foo-additional-k ":" space value
     integral-part ::= [0] | [1-9] [0-9]{0,15}
-    null ::= "null" space
     number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-    object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
     root ::= alternative-0 | alternative-1
     space ::= | " " | "\n" [ \t]{0,20}
-    string ::= "\"" char* "\"" space
-    value ::= object | array | string | number | boolean | null
     )"""
 });
@@ -1177,25 +1164,15 @@ static void test_all(const std::string & lang, std::function<void(const TestCase &)> runner) {
     })""",
     R"""(
     a-kv ::= "\"a\"" space ":" space number
-    additional-k ::= ["] ( [a] char+ | [b] char+ | [c] char+ | [d] char+ | [^"abcd] char* )? ["] space
-    additional-kv ::= additional-k ":" space value
-    array ::= "[" space ( value ("," space value)* )? "]" space
     b-kv ::= "\"b\"" space ":" space number
-    boolean ::= ("true" | "false") space
     c-kv ::= "\"c\"" space ":" space number
-    c-rest ::= ( "," space additional-kv )*
-    char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})
     d-kv ::= "\"d\"" space ":" space number
-    d-rest ::= ( "," space c-kv )? c-rest
+    d-rest ::= ( "," space c-kv )?
     decimal-part ::= [0-9]{1,16}
     integral-part ::= [0] | [1-9] [0-9]{0,15}
-    null ::= "null" space
     number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space
-    object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space
-    root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv c-rest | additional-kv ( "," space additional-kv )* ) )? "}" space
+    root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
     space ::= | " " | "\n" [ \t]{0,20}
-    string ::= "\"" char* "\"" space
-    value ::= object | array | string | number | boolean | null
     )"""
 });
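Both expected grammars shrink for the same reason as the schema tests above: additionalProperties now defaults to false when converting for generation, so the additional-k / additional-kv machinery, and the value / object / array / string / boolean / null helpers it pulled in, is no longer emitted unless a schema opts in. A hypothetical before/after, as comments (schema invented for illustration):

    // schema: { "properties": { "a": { "type": "number" } } }
    // before: the grammar could also generate {"a": 1, "anything": "else"}
    // after:  only {} or {"a": 1} are derivable; extra keys now require an
    //         explicit "additionalProperties": true in the schema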
@@ -1262,26 +1239,30 @@ int main() {
         }
     });
 
-    if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) {
-        test_all("Python", [](const TestCase & tc) {
-            write("test-json-schema-input.tmp", tc.schema);
-            tc.verify_status(std::system(
-                "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
-            tc.verify(read("test-grammar-output.tmp"));
-        });
+    if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) {
+        fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m");
     } else {
-        fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m");
-    }
+        if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) {
+            test_all("Python", [](const TestCase & tc) {
+                write("test-json-schema-input.tmp", tc.schema);
+                tc.verify_status(std::system(
+                    "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
+                tc.verify(read("test-grammar-output.tmp"));
+            });
+        } else {
+            fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m");
+        }
 
         if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
             test_all("JavaScript", [](const TestCase & tc) {
                 write("test-json-schema-input.tmp", tc.schema);
                 tc.verify_status(std::system(
                     "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
                 tc.verify(read("test-grammar-output.tmp"));
             });
         } else {
             fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
         }
+    }
 
     test_all("Check Expectations Validity", [](const TestCase & tc) {
@@ -218,4 +218,3 @@ int main(int /*argc*/, const char ** /*argv*/) {
 
     return 0;
 }
-