Merge branch 'master' into vulkan

commit 64c16c4ae0
85 changed files with 12150 additions and 808 deletions
@@ -8,11 +8,11 @@ arg1="$1"
 shift

 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert_hf_to_gguf.py "$@"
+    exec python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./llama-quantize "$@"
+    exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./llama-cli "$@"
+    exec ./llama-cli "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
     for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -20,11 +20,11 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./llama-server "$@"
+    exec ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
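The change above swaps plain invocations for `exec`, so the entrypoint shell is replaced by the target process: in a container, signals such as the SIGTERM sent by `docker stop` then reach the tool directly instead of stopping at the wrapper. A minimal sketch of the difference (hypothetical wrapper, not part of the patch):

```bash
#!/usr/bin/env bash
# Hypothetical entrypoint illustrating what `exec` changes.
if [[ "${USE_EXEC:-1}" == "1" ]]; then
    exec ./llama-server "$@"   # the shell is replaced; llama-server receives signals directly
else
    ./llama-server "$@"        # llama-server runs as a child; the shell must forward signals itself
fi
```

Note that inside the `--all-in-one` loop, `exec` replaces the shell on the first iteration, so at most one model is quantized before the script process is gone.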
.github/workflows/build.yml (26 changes, vendored)

@@ -662,6 +662,8 @@ jobs:
           defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
         - build: 'msvc-arm64'
           defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+        - build: 'llvm-arm64-opencl-adreno'
+          defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

     steps:
       - name: Clone
@@ -703,6 +705,28 @@ jobs:
         run: |
           choco install ninja

+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          mkdir build && cd build
+          cmake .. `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build . --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          mkdir build-arm64-release && cd build-arm64-release
+          cmake .. `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build . --target install --config release
+
       - name: Build
         id: cmake_build
         run: |
@@ -732,7 +756,7 @@ jobs:
       - name: Test
         id: cmake_test
        # not all machines have native AVX-512
-        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
         run: |
           cd build
           ctest -L main -C Release --verbose --timeout 900
@@ -1,3 +1,5 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs

-ci/ @ggerganov
+/ci/ @ggerganov
+/.devops/ @ngxson
+/examples/server/ @ngxson
Makefile (9 changes)

@@ -22,6 +22,7 @@ BUILD_TARGETS = \
    llama-infill \
    llama-llava-cli \
    llama-minicpmv-cli\
+   llama-qwen2vl-cli\
    llama-lookahead \
    llama-lookup \
    llama-lookup-create \
@@ -1404,6 +1405,14 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
    $(OBJ_ALL)
    $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

+llama-qwen2vl-cli: examples/llava/qwen2vl-cli.cpp \
+   examples/llava/llava.cpp \
+   examples/llava/llava.h \
+   examples/llava/clip.cpp \
+   examples/llava/clip.h \
+   $(OBJ_ALL)
+   $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
    (cd examples/batched.swift; make build)
README.md (15 changes)

@@ -110,6 +110,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
 - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
+- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)

 </details>

@@ -433,6 +434,20 @@ To learn more about model quantization, [read this documentation](examples/quant

 </details>

+## [`llama-run`](examples/run)
+
+#### A comprehensive example for running `llama.cpp` models. Useful for inference. Used with RamaLama [^3].
+
+- <details>
+    <summary>Run a model with a specific prompt (by default it is pulled from the Ollama registry)</summary>
+
+      ```bash
+      llama-run granite-code
+      ```
+
+  </details>
+
+[^3]: [RamaLama](https://github.com/containers/ramalama)
+
 ## [`llama-simple`](examples/simple)
@@ -81,7 +81,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
 # Use curl to download model url
 if (LLAMA_CURL)
     find_package(CURL REQUIRED)
-    add_definitions(-DLLAMA_USE_CURL)
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
     find_library(CURL_LIBRARY curl REQUIRED)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
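The hunk above is a scoping fix: `add_definitions()` applies a compile definition to every target in the directory, while `target_compile_definitions(... PUBLIC ...)` attaches it to one target and to targets that link against it. A minimal sketch of the distinction (the `mylib` target is hypothetical, not from this diff):

```cmake
add_library(mylib mylib.cpp)

add_definitions(-DLLAMA_USE_CURL)                        # old style: leaks into every target below this directory
target_compile_definitions(mylib PUBLIC LLAMA_USE_CURL)  # new style: visible to mylib and its consumers only
```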
@@ -145,6 +145,35 @@ static void common_params_handle_model_default(common_params & params) {
     }
 }

+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 //
 // CLI argument parsing functions
 //
@@ -1174,18 +1203,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-        string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_k = value;
+            params.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-        string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_v = value;
+            params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
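Net effect of the two hunks above: `-ctk`/`-ctv` values are now validated against ggml's canonical type names at argument-parse time, so a typo fails immediately instead of surviving as a raw string until context creation. A self-contained sketch of the same table-driven pattern (the local enum and names are illustrative; only the shape of the code mirrors the diff):

```cpp
#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

enum class cache_type { f16, q8_0 };

// table of supported types and their canonical names, mirroring kv_cache_types above
static const std::vector<std::pair<cache_type, const char *>> cache_types = {
    { cache_type::f16,  "f16"  },
    { cache_type::q8_0, "q8_0" },
};

static cache_type cache_type_from_str(const std::string & s) {
    for (const auto & [type, name] : cache_types) {
        if (s == name) {
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s); // fail fast, at parse time
}

int main() {
    std::cout << static_cast<int>(cache_type_from_str("q8_0")) << "\n"; // prints 1
    // cache_type_from_str("q8_8") would throw before any model is touched
}
```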
@@ -2083,35 +2122,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
         [](common_params & params, const std::string & value) {
             params.speculative.p_split = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
     add_opt(common_arg(
         {"--draft-p-min"}, "P",
         string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2131,14 +2170,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
         }
     }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));

     return ctx_arg;
 }
@@ -1015,38 +1015,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     return mparams;
 }

-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "bf16") {
-        return GGML_TYPE_BF16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    throw std::runtime_error("Unsupported cache type: " + s);
-}
-
 struct llama_context_params common_context_params_to_llama(const common_params & params) {
     auto cparams = llama_context_default_params();

@@ -1081,8 +1049,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
         cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
     }

-    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
-    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
+    cparams.type_k = params.cache_type_k;
+    cparams.type_v = params.cache_type_v;

     return cparams;
 }
@@ -1108,12 +1076,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2

-
-static bool starts_with(const std::string & str, const std::string & prefix) {
-    // While we wait for C++20's std::string::starts_with...
-    return str.rfind(prefix, 0) == 0;
-}
-
 static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;
@@ -37,9 +37,9 @@ using llama_tokens = std::vector<llama_token>;

 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char const * LLAMA_COMMIT;
-extern char const * LLAMA_COMPILER;
-extern char const * LLAMA_BUILD_TARGET;
+extern const char * LLAMA_COMMIT;
+extern const char * LLAMA_COMPILER;
+extern const char * LLAMA_BUILD_TARGET;

 struct common_control_vector_load_info;

@@ -286,8 +286,8 @@ struct common_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data

-    std::string cache_type_k = "f16"; // KV cache data type for the K
-    std::string cache_type_v = "f16"; // KV cache data type for the V
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector // NOLINT

@@ -437,6 +437,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
     return parts;
 }

+static bool string_starts_with(const std::string & str,
+                               const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
+    return str.rfind(prefix, 0) == 0;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
@@ -2001,6 +2001,29 @@ class Qwen2Model(Model):
             self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])


+@Model.register("Qwen2VLForConditionalGeneration")
+class Qwen2VLModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2VL
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        mrope_section = self.hparams["rope_scaling"]["mrope_section"]
+        mrope_section += [0] * max(0, 4 - len(mrope_section))
+        self.gguf_writer.add_rope_dimension_sections(mrope_section)
+
+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        for name, data in super().get_tensors():
+            if name.startswith("visual."):
+                continue
+            yield name, data
+
+
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN2MOE
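One detail worth spelling out in the new `Qwen2VLModel`: `mrope_section` from the checkpoint may carry fewer than four entries, and the writer expects exactly four. An illustration with an assumed config value (the real numbers come from the model's `config.json`):

```python
# Assumed value for illustration only:
mrope_section = [16, 24, 24]                           # "rope_scaling"."mrope_section"
mrope_section += [0] * max(0, 4 - len(mrope_section))  # pad with zeros to exactly 4 sections
assert mrope_section == [16, 24, 24, 0]
```

The `get_tensors` override also skips every tensor under the `visual.` prefix, presumably because the vision tower is converted separately (see the clip.cpp changes below).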
@@ -20,7 +20,12 @@ else()
     add_subdirectory(batched)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
+
+    if (NOT WIN32)
+        # disabled on Windows because it uses internal functions not exported with LLAMA_API
         add_subdirectory(gbnf-validator)
+    endif()
+
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
     add_subdirectory(gguf)
@@ -46,12 +51,16 @@ else()
     add_subdirectory(speculative)
     add_subdirectory(speculative-simple)
     add_subdirectory(tokenize)
+    add_subdirectory(gen-docs)
     if (NOT GGML_BACKEND_DL)
         # these examples use the backends directly and cannot be built with dynamic loading
         add_subdirectory(convert-llama2c-to-ggml)
         add_subdirectory(cvector-generator)
         add_subdirectory(export-lora)
+        if (NOT WIN32)
+            # disabled on Windows because it uses internal functions not exported with LLAMA_API
             add_subdirectory(quantize-stats)
+        endif()
         add_subdirectory(llava)
         if (GGML_RPC)
             add_subdirectory(rpc)
@@ -287,7 +287,7 @@ struct split_strategy {
     }

     void print_info() {
-        printf("n_split: %ld\n", ctx_outs.size());
+        printf("n_split: %zu\n", ctx_outs.size());
         int i_split = 0;
         for (auto & ctx_out : ctx_outs) {
             // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
@@ -297,7 +297,7 @@ struct split_strategy {
                 total_size += ggml_nbytes(t);
             }
             total_size = total_size / 1000 / 1000; // convert to megabytes
-            printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
+            printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
             i_split++;
         }
     }
@@ -1521,7 +1521,7 @@ int main(int argc, char ** argv) {
     for (const auto & inst : params_instances) {
         params_idx++;
         if (params.progress) {
-            fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
+            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
         }
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@@ -1573,14 +1573,14 @@ int main(int argc, char ** argv) {
         // warmup run
         if (t.n_prompt > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
             }
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
             test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
             }
             test_gen(ctx, 1, t.n_threads);
         }
@@ -1592,14 +1592,14 @@ int main(int argc, char ** argv) {

         if (t.n_prompt > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count,
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
                         i + 1, params.reps);
             }
             test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count,
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
                         i + 1, params.reps);
             }
             test_gen(ctx, t.n_gen, t.n_threads);
@@ -43,3 +43,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+set(TARGET llama-qwen2vl-cli)
+add_executable(${TARGET} qwen2vl-cli.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
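With the Makefile and CMake rules in place, the new binary presumably mirrors the invocation of the other llava-family CLIs; the flags below are assumed from `llama-llava-cli` and are not shown anywhere in this diff:

```bash
# Hypothetical invocation (file names are placeholders):
./llama-qwen2vl-cli -m qwen2vl-text.gguf --mmproj qwen2vl-vision.gguf \
    --image input.png -p "Describe this image."
```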
@@ -102,7 +102,9 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
 #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
 #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
+#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
 #define KEY_USE_GELU "clip.use_gelu"
+#define KEY_USE_SILU "clip.use_silu"
 #define KEY_N_EMBD "clip.%s.embedding_length"
 #define KEY_N_FF "clip.%s.feed_forward_length"
 #define KEY_N_BLOCK "clip.%s.block_count"
@@ -129,7 +131,8 @@ static std::string format(const char * fmt, ...) {
 #define TN_TOKEN_EMBD "%s.token_embd.weight"
 #define TN_POS_EMBD "%s.position_embd.weight"
 #define TN_CLASS_EMBD "v.class_embd"
-#define TN_PATCH_EMBD "v.patch_embd.weight"
+#define TN_PATCH_EMBD "v.patch_embd.weight" // the tensor is not renamed with a ".0" postfix for backward compat
+#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
 #define TN_PATCH_BIAS "v.patch_embd.bias"
 #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
@@ -163,6 +166,7 @@ enum projector_type {
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
     PROJECTOR_TYPE_RESAMPLER,
+    PROJECTOR_TYPE_MERGER,
     PROJECTOR_TYPE_UNKNOWN,
 };
@@ -171,6 +175,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LDP,       "ldp" },
     { PROJECTOR_TYPE_LDPV2,     "ldpv2"},
     { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+    { PROJECTOR_TYPE_MERGER,    "qwen2vl_merger"},
 };
@@ -463,7 +468,8 @@ struct clip_vision_model {

     // embeddings
     struct ggml_tensor * class_embedding;
-    struct ggml_tensor * patch_embeddings;
+    struct ggml_tensor * patch_embeddings_0;
+    struct ggml_tensor * patch_embeddings_1;  // second Conv2D kernel when we decouple Conv3D along the temporal dimension (Qwen2VL)
     struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;
@@ -553,6 +559,7 @@ struct clip_ctx {
     bool has_vision_encoder = false;
     bool has_llava_projector = false;
     bool has_minicpmv_projector = false;
+    bool has_qwen2vl_merger = false;
     int minicpmv_version = 2;

     struct clip_vision_model vision_model;
@@ -561,6 +568,7 @@ struct clip_ctx {
     float image_mean[3];
     float image_std[3];
     bool use_gelu = false;
+    bool use_silu = false;
     int32_t ftype = 1;

     bool has_class_embedding = true;
@@ -606,14 +614,26 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             image_size_height = imgs->data->ny;
         }
     }
+    else if (ctx->has_qwen2vl_merger) {
+        // use the image's native resolution when the image is available
+        if (is_inf) {
+            // if (imgs->data->nx && imgs->data->ny) {
+            image_size_width  = imgs->data->nx;
+            image_size_height = imgs->data->ny;
+        }
+    }
     const int patch_size       = hparams.patch_size;
     const int num_patches      = ((image_size_width / patch_size) * (image_size_height / patch_size));
+    const int patches_w        = image_size_width / patch_size;
+    const int patches_h        = image_size_height / patch_size;
     const int num_positions    = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions;
     const int hidden_size      = hparams.hidden_size;
     const int n_head           = hparams.n_head;
     const int d_head           = hidden_size / n_head;
     int n_layer                = hparams.n_layer;
     const float eps            = hparams.eps;
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

     const int batch_size = imgs->size;

@@ -634,10 +654,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);

-    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+    if (ctx->has_qwen2vl_merger) {
+        GGML_ASSERT(image_size_width  % (patch_size * 2) == 0);
+        GGML_ASSERT(image_size_height % (patch_size * 2) == 0);
+
+        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+        inp = ggml_add(ctx0, inp, inp_1);
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3));  // [w, h, c, b] -> [c, w, h, b]
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            hidden_size * 2, patches_w / 2, patches_h, batch_size);
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
+        inp = ggml_reshape_3d(
+            ctx0, inp,
+            hidden_size, patches_w * patches_h, batch_size);
+    }
+    else {
         inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
         inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+    }

     if (ctx->has_patch_bias) {
         // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
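For intuition about the reshape chain above, with illustrative numbers not taken from the diff: say `patch_size` is 14 and the input is 56×56, so `patches_w = patches_h = 4` and the convolution yields `hidden_size` channels for each of the 16 patches. The permute brings channels first, the first `ggml_reshape_4d` pairs each patch with its horizontal neighbour (rows of `hidden_size * 2`), the second reshape plus permute pairs two such rows, and the final `ggml_reshape_3d` flattens back to `[hidden_size, patches_w * patches_h, batch]` with the four members of each 2×2 block stored consecutively; the `PROJECTOR_TYPE_MERGER` branch later regroups them as `hidden_size * 4` for its first linear layer. The two `GGML_ASSERT`s guarantee the image divides evenly into these 2×2 blocks.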
@@ -659,12 +699,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }
     }

-    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
+    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
     ggml_set_name(positions, "positions");
     ggml_set_input(positions);

+    if (!ctx->has_qwen2vl_merger) { // qwen2vl uses rope position embedding
     embeddings =
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+    }

     if (ctx->has_minicpmv_projector) {
         int pos_w = image_size_width/patch_size;
@@ -688,7 +730,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }

     // loop over layers
-    if (ctx->has_minicpmv_projector) {
+    if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
+        // TODO: figure out why we are doing it this way ???
         n_layer += 1;
     }
     for (int il = 0; il < n_layer - 1; il++) {
@@ -710,8 +753,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         struct ggml_tensor * Q =
             ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);

-        Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
         Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
+        if (ctx->has_qwen2vl_merger) {
+            Q = ggml_rope_multi(
+                ctx0, Q, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+        }
+        Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
         Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
         Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);

@@ -719,6 +767,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);

         K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+        if (ctx->has_qwen2vl_merger) {
+            K = ggml_rope_multi(
+                ctx0, K, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+        }
         K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
         K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);

@@ -758,6 +811,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

         if (ctx->use_gelu) {
             cur = ggml_gelu_inplace(ctx0, cur);
+        } else if (ctx->use_silu) {
+            cur = ggml_silu_inplace(ctx0, cur);
         } else {
             cur = ggml_gelu_quick_inplace(ctx0, cur);
         }
@@ -769,6 +824,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         cur = ggml_add(ctx0, embeddings, cur);

         embeddings = cur;
+
     }

     // post-layernorm
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
|
||||||
|
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
|
||||||
|
|
||||||
|
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||||
|
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||||
|
|
||||||
|
// GELU activation
|
||||||
|
embeddings = ggml_gelu(ctx0, embeddings);
|
||||||
|
|
||||||
|
// Second linear layer
|
||||||
|
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
|
||||||
|
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
|
||||||
|
}
|
||||||
|
|
||||||
// build the graph
|
// build the graph
|
||||||
ggml_build_forward_expand(gf, embeddings);
|
ggml_build_forward_expand(gf, embeddings);
|
||||||
|
@ -1206,6 +1275,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
|
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
|
||||||
|
if (idx != -1) {
|
||||||
|
new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
|
||||||
|
}
|
||||||
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
|
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
|
||||||
|
|
||||||
GGML_ASSERT(new_clip->has_vision_encoder);
|
GGML_ASSERT(new_clip->has_vision_encoder);
|
||||||
|
@ -1214,6 +1287,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
idx = get_key_idx(ctx, KEY_USE_GELU);
|
idx = get_key_idx(ctx, KEY_USE_GELU);
|
||||||
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
||||||
|
|
||||||
|
try {
|
||||||
|
idx = get_key_idx(ctx, KEY_USE_SILU);
|
||||||
|
new_clip->use_silu = gguf_get_val_bool(ctx, idx);
|
||||||
|
} catch (std::runtime_error & /*e*/) {
|
||||||
|
new_clip->use_silu = false;
|
||||||
|
}
|
||||||
|
|
||||||
if (verbosity >= 1) {
|
if (verbosity >= 1) {
|
||||||
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||||
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||||
|
@ -1389,11 +1469,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||||
} catch(const std::exception& /*e*/) {
|
} catch(const std::exception& /*e*/) {
|
||||||
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
|
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
|
||||||
}
|
}
|
||||||
|
try {
|
||||||
|
vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1);
|
||||||
|
} catch(const std::exception& /*e*/) {
|
||||||
|
new_clip->has_qwen2vl_merger = false;
|
||||||
|
}
|
||||||
|
|
||||||
// LLaVA projection
|
// LLaVA projection
|
||||||
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||||
|
@ -1481,6 +1566,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
|
vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
|
||||||
vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
|
vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
|
||||||
}
|
}
|
||||||
|
else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
|
||||||
|
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||||
|
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
|
||||||
|
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||||
|
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
||||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||||
|
@ -1519,6 +1610,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
||||||
clip_image_f32_batch batch;
|
clip_image_f32_batch batch;
|
||||||
batch.size = 1;
|
batch.size = 1;
|
||||||
|
batch.data = nullptr;
|
||||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
||||||
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||||
|
@ -1532,6 +1624,10 @@ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size
|
||||||
ctx_clip->load_image_size = load_image_size;
|
ctx_clip->load_image_size = load_image_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
|
||||||
|
return ctx_clip->load_image_size;
|
||||||
|
}
|
||||||
|
|
||||||
struct clip_image_size * clip_image_size_init() {
|
struct clip_image_size * clip_image_size_init() {
|
||||||
struct clip_image_size * load_image_size = new struct clip_image_size();
|
struct clip_image_size * load_image_size = new struct clip_image_size();
|
||||||
load_image_size->width = 448;
|
load_image_size->width = 448;
|
||||||
|
@ -1984,6 +2080,23 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
else if (ctx->has_qwen2vl_merger) {
|
||||||
|
clip_image_u8 * resized = clip_image_u8_init();
|
||||||
|
auto patch_size = clip_patch_size(ctx) * 2;
|
||||||
|
int nx = ceil((float)img->nx / patch_size) * patch_size;
|
||||||
|
int ny = ceil((float)img->ny / patch_size) * patch_size;
|
||||||
|
bicubic_resize(*img, *resized, nx, ny);
|
||||||
|
|
||||||
|
res_imgs->data = new clip_image_f32[1];
|
||||||
|
// clip_image_f32 * res = clip_image_f32_init();
|
||||||
|
normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
|
||||||
|
// res_imgs->data[0] = *res;
|
||||||
|
res_imgs->size = 1;
|
||||||
|
|
||||||
|
// clip_image_f32_free(res);
|
||||||
|
clip_image_u8_free(resized);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool pad_to_square = true;
|
bool pad_to_square = true;
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
|
@ -2173,6 +2286,13 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
|
||||||
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
|
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
|
||||||
|
clip_image_f32 img;
|
||||||
|
img.nx = img_w;
|
||||||
|
img.ny = img_h;
|
||||||
|
return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
|
||||||
|
}
|
||||||
|
|
||||||
int32_t clip_image_size(const struct clip_ctx * ctx) {
|
int32_t clip_image_size(const struct clip_ctx * ctx) {
|
||||||
return ctx->vision_model.hparams.image_size;
|
return ctx->vision_model.hparams.image_size;
|
||||||
}
|
}
|
||||||
|
@ -2194,6 +2314,13 @@ const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int clip_n_patches(const struct clip_ctx * ctx) {
|
int clip_n_patches(const struct clip_ctx * ctx) {
|
||||||
|
clip_image_f32 img;
|
||||||
|
img.nx = ctx->vision_model.hparams.image_size;
|
||||||
|
img.ny = ctx->vision_model.hparams.image_size;
|
||||||
|
return clip_n_patches_by_img(ctx, &img);
|
||||||
|
}
|
||||||
|
|
||||||
|
int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
||||||
const auto & params = ctx->vision_model.hparams;
|
const auto & params = ctx->vision_model.hparams;
|
||||||
|
|
||||||
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
||||||
|
@@ -2207,6 +2334,11 @@ int clip_n_patches(const struct clip_ctx * ctx) {
         else if (ctx->minicpmv_version == 3) {
             n_patches = 64;
         }
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        int patch_size = params.patch_size * 2;
+        int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
+        int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
+        n_patches = x_patch * y_patch;
     }

     return n_patches;
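Worked example for the merger branch (illustrative numbers): with `params.patch_size = 14`, the effective merged patch is 28 px, so a 448×448 image yields `x_patch = y_patch = 448 / 28 = 16` and `n_patches = 256`, one embedding per 2×2 patch block. The `+ (int)(img->nx % patch_size > 0)` terms round up, so images that are not exact multiples of 28 still get a full extra column or row of patches.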
@@ -2335,7 +2467,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int image_size = hparams.image_size;
     int image_size_width  = image_size;
     int image_size_height = image_size;
-    if (ctx->has_minicpmv_projector) {
+    if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
         image_size_width  = imgs->data[0].nx;
         image_size_height = imgs->data[0].ny;
     }
@@ -2355,7 +2487,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     for (size_t i = 0; i < imgs->size; i++) {
         const int nx = imgs->data[i].nx;
         const int ny = imgs->data[i].ny;
-        if (!ctx->has_minicpmv_projector) {
+        if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
             GGML_ASSERT(nx == image_size && ny == image_size);
         }
@@ -2435,7 +2567,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
     }

+    if (ctx->has_qwen2vl_merger) {
+        struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+
+        const int pw = image_size_width / patch_size;
+        const int ph = image_size_height / patch_size;
+        int* positions_data = (int*)malloc(ggml_nbytes(positions));
+
+        int ptr = 0;
+        for (int y = 0; y < ph; y+=2)
         {
+            for (int x = 0; x < pw; x+=2)
+            {
+                for (int dy = 0; dy < 2; dy++) {
+                    for (int dx = 0; dx < 2; dx++) {
+                        positions_data[ptr]                   = y + dy;
+                        positions_data[num_patches + ptr]     = x + dx;
+                        positions_data[num_patches * 2 + ptr] = y + dy;
+                        positions_data[num_patches * 3 + ptr] = x + dx;
+                        ptr++;
+                    }
+                }
+            }
+        }
+
+        ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+        free(positions_data);
+    }
+    else {
         struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");

         int* positions_data = (int*)malloc(ggml_nbytes(positions));
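The layout being filled in above: `positions` holds `num_patches * 4` entries arranged as four consecutive planes of length `num_patches`, with planes 0 and 2 carrying each patch's row index and planes 1 and 3 its column index; these feed the `ggml_rope_multi` calls in the graph as the per-section M-RoPE position ids. The loop walks the grid in 2×2 blocks (`y` and `x` advance by 2 while `dy`, `dx` cover the block), so the four patches of each block occupy consecutive `ptr` slots, matching the order the merger groups them in.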
@@ -2444,7 +2603,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         }
         ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
         free(positions_data);
-    }

     {
         struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
@@ -2456,6 +2614,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             free(patches_data);
         }
     }
+    }

     if (ggml_backend_is_cpu(ctx->backend)) {
         ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
@@ -2626,6 +2785,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return 3584;
     }
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+        return ctx->vision_model.mm_1_b->ne[0];
+    }

     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@ -2637,3 +2799,21 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
|
||||||
|
return ctx->has_qwen2vl_merger;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
|
||||||
|
clip_image_f32 clip_img;
|
||||||
|
clip_img.buf.resize(h * w * 3);
|
||||||
|
for (int i = 0; i < h*w*3; i++)
|
||||||
|
{
|
||||||
|
clip_img.buf[i] = img[i];
|
||||||
|
}
|
||||||
|
clip_img.nx = w;
|
||||||
|
clip_img.ny = h;
|
||||||
|
clip_image_encode(ctx, n_threads, &clip_img, vec);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
|
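The qwen2vl branch above fills the `positions` tensor with four rows of `num_patches` entries each (presumably the four M-RoPE sections), walking the patch grid in 2x2 blocks so the patches merged by the qwen2vl merger are stored contiguously. Below is a minimal, dependency-free sketch of that layout, not the clip.cpp code itself; the 8x8 grid size is an arbitrary assumption for illustration:

```cpp
#include <cstdio>
#include <vector>

int main() {
    // assumed example grid: 8x8 patches (image_size / patch_size = 8)
    const int ph = 8, pw = 8;
    const int num_patches = ph * pw;
    // four position rows, as in the diff: y, x, y, x per patch
    std::vector<int> pos(num_patches * 4);

    int ptr = 0;
    for (int y = 0; y < ph; y += 2) {
        for (int x = 0; x < pw; x += 2) {
            // visit each 2x2 block in row-major order so the four patches
            // that the merger collapses into one token stay adjacent
            for (int dy = 0; dy < 2; dy++) {
                for (int dx = 0; dx < 2; dx++) {
                    pos[ptr]                   = y + dy;
                    pos[num_patches + ptr]     = x + dx;
                    pos[num_patches * 2 + ptr] = y + dy;
                    pos[num_patches * 3 + ptr] = x + dx;
                    ptr++;
                }
            }
        }
    }

    // print the first block: patches (0,0) (0,1) (1,0) (1,1)
    for (int i = 0; i < 4; i++) {
        printf("patch %d: y=%d x=%d\n", i, pos[i], pos[num_patches + i]);
    }
    return 0;
}
```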
@ -45,6 +45,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity
CLIP_API void clip_free(struct clip_ctx * ctx);

CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);

CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);

@ -56,10 +57,12 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);

CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);

CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8  * clip_image_u8_init ();

@ -86,6 +89,9 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);

CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);

#ifdef __cplusplus
}
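With the new per-image variants, callers can size embedding buffers from the preprocessed image's actual dimensions rather than the model's square `image_size`. A hedged sketch of the intended call pattern follows; note the header names the parameters `(img_h, img_w)` while the llava.cpp call site in this commit passes `(nx, ny)`, and the sketch follows the call site. The `ctx` and `img` setup is assumed to have happened elsewhere:

```cpp
#include <cstdlib>
#include "clip.h"

// Sketch: encode one preprocessed image into a buffer sized for its actual
// dimensions, and report how many embedding positions it produces.
static float * encode_one(struct clip_ctx * ctx, struct clip_image_f32 * img,
                          int n_threads, int * n_pos) {
    // same argument order as the llava.cpp call site in this commit (nx, ny)
    float * embd = (float *) malloc(clip_embd_nbytes_by_img(ctx, img->nx, img->ny));
    if (!embd || !clip_image_encode(ctx, n_threads, img, embd)) {
        free(embd);
        return nullptr;
    }
    *n_pos = clip_n_patches_by_img(ctx, img); // tokens this image contributes
    return embd;
}
```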
@ -259,18 +259,24 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

    const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);

    if (clip_is_minicpmv(ctx_clip)) {
    if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
        std::vector<float *> image_embd_v;
        image_embd_v.resize(img_res_v.size);
        struct clip_image_size * load_image_size = clip_image_size_init();

        for (size_t i = 0; i < img_res_v.size; i++) {
            const int64_t t_img_enc_step_start_us = ggml_time_us();
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
            int patch_size = 14;
            load_image_size->width  = img_res_v.data[i].nx;
            load_image_size->height = img_res_v.data[i].ny;
            clip_add_load_image_size(ctx_clip, load_image_size);

            bool encoded = false;
            if (clip_is_qwen2vl(ctx_clip)) {
                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
            }
            else {
            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
            if (has_minicpmv_projector == 2) {
                encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);

@ -278,6 +284,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
            else if (has_minicpmv_projector == 3) {
                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
            }
            }

            if (!encoded) {
                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                return false;

@ -290,8 +298,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

        int n_img_pos_out = 0;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
            std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
            std::memcpy(
                image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                image_embd_v[i],
                clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
            n_img_pos_out += clip_n_patches(ctx_clip);
            n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
        }
        *n_img_pos = n_img_pos_out;
        for (size_t i = 0; i < image_embd_v.size(); i++) {

@ -387,7 +398,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
    if (clip_is_minicpmv(ctx_clip)) {
        num_max_patches = 10;
    }
    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
    float * image_embd;
    if (clip_is_qwen2vl(ctx_clip)) {
        // qwen2vl doesn't split the image into chunks, so `num_max_patches` is not needed
        image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny));
    } else {
        image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
    }
    if (!image_embd) {
        LOG_ERR("Unable to allocate memory for image embeddings\n");
        return false;
158  examples/llava/qwen2_vl_surgery.py  Normal file

@ -0,0 +1,158 @@
import argparse
import os
from typing import Dict

import torch
import numpy as np
from gguf import *
from transformers import (
    Qwen2VLForConditionalGeneration,
    Qwen2VLProcessor,
    AutoProcessor,
    Qwen2VLConfig
)


VISION = "clip.vision"


def k(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)


def to_gguf_name(name: str) -> str:
    og = name
    name = name.replace("text_model", "t").replace("vision_model", "v")
    name = name.replace("blocks", "blk").replace("embeddings.", "")
    name = name.replace("attn.", "attn_")
    name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.")
    # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln")
    name = name.replace("norm1", "ln1").replace("norm2", "ln2")
    name = name.replace("merger.mlp", 'mm')
    print(f"[to_gguf_name] {og} --> {name}")
    return name


def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]:
    vision_model = qwen2vl.visual
    tensor_map = {}
    for name, ten in vision_model.state_dict().items():
        ten = ten.numpy()
        if 'qkv' in name:
            if ten.ndim == 2:  # weight
                c3, _ = ten.shape
            else:              # bias
                c3 = ten.shape[0]
            assert c3 % 3 == 0
            c = c3 // 3
            wq = ten[:c]
            wk = ten[c: c * 2]
            wv = ten[c * 2:]
            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq
            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk
            tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv
        elif 'merger' in name:
            if name.endswith("ln_q.weight"):
                tensor_map['v.post_ln.weight'] = ten
            elif name.endswith("ln_q.bias"):
                tensor_map['v.post_ln.bias'] = ten
            else:
                # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias"
                tensor_map[to_gguf_name(name)] = ten
        elif 'patch_embed.proj.weight' in name:
            # NOTE: split Conv3D into Conv2Ds
            c1, c2, kt, kh, kw = ten.shape
            assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
            tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...]
            tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...]
        else:
            tensor_map[to_gguf_name(f"vision_model.{name}")] = ten

    for new_name, ten in tensor_map.items():
        if ten.ndim <= 1 or new_name.endswith("_norm.weight"):
            tensor_map[new_name] = ten.astype(np.float32)
        else:
            tensor_map[new_name] = ten.astype(dtype)
    tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32)  # dummy tensor, just here as a placeholder
    return tensor_map


def main(args):
    if args.data_type == 'fp32':
        dtype = torch.float32
        np_dtype = np.float32
        ftype = 0
    elif args.data_type == 'fp16':
        dtype = torch.float32
        np_dtype = np.float16
        ftype = 1
    else:
        raise ValueError()

    model_name = args.model_name
    print("model_name: ", model_name)
    qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained(
        model_name, torch_dtype=dtype, device_map="cpu"
    )
    cfg: Qwen2VLConfig = qwen2vl.config  # type: ignore[reportAssignmentType]
    vcfg = cfg.vision_config

    if os.path.isdir(model_name):
        if model_name.endswith(os.sep):
            model_name = model_name[:-1]
        model_name = os.path.basename(model_name)
    fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf"

    fout = GGUFWriter(path=fname_out, arch="clip")
    fout.add_description("image encoder for Qwen2VL")

    fout.add_file_type(ftype)
    fout.add_bool("clip.has_text_encoder", False)
    fout.add_bool("clip.has_vision_encoder", True)
    fout.add_bool("clip.has_qwen2vl_merger", True)
    fout.add_string("clip.projector_type", "qwen2vl_merger")

    print(cfg.vision_config)
    if 'silu' in cfg.vision_config.hidden_act.lower():
        fout.add_bool("clip.use_silu", True)
        fout.add_bool("clip.use_gelu", False)
    elif 'gelu' in cfg.vision_config.hidden_act.lower():
        fout.add_bool("clip.use_silu", False)
        fout.add_bool("clip.use_gelu", 'quick' not in cfg.vision_config.hidden_act.lower())
    else:
        raise ValueError()

    tensor_map = find_vision_tensors(qwen2vl, np_dtype)
    for name, data in tensor_map.items():
        fout.add_tensor(name, data)

    fout.add_uint32("clip.vision.patch_size", vcfg.patch_size)
    fout.add_uint32("clip.vision.image_size", 14 * 40)  # some reasonable size that is divisible by (14 * 2)
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim)
    fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size)
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads)
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth)
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 0)  # not sure what this does, put 0 here as a placeholder
    fout.add_name(model_name)
    """
    HACK: Since the vision rope related parameters aren't stored in the `Qwen2VLConfig`,
          they are hardcoded in `clip_image_build_graph` in `clip.cpp`.
    """

    processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name)
    fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean)  # type: ignore[reportAttributeAccessIssue]
    fout.add_array("clip.vision.image_std", processor.image_processor.image_std)  # type: ignore[reportAttributeAccessIssue]

    fout.write_header_to_file()
    fout.write_kv_data_to_file()
    fout.write_tensors_to_file()
    fout.close()
    print("save model as: ", fname_out)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct")
    parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32")
    args = parser.parse_args()
    main(args)
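The QKV split in the surgery script relies on the fused tensor being laid out as three equal row blocks. A language-agnostic restatement of that slicing in C++ (the sizes are illustrative assumptions, not the model's real dimensions):

```cpp
#include <cassert>
#include <vector>

int main() {
    const int c = 4, cols = 2;              // illustrative: per-head rows c, 2 columns
    std::vector<float> qkv(3 * c * cols);   // fused weight, rows laid out as [q; k; v]
    for (size_t i = 0; i < qkv.size(); i++) qkv[i] = (float) i;

    // row-block slices, mirroring ten[:c], ten[c:2c], ten[2c:] in the script
    std::vector<float> wq(qkv.begin(),                qkv.begin() + c * cols);
    std::vector<float> wk(qkv.begin() + c * cols,     qkv.begin() + 2 * c * cols);
    std::vector<float> wv(qkv.begin() + 2 * c * cols, qkv.end());
    assert(wq.size() == wk.size() && wk.size() == wv.size());
    return 0;
}
```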
581  examples/llava/qwen2vl-cli.cpp  Normal file

@ -0,0 +1,581 @@
#include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
#include "ggml.h"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifndef NDEBUG
#include "ggml-alloc.h"
#include "ggml-backend.h"
#endif

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>


static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
                                     int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
    int n_embd = llama_n_embd(llama_get_model(ctx_llama));
    const int patch_size = 14 * 2;
    const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
    const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
    auto img_tokens = image_embed->n_image_pos;
    // llama_pos mrope_pos[img_tokens * 4];
    std::vector<llama_pos> mrope_pos;
    mrope_pos.resize(img_tokens * 4);

    for (int y = 0; y < ph; y++)
    {
        for (int x = 0; x < pw; x++)
        {
            int i = y * pw + x;
            mrope_pos[i]                  = *st_pos_id;
            mrope_pos[i + img_tokens]     = *st_pos_id + y;
            mrope_pos[i + img_tokens * 2] = *st_pos_id + x;
            mrope_pos[i + img_tokens * 3] = 0;
        }
    }
    *st_pos_id += std::max(pw, ph);

    int processed = 0;
    std::vector<llama_pos> batch_mrope_pos;
    batch_mrope_pos.resize(img_tokens * 4);

    for (int i = 0; i < img_tokens; i += n_batch) {
        int n_eval = img_tokens - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }

        // llama_pos batch_mrope_pos[n_eval * 4];
        std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0);
        memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos));
        memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos));
        memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
        memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));

        llama_batch batch = {
            int32_t(n_eval),                // n_tokens
            nullptr,                        // token
            (image_embed->embed+i*n_embd),  // embed
            batch_mrope_pos.data(),         // pos
            nullptr,                        // n_seq_id
            nullptr,                        // seq_id
            nullptr,                        // logits
        };

        if (llama_decode(ctx_llama, batch)) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
        processed += n_eval;
    }
    return true;
}


static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
    int N = (int) tokens.size();
    std::vector<llama_pos> pos;
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        auto batch = llama_batch_get_one(&tokens[i], n_eval);
        // TODO: add mrope pos ids somewhere else
        pos.resize(batch.n_tokens * 4);
        std::fill(pos.begin(), pos.end(), 0);
        for (int j = 0; j < batch.n_tokens * 3; j ++) {
            pos[j] = *st_pos_id + (j % batch.n_tokens);
        }
        batch.pos = pos.data();

        if (llama_decode(ctx_llama, batch)) {
            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
        *st_pos_id += n_eval;
    }
    return true;
}

static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past, int * st_pos_id) {
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    return eval_tokens(ctx_llama, tokens, 1, n_past, st_pos_id);
}

static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, int * st_pos_id, bool add_bos){
    std::string              str2     = str;
    std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
    eval_tokens(ctx_llama, embd_inp, n_batch, n_past, st_pos_id);
    return true;
}

static const char * sample(struct common_sampler * smpl,
                           struct llama_context * ctx_llama,
                           int * n_past, int * st_pos_id) {
    const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
    common_sampler_accept(smpl, id, true);
    static std::string ret;
    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
    } else {
        ret = common_token_to_piece(ctx_llama, id);
    }
    eval_id(ctx_llama, id, n_past, st_pos_id);
    return ret.c_str();
}

static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char* IMG_BASE64_TAG_END = "\">";

static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
    end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
}

static bool prompt_contains_image(const std::string& prompt) {
    size_t begin, end;
    find_image_tag_in_prompt(prompt, begin, end);
    return (begin != std::string::npos);
}

// replaces the base64 image tag in the prompt with `replacement`
static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
    size_t img_base64_str_start, img_base64_str_end;
    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
        LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
        return NULL;
    }

    auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
    auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
    auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );

    auto required_bytes = base64::required_encode_size(base64_str.size());
    auto img_bytes = std::vector<unsigned char>(required_bytes);
    base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());

    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
    if (!embed) {
        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
        return NULL;
    }

    return embed;
}

static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
    size_t begin, end;
    find_image_tag_in_prompt(prompt, begin, end);
    if (begin == std::string::npos || end == std::string::npos) {
        return prompt;
    }
    auto pre = prompt.substr(0, begin);
    auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
    return pre + replacement + post;
}

struct llava_context {
    struct clip_ctx * ctx_clip = NULL;
    struct llama_context * ctx_llama = NULL;
    struct llama_model * model = NULL;
};

static void print_usage(int, char ** argv) {
    LOG("\n example usage:\n");
    LOG("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}

static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {

    // load and preprocess the image
    llava_image_embed * embed = NULL;
    auto prompt = params->prompt;
    if (prompt_contains_image(prompt)) {
        if (!params->image.empty()) {
            LOG_INF("using base64 encoded image instead of command line image path\n");
        }
        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
        if (!embed) {
            LOG_ERR("%s: can't load image from prompt\n", __func__);
            return NULL;
        }
        params->prompt = remove_image_from_prompt(prompt);
    } else {
        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
        if (!embed) {
            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
            return NULL;
        }
    }

    return embed;
}

static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
    int n_past = 0;
    int cur_pos_id = 0;

    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;

    std::string system_prompt, user_prompt;
    size_t image_pos = prompt.find("<|vision_start|>");
    if (image_pos != std::string::npos) {
        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
        system_prompt = prompt.substr(0, image_pos);
        user_prompt = prompt.substr(image_pos + std::string("<|vision_pad|>").length());
        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    } else {
        // llava-1.5 native mode
        system_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|>";
        user_prompt = "<|vision_end|>" + prompt + "<|im_end|>\n<|im_start|>assistant\n";
        if (params->verbose_prompt) {
            auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
                LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    }

    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, true);
    if (image_embed != nullptr) {
        auto image_size = clip_get_load_image_size(ctx_llava->ctx_clip);
        qwen2vl_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past, &cur_pos_id, image_size);
    }
    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, &cur_pos_id, false);

    // generate the response

    LOG("\n");

    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
    if (!smpl) {
        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
        exit(1);
    }

    std::string response = "";
    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past, &cur_pos_id);
        response += tmp;
        if (strcmp(tmp, "</s>") == 0) break;
        if (strstr(tmp, "###")) break; // Yi-VL behavior
        LOG("%s", tmp);
        if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
        if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
        if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6

        fflush(stdout);
    }

    common_sampler_free(smpl);
    LOG("\n");
}

static struct llama_model * llava_init(common_params * params) {
    llama_backend_init();
    llama_numa_init(params->numa);

    llama_model_params model_params = common_model_params_to_llama(*params);

    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n" , __func__);
        return NULL;
    }
    return model;
}

static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
    const char * clip_path = params->mmproj.c_str();

    auto prompt = params->prompt;
    if (prompt.empty()) {
        prompt = "describe the image in detail.";
    }

    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);


    llama_context_params ctx_params = common_context_params_to_llama(*params);
    ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings

    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
        return NULL;
    }

    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));

    ctx_llava->ctx_llama = ctx_llama;
    ctx_llava->ctx_clip = ctx_clip;
    ctx_llava->model = model;
    return ctx_llava;
}

static void llava_free(struct llava_context * ctx_llava) {
    if (ctx_llava->ctx_clip) {
        clip_free(ctx_llava->ctx_clip);
        ctx_llava->ctx_clip = NULL;
    }

    llama_free(ctx_llava->ctx_llama);
    llama_free_model(ctx_llava->model);
    llama_backend_free();
}

#ifndef NDEBUG

static void debug_test_mrope_2d() {
    // 1. Initialize backend
    ggml_backend_t backend = NULL;
    std::string backend_name = "";
#ifdef GGML_USE_CUDA
    fprintf(stderr, "%s: using CUDA backend\n", __func__);
    backend = ggml_backend_cuda_init(0); // init device 0
    backend_name = "cuda";
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
    }
#endif
    // if there are no GPU backends, fall back to the CPU backend
    if (!backend) {
        backend = ggml_backend_cpu_init();
        backend_name = "cpu";
    }

    // Calculate the size needed to allocate
    size_t ctx_size = 0;
    ctx_size += 2 * ggml_tensor_overhead(); // tensors
    // no need to allocate anything else!

    // 2. Allocate `ggml_context` to store tensor data
    struct ggml_init_params params = {
        /*.mem_size   =*/ ctx_size,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 12, 30);
    ggml_set_name(inp_raw, "inp_raw");
    ggml_set_input(inp_raw);

    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 30 * 4);
    ggml_set_name(pos, "pos");
    ggml_set_input(pos);

    std::vector<float> dummy_q;
    dummy_q.resize(128 * 12 * 30);
    std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));

    std::vector<int> pos_id;
    pos_id.resize(30 * 4);
    for (int i = 0; i < 30; i ++) {
        pos_id[i]      = i;
        pos_id[i + 30] = i + 10;
        pos_id[i + 60] = i + 20;
        pos_id[i + 90] = i + 30;
    }
    int sections[4] = {32, 32, 0, 0};

    // 4. Allocate a `ggml_backend_buffer` to store all tensors
    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);

    // 5. Copy tensor data from main memory (RAM) to backend buffer
    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
    ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));

    // 6. Create a `ggml_cgraph` for mul_mat operation
    struct ggml_cgraph  * gf = NULL;
    struct ggml_context * ctx_cgraph = NULL;

    // create a temporary context to build the graph
    struct ggml_init_params params0 = {
        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
    };
    ctx_cgraph = ggml_init(params0);
    gf = ggml_new_graph(ctx_cgraph);

    struct ggml_tensor * result0 = ggml_rope_multi(
        ctx_cgraph, inp_raw, pos, nullptr,
        128/2, sections, LLAMA_ROPE_TYPE_VISION, 32768, 1000000, 1,
        0, 1, 32, 1);

    // Add "result" tensor and all of its dependencies to the cgraph
    ggml_build_forward_expand(gf, result0);

    // 7. Create a `ggml_gallocr` for cgraph computation
    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
    ggml_gallocr_alloc_graph(allocr, gf);

    // 9. Run the computation
    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
    if (ggml_backend_is_cpu(backend)) {
        ggml_backend_cpu_set_n_threads(backend, n_threads);
    }
    ggml_backend_graph_compute(backend, gf);

    // 10. Retrieve results (output tensors)
    // in this example, output tensor is always the last tensor in the graph
    struct ggml_tensor * result = result0;
    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
    float * result_data = (float *)malloc(ggml_nbytes(result));
    // because the tensor data is stored in device buffer, we need to copy it back to RAM
    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
    const std::string bin_file = "mrope_2d_" + backend_name +".bin";
    std::ofstream outFile(bin_file, std::ios::binary);

    if (outFile.is_open()) {
        outFile.write(reinterpret_cast<const char*>(result_data), ggml_nbytes(result));
        outFile.close();
        std::cout << "Data successfully written to " + bin_file << std::endl;
    } else {
        std::cerr << "Error opening file!" << std::endl;
    }

    free(result_data);
    // 11. Free memory and exit
    ggml_free(ctx_cgraph);
    ggml_gallocr_free(allocr);
    ggml_free(ctx);
    ggml_backend_buffer_free(buffer);
    ggml_backend_free(backend);
}

static void debug_dump_img_embed(struct llava_context * ctx_llava) {
    int n_embd  = llama_n_embd(llama_get_model(ctx_llava->ctx_llama));
    int ne = n_embd * 4;
    float vals[56 * 56 * 3];
    // float embd[ne];
    std::vector<float> embd;
    embd.resize(ne);

    for (int i = 0; i < 56*56; i++)
    {
        for (int c = 0; c < 3; c++)
            vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56);
    }

    clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data());

    std::ofstream outFile("img_embed.bin", std::ios::binary);
    if (outFile.is_open()) {
        outFile.write(reinterpret_cast<const char*>(embd.data()), ne * sizeof(float));

        outFile.close();
        std::cout << "Data successfully written to img_embed.bin" << std::endl;
    } else {
        std::cerr << "Error opening file!" << std::endl;
    }
}

#endif


int main(int argc, char ** argv) {
    ggml_time_init();

    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
        return 1;
    }

    common_init();

    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
        print_usage(argc, argv);
        return 1;
    }

    auto * model = llava_init(&params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
        return 1;
    }

    if (prompt_contains_image(params.prompt)) {
        auto * ctx_llava = llava_init_context(&params, model);

        auto * image_embed = load_image(ctx_llava, &params, "");

        // process the prompt
        process_prompt(ctx_llava, image_embed, &params, params.prompt);

        llama_perf_context_print(ctx_llava->ctx_llama);
        llava_image_embed_free(image_embed);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
#ifndef NDEBUG
    } else if (params.image[0].empty()) {
        auto ctx_llava = llava_init_context(&params, model);

        debug_test_mrope_2d();
        debug_dump_img_embed(ctx_llava);

        llama_perf_context_print(ctx_llava->ctx_llama);
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
#endif
    } else {
        for (auto & image : params.image) {
            auto * ctx_llava = llava_init_context(&params, model);

            auto * image_embed = load_image(ctx_llava, &params, image);
            if (!image_embed) {
                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
                return 1;
            }

            // process the prompt
            process_prompt(ctx_llava, image_embed, &params, params.prompt);

            llama_perf_context_print(ctx_llava->ctx_llama);
            llava_image_embed_free(image_embed);
            ctx_llava->model = NULL;
            llava_free(ctx_llava);
        }
    }

    llama_free_model(model);

    return 0;
}
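The per-batch repacking in `qwen2vl_eval_image_embed` must keep the four M-RoPE sections contiguous for exactly `n_eval` tokens each. Below is a small self-contained check of that slicing logic in plain C++, with no llama.cpp dependencies; the token and batch counts are arbitrary assumptions:

```cpp
#include <algorithm>
#include <cassert>
#include <cstring>
#include <vector>

int main() {
    const int img_tokens = 10, n_batch = 4;
    // full positions: four sections of img_tokens entries each
    std::vector<int> mrope_pos(img_tokens * 4);
    for (int s = 0; s < 4; s++)
        for (int i = 0; i < img_tokens; i++)
            mrope_pos[s * img_tokens + i] = s * 100 + i;

    for (int i = 0; i < img_tokens; i += n_batch) {
        int n_eval = std::min(n_batch, img_tokens - i);
        // repack: the same four sections, but only n_eval entries each,
        // so the batch sees a dense [4 * n_eval] position buffer
        std::vector<int> batch_pos(n_eval * 4);
        for (int s = 0; s < 4; s++)
            memcpy(&batch_pos[n_eval * s], &mrope_pos[img_tokens * s + i], n_eval * sizeof(int));
        for (int s = 0; s < 4; s++)
            assert(batch_pos[n_eval * s] == s * 100 + i); // section starts line up
    }
    return 0;
}
```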
@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
        std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
        chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
    }
    LOG_INF("Number of chunks: %ld\n", chunks.size());
    LOG_INF("Number of chunks: %zu\n", chunks.size());

    llama_backend_init();
    llama_numa_init(params.numa);
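The `%ld` to `%zu` change matters because `chunks.size()` returns `size_t`, which is not `long` on every platform (notably 64-bit Windows). A two-line illustration of the portable specifier:

```cpp
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> chunks(3);
    printf("Number of chunks: %zu\n", chunks.size()); // %zu matches size_t everywhere
    return 0;
}
```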
@ -1,5 +1,5 @@
set(TARGET llama-run)
add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
@ -3,5 +3,45 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.

```bash
./llama-run Meta-Llama-3.1-8B-Instruct.gguf
llama-run granite-code
...

```bash
llama-run -h
Description:
  Runs a llm

Usage:
  llama-run [options] model [prompt]

Options:
  -c, --context-size <value>
      Context size (default: 2048)
  -n, --ngl <value>
      Number of GPU layers (default: 0)
  -h, --help
      Show help message

Commands:
  model
      Model is a string with an optional prefix of
      huggingface:// (hf://), ollama://, https:// or file://.
      If no protocol is specified and a file exists in the specified
      path, file:// is assumed, otherwise if a file does not exist in
      the specified path, ollama:// is assumed. Models that are being
      pulled are downloaded with .partial extension while being
      downloaded and then renamed as the file without the .partial
      extension when complete.

Examples:
  llama-run llama3
  llama-run ollama://granite-code
  llama-run ollama://smollm:135m
  llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
  llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
  llama-run https://example.com/some-file1.gguf
  llama-run some-file2.gguf
  llama-run file://some-file3.gguf
  llama-run --ngl 99 some-file4.gguf
  llama-run --ngl 99 some-file5.gguf Hello World
...
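The protocol resolution rule described in the help text (an explicit prefix wins; an existing path means file://; a missing path falls back to ollama://) can be summarized in a few lines. The following is a hedged sketch of that rule, not the actual run.cpp implementation:

```cpp
#include <filesystem>
#include <string>

// Sketch of the model-string resolution rule from the help text above.
static std::string resolve_protocol(const std::string & model) {
    if (model.find("://") != std::string::npos) {
        return model; // explicit hf://, ollama://, https:// or file://
    }
    if (std::filesystem::exists(model)) {
        return "file://" + model; // an existing path is treated as a local file
    }
    return "ollama://" + model;   // otherwise assume an ollama model name
}
```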
@ -4,110 +4,330 @@
|
||||||
# include <unistd.h>
|
# include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <climits>
|
#if defined(LLAMA_USE_CURL)
|
||||||
|
# include <curl/curl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <cstdarg>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <filesystem>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <unordered_map>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include "json.hpp"
|
||||||
#include "llama-cpp.h"
|
#include "llama-cpp.h"
|
||||||
|
|
||||||
typedef std::unique_ptr<char[]> char_array_ptr;
|
#define printe(...) \
|
||||||
|
do { \
|
||||||
|
fprintf(stderr, __VA_ARGS__); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
struct Argument {
|
class Opt {
|
||||||
std::string flag;
|
|
||||||
std::string help_text;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct Options {
|
|
||||||
std::string model_path, prompt_non_interactive;
|
|
||||||
int ngl = 99;
|
|
||||||
int n_ctx = 2048;
|
|
||||||
};
|
|
||||||
|
|
||||||
class ArgumentParser {
|
|
||||||
public:
|
public:
|
||||||
ArgumentParser(const char * program_name) : program_name(program_name) {}
|
int init(int argc, const char ** argv) {
|
||||||
|
construct_help_str_();
|
||||||
void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") {
|
// Parse arguments
|
||||||
string_args[flag] = &var;
|
if (parse(argc, argv)) {
|
||||||
arguments.push_back({flag, help_text});
|
printe("Error: Failed to parse arguments.\n");
|
||||||
|
help();
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
void add_argument(const std::string & flag, int & var, const std::string & help_text = "") {
|
// If help is requested, show help and exit
|
||||||
int_args[flag] = &var;
|
if (help_) {
|
||||||
arguments.push_back({flag, help_text});
|
help();
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0; // Success
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string model_;
|
||||||
|
std::string user_;
|
||||||
|
int context_size_ = 2048, ngl_ = -1;
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::string help_str_;
|
||||||
|
bool help_ = false;
|
||||||
|
|
||||||
|
void construct_help_str_() {
|
||||||
|
help_str_ =
|
||||||
|
"Description:\n"
|
||||||
|
" Runs a llm\n"
|
||||||
|
"\n"
|
||||||
|
"Usage:\n"
|
||||||
|
" llama-run [options] model [prompt]\n"
|
||||||
|
"\n"
|
||||||
|
"Options:\n"
|
||||||
|
" -c, --context-size <value>\n"
|
||||||
|
" Context size (default: " +
|
||||||
|
std::to_string(context_size_);
|
||||||
|
help_str_ +=
|
||||||
|
")\n"
|
||||||
|
" -n, --ngl <value>\n"
|
||||||
|
" Number of GPU layers (default: " +
|
||||||
|
std::to_string(ngl_);
|
||||||
|
help_str_ +=
|
||||||
|
")\n"
|
||||||
|
" -h, --help\n"
|
||||||
|
" Show help message\n"
|
||||||
|
"\n"
|
||||||
|
"Commands:\n"
|
||||||
|
" model\n"
|
||||||
|
" Model is a string with an optional prefix of \n"
|
||||||
|
" huggingface:// (hf://), ollama://, https:// or file://.\n"
|
||||||
|
" If no protocol is specified and a file exists in the specified\n"
|
||||||
|
" path, file:// is assumed, otherwise if a file does not exist in\n"
|
||||||
|
" the specified path, ollama:// is assumed. Models that are being\n"
|
||||||
|
" pulled are downloaded with .partial extension while being\n"
|
||||||
|
" downloaded and then renamed as the file without the .partial\n"
|
||||||
|
" extension when complete.\n"
|
||||||
|
"\n"
|
||||||
|
"Examples:\n"
|
||||||
|
" llama-run llama3\n"
|
||||||
|
" llama-run ollama://granite-code\n"
|
||||||
|
" llama-run ollama://smollm:135m\n"
|
||||||
|
" llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n"
|
||||||
|
" llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n"
|
||||||
|
" llama-run https://example.com/some-file1.gguf\n"
|
||||||
|
" llama-run some-file2.gguf\n"
|
||||||
|
" llama-run file://some-file3.gguf\n"
|
||||||
|
" llama-run --ngl 99 some-file4.gguf\n"
|
||||||
|
" llama-run --ngl 99 some-file5.gguf Hello World\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
int parse(int argc, const char ** argv) {
|
int parse(int argc, const char ** argv) {
|
||||||
|
int positional_args_i = 0;
|
||||||
for (int i = 1; i < argc; ++i) {
|
for (int i = 1; i < argc; ++i) {
|
||||||
std::string arg = argv[i];
|
if (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0) {
|
||||||
if (string_args.count(arg)) {
|
if (i + 1 >= argc) {
|
||||||
if (i + 1 < argc) {
|
|
||||||
*string_args[arg] = argv[++i];
|
|
||||||
} else {
|
|
||||||
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
|
|
||||||
print_usage();
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
} else if (int_args.count(arg)) {
|
|
||||||
if (i + 1 < argc) {
|
context_size_ = std::atoi(argv[++i]);
|
||||||
if (parse_int_arg(argv[++i], *int_args[arg]) != 0) {
|
} else if (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0) {
|
||||||
fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]);
|
if (i + 1 >= argc) {
|
||||||
print_usage();
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ngl_ = std::atoi(argv[++i]);
|
||||||
|
} else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
|
||||||
|
help_ = true;
|
||||||
|
return 0;
|
||||||
|
} else if (!positional_args_i) {
|
||||||
|
++positional_args_i;
|
||||||
|
model_ = argv[i];
|
||||||
|
} else if (positional_args_i == 1) {
|
||||||
|
++positional_args_i;
|
||||||
|
user_ = argv[i];
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
|
user_ += " " + std::string(argv[i]);
|
||||||
print_usage();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
|
|
||||||
print_usage();
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (string_args["-m"]->empty()) {
|
return model_.empty(); // model_ is the only required value
|
||||||
fprintf(stderr, "error: -m is required\n");
|
}
|
||||||
print_usage();
|
|
||||||
|
void help() const { printf("%s", help_str_.c_str()); }
|
||||||
|
};
|
||||||
|
|
||||||
|
struct progress_data {
|
||||||
|
size_t file_size = 0;
|
||||||
|
std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
|
||||||
|
bool printed = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct FileDeleter {
|
||||||
|
void operator()(FILE * file) const {
|
||||||
|
if (file) {
|
||||||
|
fclose(file);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef std::unique_ptr<FILE, FileDeleter> FILE_ptr;
|
||||||
|
|
||||||
|
#ifdef LLAMA_USE_CURL
|
||||||
|
class CurlWrapper {
|
||||||
|
public:
|
||||||
|
int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
|
||||||
|
const bool progress, std::string * response_str = nullptr) {
|
||||||
|
std::string output_file_partial;
|
||||||
|
curl = curl_easy_init();
|
||||||
|
if (!curl) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
progress_data data;
|
||||||
|
FILE_ptr out;
|
||||||
|
if (!output_file.empty()) {
|
||||||
|
output_file_partial = output_file + ".partial";
|
||||||
|
out.reset(fopen(output_file_partial.c_str(), "ab"));
|
||||||
|
}
|
||||||
|
|
||||||
|
set_write_options(response_str, out);
|
||||||
|
data.file_size = set_resume_point(output_file_partial);
|
||||||
|
set_progress_options(progress, data);
|
||||||
|
set_headers(headers);
|
||||||
|
perform(url);
|
||||||
|
if (!output_file.empty()) {
|
||||||
|
std::filesystem::rename(output_file_partial, output_file);
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
~CurlWrapper() {
|
||||||
|
if (chunk) {
|
||||||
|
curl_slist_free_all(chunk);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (curl) {
|
||||||
|
curl_easy_cleanup(curl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
const char * program_name;
|
CURL * curl = nullptr;
|
||||||
std::unordered_map<std::string, std::string *> string_args;
|
struct curl_slist * chunk = nullptr;
|
||||||
std::unordered_map<std::string, int *> int_args;
|
|
||||||
std::vector<Argument> arguments;
|
|
||||||
|
|
||||||
int parse_int_arg(const char * arg, int & value) {
|
void set_write_options(std::string * response_str, const FILE_ptr & out) {
|
||||||
char * end;
|
if (response_str) {
|
||||||
const long val = std::strtol(arg, &end, 10);
|
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data);
|
||||||
if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) {
|
curl_easy_setopt(curl, CURLOPT_WRITEDATA, response_str);
|
||||||
value = static_cast<int>(val);
|
} else {
|
||||||
|
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.get());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t set_resume_point(const std::string & output_file) {
|
||||||
|
size_t file_size = 0;
|
||||||
|
if (std::filesystem::exists(output_file)) {
|
||||||
|
file_size = std::filesystem::file_size(output_file);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_RESUME_FROM_LARGE, static_cast<curl_off_t>(file_size));
|
||||||
|
}
|
||||||
|
|
||||||
|
return file_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_progress_options(bool progress, progress_data & data) {
|
||||||
|
if (progress) {
|
||||||
|
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_headers(const std::vector<std::string> & headers) {
|
||||||
|
if (!headers.empty()) {
|
||||||
|
if (chunk) {
|
||||||
|
curl_slist_free_all(chunk);
|
||||||
|
chunk = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto & header : headers) {
|
||||||
|
chunk = curl_slist_append(chunk, header.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void perform(const std::string & url) {
|
||||||
|
CURLcode res;
|
||||||
|
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
||||||
|
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https");
|
||||||
|
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
|
||||||
|
res = curl_easy_perform(curl);
|
||||||
|
if (res != CURLE_OK) {
|
||||||
|
printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string human_readable_time(double seconds) {
|
||||||
|
int hrs = static_cast<int>(seconds) / 3600;
|
||||||
|
int mins = (static_cast<int>(seconds) % 3600) / 60;
|
||||||
|
int secs = static_cast<int>(seconds) % 60;
|
||||||
|
|
||||||
|
std::ostringstream out;
|
||||||
|
if (hrs > 0) {
|
||||||
|
out << hrs << "h " << std::setw(2) << std::setfill('0') << mins << "m " << std::setw(2) << std::setfill('0')
|
||||||
|
<< secs << "s";
|
||||||
|
} else if (mins > 0) {
|
||||||
|
out << mins << "m " << std::setw(2) << std::setfill('0') << secs << "s";
|
||||||
|
} else {
|
||||||
|
out << secs << "s";
|
||||||
|
}
|
||||||
|
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
    static std::string human_readable_size(curl_off_t size) {
        static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" };
        const int length   = sizeof(suffix) / sizeof(suffix[0]);
        int       i        = 0;
        double    dbl_size = size;
        if (size > 1024) {
            for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) {
                dbl_size = size / 1024.0;
            }
        }

        std::ostringstream out;
        out << std::fixed << std::setprecision(2) << dbl_size << " " << suffix[i];
        return out.str();
    }

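    // Quick sanity check of the two formatting helpers above (illustrative values, not from the patch):
    //   human_readable_size(1536)   -> "1.50 KB"
    //   human_readable_time(3725.0) -> "1h 02m 05s"
    //   human_readable_time(125.0)  -> "2m 05s"
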
    static int progress_callback(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
                                 curl_off_t) {
        progress_data * data = static_cast<progress_data *>(ptr);
        if (total_to_download <= 0) {
            return 0;
        }

        total_to_download += data->file_size;
        const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size;
        const curl_off_t percentage = (now_downloaded_plus_file_size * 100) / total_to_download;
        const curl_off_t pos = (percentage / 5);
        std::string progress_bar;
        for (int i = 0; i < 20; ++i) {
            progress_bar.append((i < pos) ? "█" : " ");
        }

        // Calculate download speed and estimated time to completion
        const auto now = std::chrono::steady_clock::now();
        const std::chrono::duration<double> elapsed_seconds = now - data->start_time;
        const double speed = now_downloaded / elapsed_seconds.count();
        const double estimated_time = (total_to_download - now_downloaded) / speed;
        printe("\r%ld%% |%s| %s/%s %.2f MB/s %s ", percentage, progress_bar.c_str(),
               human_readable_size(now_downloaded).c_str(), human_readable_size(total_to_download).c_str(),
               speed / (1024 * 1024), human_readable_time(estimated_time).c_str());
        fflush(stderr);
        data->printed = true;

        return 0;
    }

    // Function to write data to a file
    static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
        FILE * out = static_cast<FILE *>(stream);
        return fwrite(ptr, size, nmemb, out);
    }

    // Function to capture data into a string
    static size_t capture_data(void * ptr, size_t size, size_t nmemb, void * stream) {
        std::string * str = static_cast<std::string *>(stream);
        str->append(static_cast<char *>(ptr), size * nmemb);
        return size * nmemb;
    }
};

#endif
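
Putting the wrapper together: `init()` configures the write target, resume offset, headers and progress reporting, then drives the whole transfer itself, so callers only check its return value (that is how the `download()` helper below uses it). A minimal usage sketch — the `fetch` helper is illustrative, not part of this patch:

    static int fetch(const std::string & url, const std::string & out_path) {
        CurlWrapper curl;
        // init() runs the full transfer; non-zero indicates failure
        if (curl.init(url, {}, out_path, /*progress=*/true, nullptr)) {
            return 1;
        }
        return 0;
    }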

class LlamaData {
  public:
@ -115,14 +335,16 @@ class LlamaData {
    llama_sampler_ptr               sampler;
    llama_context_ptr               context;
    std::vector<llama_chat_message> messages;
    std::vector<std::string>        msg_strs;
    std::vector<char>               fmtted;

    int init(const Options & opt) {
    int init(Opt & opt) {
        model = initialize_model(opt.model_path, opt.ngl);
        model = initialize_model(opt);
        if (!model) {
            return 1;
        }

        context = initialize_context(model, opt.n_ctx);
        context = initialize_context(model, opt.context_size_);
        if (!context) {
            return 1;
        }
@ -132,14 +354,122 @@ class LlamaData {
    }

  private:
#ifdef LLAMA_USE_CURL
    int download(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
                 const bool progress, std::string * response_str = nullptr) {
        CurlWrapper curl;
        if (curl.init(url, headers, output_file, progress, response_str)) {
            return 1;
        }

        return 0;
    }
#else
    int download(const std::string &, const std::vector<std::string> &, const std::string &, const bool,
                 std::string * = nullptr) {
        printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
        return 1;
    }
#endif

    int huggingface_dl(const std::string & model, const std::vector<std::string> headers, const std::string & bn) {
        // Find the second occurrence of '/' after the protocol string
        size_t pos = model.find('/');
        pos        = model.find('/', pos + 1);
        if (pos == std::string::npos) {
            return 1;
        }

        const std::string hfr = model.substr(0, pos);
        const std::string hff = model.substr(pos + 1);
        const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
        return download(url, headers, bn, true);
    }
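
For an illustrative reference such as `user/repo/file.gguf` (a placeholder, not a real model), the split above yields:

    // hfr = "user/repo"    -- everything before the second '/'
    // hff = "file.gguf"    -- everything after it
    // url = "https://huggingface.co/user/repo/resolve/main/file.gguf"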
    int ollama_dl(std::string & model, const std::vector<std::string> headers, const std::string & bn) {
        if (model.find('/') == std::string::npos) {
            model = "library/" + model;
        }

        std::string model_tag = "latest";
        size_t      colon_pos = model.find(':');
        if (colon_pos != std::string::npos) {
            model_tag = model.substr(colon_pos + 1);
            model     = model.substr(0, colon_pos);
        }

        std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag;
        std::string manifest_str;
        const int   ret = download(manifest_url, headers, "", false, &manifest_str);
        if (ret) {
            return ret;
        }

        nlohmann::json manifest = nlohmann::json::parse(manifest_str);
        std::string    layer;
        for (const auto & l : manifest["layers"]) {
            if (l["mediaType"] == "application/vnd.ollama.image.model") {
                layer = l["digest"];
                break;
            }
        }

        std::string blob_url = "https://registry.ollama.ai/v2/" + model + "/blobs/" + layer;
        return download(blob_url, headers, bn, true);
    }
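
The manifest shape this parsing assumes (reconstructed from the lookup above; the digest value is illustrative):

    // {
    //   "layers": [
    //     { "mediaType": "application/vnd.ollama.image.model", "digest": "sha256:..." },
    //     ...
    //   ]
    // }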
    std::string basename(const std::string & path) {
        const size_t pos = path.find_last_of("/\\");
        if (pos == std::string::npos) {
            return path;
        }

        return path.substr(pos + 1);
    }

    int remove_proto(std::string & model_) {
        const std::string::size_type pos = model_.find("://");
        if (pos == std::string::npos) {
            return 1;
        }

        model_ = model_.substr(pos + 3);  // Skip past "://"
        return 0;
    }

    int resolve_model(std::string & model_) {
        const std::string              bn      = basename(model_);
        const std::vector<std::string> headers = { "--header",
                                                   "Accept: application/vnd.docker.distribution.manifest.v2+json" };
        int                            ret     = 0;
        if (string_starts_with(model_, "file://") || std::filesystem::exists(bn)) {
            remove_proto(model_);
        } else if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
            remove_proto(model_);
            ret = huggingface_dl(model_, headers, bn);
        } else if (string_starts_with(model_, "ollama://")) {
            remove_proto(model_);
            ret = ollama_dl(model_, headers, bn);
        } else if (string_starts_with(model_, "https://")) {
            download(model_, headers, bn, true);
        } else {
            ret = ollama_dl(model_, headers, bn);
        }

        model_ = bn;

        return ret;
    }
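
Resolution examples implied by the branches above (model names are illustrative):

    // "file://models/x.gguf"       -> local file, protocol stripped
    // "hf://user/repo/x.gguf"      -> downloaded from Hugging Face
    // "ollama://granite-code"      -> pulled from registry.ollama.ai, tag "latest"
    // "https://example.com/x.gguf" -> plain HTTPS download
    // "granite-code"               -> bare names also go through the Ollama path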

    // Initializes the model and returns a unique pointer to it
    llama_model_ptr initialize_model(const std::string & model_path, const int ngl) {
    llama_model_ptr initialize_model(Opt & opt) {
        ggml_backend_load_all();
        llama_model_params model_params = llama_model_default_params();
        model_params.n_gpu_layers = ngl;
        model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers;
        resolve_model(opt.model_);
        llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params));
        llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params));
        if (!model) {
            fprintf(stderr, "%s: error: unable to load model\n", __func__);
            printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
        }

        return model;
@ -150,10 +480,9 @@ class LlamaData {
        llama_context_params ctx_params = llama_context_default_params();
        ctx_params.n_ctx                = n_ctx;
        ctx_params.n_batch              = n_ctx;

        llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
        if (!context) {
            fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
            printe("%s: error: failed to create the llama_context\n", __func__);
        }

        return context;
@ -170,23 +499,22 @@ class LlamaData {
    }
};

// Add a message to `messages` and store its content in `owned_content`
// Add a message to `messages` and store its content in `msg_strs`
static void add_message(const char * role, const std::string & text, LlamaData & llama_data,
                        std::vector<char_array_ptr> & owned_content) {
static void add_message(const char * role, const std::string & text, LlamaData & llama_data) {
    char_array_ptr content(new char[text.size() + 1]);
    std::strcpy(content.get(), text.c_str());
    llama_data.messages.push_back({role, content.get()});
    owned_content.push_back(std::move(content));
    llama_data.msg_strs.push_back(std::move(text));
    llama_data.messages.push_back({ role, llama_data.msg_strs.back().c_str() });
}

// Function to apply the chat template and resize `formatted` if needed
static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) {
static int apply_chat_template(LlamaData & llama_data, const bool append) {
    int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
    int result = llama_chat_apply_template(
        llama_data.model.get(), nullptr, llama_data.messages.data(), llama_data.messages.size(), append,
        append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
    if (result > static_cast<int>(formatted.size())) {
        formatted.resize(result);
    if (append && result > static_cast<int>(llama_data.fmtted.size())) {
        llama_data.fmtted.resize(result);
        result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
                                           llama_data.messages.size(), append, formatted.data(), formatted.size());
                                           llama_data.messages.size(), append, llama_data.fmtted.data(),
                                           llama_data.fmtted.size());
    }

    return result;
@ -199,7 +527,8 @@ static int tokenize_prompt(const llama_model_ptr & model, const std::string & pr
    prompt_tokens.resize(n_prompt_tokens);
    if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
                       true) < 0) {
        GGML_ABORT("failed to tokenize the prompt\n");
        printe("failed to tokenize the prompt\n");
        return -1;
    }

    return n_prompt_tokens;
@ -211,7 +540,7 @@ static int check_context_size(const llama_context_ptr & ctx, const llama_batch &
    const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
    if (n_ctx_used + batch.n_tokens > n_ctx) {
        printf("\033[0m\n");
        fprintf(stderr, "context size exceeded\n");
        printe("context size exceeded\n");
        return 1;
    }
@ -223,7 +552,8 @@ static int convert_token_to_string(const llama_model_ptr & model, const llama_to
    char buf[256];
    int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
    if (n < 0) {
        GGML_ABORT("failed to convert token to piece\n");
        printe("failed to convert token to piece\n");
        return 1;
    }

    piece = std::string(buf, n);
@ -238,19 +568,19 @@ static void print_word_and_concatenate_to_response(const std::string & piece, st

// helper function to evaluate a prompt and generate a response
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
    std::vector<llama_token> prompt_tokens;
    const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens);
    if (n_prompt_tokens < 0) {
    std::vector<llama_token> tokens;
    if (tokenize_prompt(llama_data.model, prompt, tokens) < 0) {
        return 1;
    }

    // prepare a batch for the prompt
    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
    llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
    llama_token new_token_id;
    while (true) {
        check_context_size(llama_data.context, batch);
        if (llama_decode(llama_data.context.get(), batch)) {
            GGML_ABORT("failed to decode\n");
            printe("failed to decode\n");
            return 1;
        }

        // sample the next token, check is it an end of generation?
@ -273,22 +603,9 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
    return 0;
}

static int parse_arguments(const int argc, const char ** argv, Options & opt) {
    ArgumentParser parser(argv[0]);
    parser.add_argument("-m", opt.model_path, "model");
    parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
    parser.add_argument("-c", opt.n_ctx, "context_size");
    parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
    if (parser.parse(argc, argv)) {
        return 1;
    }

    return 0;
}

static int read_user_input(std::string & user) {
    std::getline(std::cin, user);
    return user.empty();  // Indicate an error or empty input
    return user.empty();  // Should have data in happy path
}

// Function to generate a response based on the prompt
@ -296,7 +613,7 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
    // Set response color
    printf("\033[33m");
    if (generate(llama_data, prompt, response)) {
        fprintf(stderr, "failed to generate response\n");
        printe("failed to generate response\n");
        return 1;
    }
@ -306,11 +623,10 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
}

// Helper function to apply the chat template and handle errors
static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted,
                                                   const bool is_user_input, int & output_length) {
    const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) {
    const int new_len = apply_chat_template(llama_data, append);
    if (new_len < 0) {
        fprintf(stderr, "failed to apply the chat template\n");
        printe("failed to apply the chat template\n");
        return -1;
    }
@ -319,49 +635,56 @@ static int apply_chat_template_with_error_handling(const LlamaData & llama_data,
}

// Helper function to handle user input
static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) {
static int handle_user_input(std::string & user_input, const std::string & user_) {
    if (!prompt_non_interactive.empty()) {
    if (!user_.empty()) {
        user_input = prompt_non_interactive;
        user_input = user_;
        return true;  // No need for interactive input
        return 0;     // No need for interactive input
    }

    printf("\033[32m> \033[0m");
    return !read_user_input(user_input);  // Returns false if input ends the loop
    printf(
        "\r "
        "\r\033[32m> \033[0m");
    return read_user_input(user_input);  // Returns true if input ends the loop
}

// Function to tokenize the prompt
static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) {
static int chat_loop(LlamaData & llama_data, const std::string & user_) {
    std::vector<char_array_ptr> owned_content;
    std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
    int prev_len = 0;
    llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
    while (true) {
        // Get user input
        std::string user_input;
        if (!handle_user_input(user_input, prompt_non_interactive)) {
            break;
        }
        while (handle_user_input(user_input, user_)) {
        }

        add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data,
                    owned_content);
        add_message("user", user_.empty() ? user_input : user_, llama_data);
        int new_len;
        if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) {
        if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
            return 1;
        }

        std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len);
        std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len);
        std::string response;
        if (generate_response(llama_data, prompt, response)) {
            return 1;
        }

        if (!user_.empty()) {
            break;
        }

        add_message("assistant", response, llama_data);
        if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) {
            return 1;
        }
    }

    return 0;
}

static void log_callback(const enum ggml_log_level level, const char * text, void *) {
    if (level == GGML_LOG_LEVEL_ERROR) {
        fprintf(stderr, "%s", text);
        printe("%s", text);
    }
}

@ -382,17 +705,20 @@ static std::string read_pipe_data() {
}

int main(int argc, const char ** argv) {
    Options opt;
    Opt opt;
    if (parse_arguments(argc, argv, opt)) {
    const int ret = opt.init(argc, argv);
    if (ret == 2) {
        return 0;
    } else if (ret) {
        return 1;
    }

    if (!is_stdin_a_terminal()) {
        if (!opt.prompt_non_interactive.empty()) {
        if (!opt.user_.empty()) {
            opt.prompt_non_interactive += "\n\n";
            opt.user_ += "\n\n";
        }

        opt.prompt_non_interactive += read_pipe_data();
        opt.user_ += read_pipe_data();
    }

    llama_log_set(log_callback, nullptr);
@ -401,7 +727,7 @@ int main(int argc, const char ** argv) {
        return 1;
    }

    if (chat_loop(llama_data, opt.prompt_non_interactive)) {
    if (chat_loop(llama_data, opt.user_)) {
        return 1;
    }

@ -62,8 +62,8 @@ The project is under active development, and we are [looking for feedback and co
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
@ -138,6 +138,7 @@ The project is under active development, and we are [looking for feedback and co
| -------- | ----------- |
| `--no-context-shift` | disables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
| `-sp, --special` | special tokens output enabled (default: false) |
| `--no-warmup` | skip warming up the model with an empty run |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
@ -146,7 +147,7 @@ The project is under active development, and we are [looking for feedback and co
| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
| `--no-webui` | disable the Web UI<br/>(env: LLAMA_ARG_NO_WEBUI) |
| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
@ -164,13 +165,13 @@ The project is under active development, and we are [looking for feedback and co
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>list of built-in templates:<br/>chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, exaone3, gemma, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, monarch, openchat, orion, phi3, rwkv-world, vicuna, vicuna-orca, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16) |
| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5) |
| `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 5)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9) |
| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.9)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model) |
| `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |
| `-devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br/>use --list-devices to see a list of available devices |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | number of layers to store in VRAM for the draft model<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |

Note: If both a command line argument and an environment variable are set for the same param, the argument will take precedence over the env var.
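For example, the quantized-cache and speculative-decoding options above combine into an invocation such as `llama-server -m model.gguf -ctk q8_0 -ctv q8_0 --draft-max 16 --draft-min 5` (the model path is a placeholder).
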
@ -1079,9 +1079,9 @@ struct server_slot {

        SLT_INF(*this,
                "\n"
                "\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
                "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
                "\r eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
                " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n"
                "\r total time = %10.2f ms / %5d tokens\n",
                " total time = %10.2f ms / %5d tokens\n",
                t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
                t_token_generation, n_decoded, t_gen, n_gen_second,
                t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
@ -394,7 +394,7 @@ int main(int raw_argc, char ** raw_argv) {
    }

    if (show_token_count) {
        printf("Total number of tokens: %ld\n", tokens.size());
        printf("Total number of tokens: %zu\n", tokens.size());
    }
    // silence valgrind
    llama_free(ctx);
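(`%zu` is the portable conversion for `size_t`; `%ld` is only correct where `size_t` is `long`, which breaks on LLP64 targets such as 64-bit Windows.)
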
@ -32,6 +32,13 @@ else()
    endif()
endif()

# remove the lib prefix on win32 mingw
if (WIN32)
    set(CMAKE_STATIC_LIBRARY_PREFIX "")
    set(CMAKE_SHARED_LIBRARY_PREFIX "")
    set(CMAKE_SHARED_MODULE_PREFIX  "")
endif()

option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
@ -172,6 +179,11 @@ set (GGML_SYCL_TARGET "INTEL" CACHE STRING
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
                                            "ggml: sycl device architecture")

option(GGML_OPENCL                    "ggml: use OpenCL"                                 OFF)
option(GGML_OPENCL_PROFILING          "ggml: use OpenCL profiling (increases overhead)"  OFF)
option(GGML_OPENCL_EMBED_KERNELS      "ggml: embed kernels"                              ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno"           ON)

# extra artifacts
option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
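With these options in place, the backend is enabled at configure time with, e.g., `cmake -B build -DGGML_OPENCL=ON`; the profiling, kernel-embedding and Adreno options only need to be passed explicitly to override their defaults shown above.
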
26  ggml/include/ggml-opencl.h  (new file)
@ -0,0 +1,26 @@
#ifndef GGML_OPENCL_H
#define GGML_OPENCL_H

#include "ggml.h"
#include "ggml-backend.h"

#ifdef  __cplusplus
extern "C" {
#endif

//
// backend API
//
GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);

GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);

GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);

#ifdef  __cplusplus
}
#endif

#endif // GGML_OPENCL_H
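
A minimal sketch of exercising this header directly (error handling reduced to a status code; assumes a build configured with GGML_OPENCL=ON):

    #include "ggml-opencl.h"

    static int check_opencl(void) {
        ggml_backend_t backend = ggml_backend_opencl_init();
        if (backend == NULL || !ggml_backend_is_opencl(backend)) {
            return 1;  // no usable OpenCL backend
        }
        ggml_backend_free(backend);  // ggml_backend_free() comes from ggml-backend.h
        return 0;
    }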
@ -238,6 +238,8 @@
#define GGML_EXIT_ABORTED 1

#define GGML_ROPE_TYPE_NEOX   2
#define GGML_ROPE_TYPE_MROPE  8
#define GGML_ROPE_TYPE_VISION 24

#define GGUF_MAGIC "GGUF"
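Note that `GGML_ROPE_TYPE_VISION` (24) has the `GGML_ROPE_TYPE_MROPE` bit (8) set, so the vision layout is detected as a special case of mrope in the compute paths further down.
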
|
@ -1443,6 +1445,22 @@ extern "C" {
|
||||||
float beta_fast,
|
float beta_fast,
|
||||||
float beta_slow);
|
float beta_slow);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_rope_multi(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
struct ggml_tensor * c,
|
||||||
|
int n_dims,
|
||||||
|
int sections[4],
|
||||||
|
int mode,
|
||||||
|
int n_ctx_orig,
|
||||||
|
float freq_base,
|
||||||
|
float freq_scale,
|
||||||
|
float ext_factor,
|
||||||
|
float attn_factor,
|
||||||
|
float beta_fast,
|
||||||
|
float beta_slow);
|
||||||
|
|
||||||
// in-place, returns view(a)
|
// in-place, returns view(a)
|
||||||
GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
|
GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
|
|
|
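
An illustrative call of the new multi-section rope declared above (the tensor names and the 16/16/16 section split are assumptions, not from this patch):

    // `pos` must supply four position streams (t, h, w, extra) of ne2 entries each,
    // as consumed by the CPU implementation further below.
    int sections[4] = { 16, 16, 16, 0 };
    struct ggml_tensor * rotated = ggml_rope_multi(
            ctx, cur, pos, NULL, n_dims, sections, GGML_ROPE_TYPE_MROPE,
            n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
            beta_fast, beta_slow);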
@ -194,11 +194,6 @@ endif()

if (WIN32)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)

    if (BUILD_SHARED_LIBS)
        # TODO: should not use this
        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
    endif()
endif()

# ggml
@ -313,6 +308,7 @@ ggml_add_backend(MUSA)
ggml_add_backend(RPC)
ggml_add_backend(SYCL)
ggml_add_backend(Vulkan)
ggml_add_backend(OpenCL)

foreach (target ggml-base ggml)
    target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
@ -46,6 +46,10 @@
#include "ggml-vulkan.h"
#endif

#ifdef GGML_USE_OPENCL
#include "ggml-opencl.h"
#endif

#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
@ -146,6 +150,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_VULKAN
        register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_OPENCL
        register_backend(ggml_backend_opencl_reg());
#endif
#ifdef GGML_USE_CANN
        register_backend(ggml_backend_cann_reg());
#endif
@ -473,7 +480,8 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
        if (!fs::exists(search_path)) {
            continue;
        }
        for (const auto & entry : fs::directory_iterator(search_path)) {
        fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
        for (const auto & entry : dir_it) {
            if (entry.is_regular_file()) {
                std::string filename = entry.path().filename().string();
                std::string ext      = entry.path().extension().string();
@ -538,6 +546,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
    ggml_backend_load_best("rpc", silent, dir_path);
    ggml_backend_load_best("sycl", silent, dir_path);
    ggml_backend_load_best("vulkan", silent, dir_path);
    ggml_backend_load_best("opencl", silent, dir_path);
    ggml_backend_load_best("musa", silent, dir_path);
    ggml_backend_load_best("cpu", silent, dir_path);
}
@ -1747,6 +1747,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
                if (*ext_factor != 0) {
                    return false;
                }

                const int mode = ((const int32_t *) op->op_params)[2];
                if (mode & GGML_ROPE_TYPE_MROPE) {
                    return false;
                }
                if (mode & GGML_ROPE_TYPE_VISION) {
                    return false;
                }

                return true;
            }
        case GGML_OP_UPSCALE: {
@ -122,7 +122,7 @@ static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_ty
}

static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
    void * data = ggml_aligned_malloc(size);
    if (data == NULL) {
        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
        return NULL;
@ -126,8 +126,7 @@ struct ggml_arm_arch_features_type {
#endif
#include <windows.h>

#if defined(_MSC_VER) && !defined(__clang__)
#if !defined(__clang__)
#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))

typedef volatile LONG atomic_int;
@ -469,7 +468,7 @@ const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type
    for (int i = 0; i < offset; ++i) {                    \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);        \
    }                                                     \
    (res) = GGML_F32x4_REDUCE_ONE((x)[0]);                \
    (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]);   \
}

#define GGML_F32_VEC GGML_F32x4
@ -2396,7 +2395,7 @@ static void ggml_init_arm_arch_features(void) {
    uint32_t hwcap2 = getauxval(AT_HWCAP2);

    ggml_arm_arch_features.has_neon    = !!(hwcap & HWCAP_ASIMD);
    ggml_arm_arch_features.has_dotprod = !!(hwcap && HWCAP_ASIMDDP);
    ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
    ggml_arm_arch_features.has_i8mm    = !!(hwcap2 & HWCAP2_I8MM);
    ggml_arm_arch_features.has_sve     = !!(hwcap & HWCAP_SVE);
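(The logical `&&` made `has_dotprod` depend only on `hwcap` being non-zero; the bitwise `&` actually tests the `HWCAP_ASIMDDP` bit.)
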
@ -9134,6 +9133,64 @@ static void ggml_rope_cache_init(
    }
}

static void ggml_mrope_cache_init(
     float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
     float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
     float * cache, float sin_sign, float theta_scale) {
    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
    float theta_t = theta_base_t;
    float theta_h = theta_base_h;
    float theta_w = theta_base_w;
    float theta_e = theta_base_e;  // extra position id for vision encoder
    int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
    int sec_w     = sections[1] + sections[0];
    int sec_e     = sections[2] + sec_w;
    GGML_ASSERT(sect_dims <= ne0);

    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
        const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;

        int sector = (i0 / 2) % sect_dims;
        if (indep_sects) {
            // compute theta independently for each dim section
            // (i.e. reset the corresponding theta when `i0` goes from one section to another)
            if (sector == 0) {
                theta_t = theta_base_t;
            }
            else if (sector == sections[0]) {
                theta_h = theta_base_h;
            }
            else if (sector == sec_w) {
                theta_w = theta_base_w;
            }
            else if (sector == sec_e) {
                theta_e = theta_base_e;
            }
        }

        float theta = theta_t;
        if (sector >= sections[0] && sector < sec_w) {
            theta = theta_h;
        }
        else if (sector >= sec_w && sector < sec_w + sections[2]) {
            theta = theta_w;
        }
        else if (sector >= sec_w + sections[2]) {
            theta = theta_e;
        }

        rope_yarn(
            theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
        );
        cache[i0 + 1] *= sin_sign;

        theta_t *= theta_scale;
        theta_w *= theta_scale;
        theta_h *= theta_scale;
        theta_e *= theta_scale;
    }
}

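The sector bookkeeping above maps channel pairs to position streams as follows (a summary of the code, with sections = {t, h, w, e}):

    // sector = (i0/2) % sect_dims selects the stream for each channel pair:
    //   [0, t)              -> theta_t  (temporal)
    //   [t, t+h)            -> theta_h  (height)
    //   [t+h, t+h+w)        -> theta_w  (width)
    //   [t+h+w, sect_dims)  -> theta_e  (extra, vision encoder)
    // the pattern repeats every sect_dims pairs; with indep_sects, each
    // stream's theta is reset at its section boundary.
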
static void ggml_compute_forward_rope_f32(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst,
@ -9144,6 +9201,7 @@ static void ggml_compute_forward_rope_f32(
    const struct ggml_tensor * src2 = dst->src[2];

    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
    int sections[4];

    //const int n_past = ((int32_t *) dst->op_params)[0];
    const int n_dims = ((int32_t *) dst->op_params)[1];
@ -9157,6 +9215,7 @@ static void ggml_compute_forward_rope_f32(
    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
    memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int)*4);

    GGML_TENSOR_UNARY_OP_LOCALS

@ -9189,6 +9248,16 @@ static void ggml_compute_forward_rope_f32(
    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

    const bool is_neox   = mode & GGML_ROPE_TYPE_NEOX;
    const bool is_mrope  = mode & GGML_ROPE_TYPE_MROPE;  // ggml_rope_multi, multimodal rotary position embedding
    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;

    if (is_mrope) {
        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
    }

    if (is_vision) {
        GGML_ASSERT(n_dims == ne0/2);
    }

    const float * freq_factors = NULL;
    if (src2 != NULL) {
@ -9204,30 +9273,44 @@ static void ggml_compute_forward_rope_f32(

    const int32_t * pos = (const int32_t *) src1->data;

    for (int64_t i3 = 0; i3 < ne3; i3++) {
    for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
        for (int64_t i2 = 0; i2 < ne2; i2++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
            const int64_t p = pos[i2];

            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
            if (!is_mrope) {
                const int64_t p = pos[i2];
                ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
            }
            else {
                const int64_t p_t = pos[i2];
                const int64_t p_h = pos[i2 + ne2];
                const int64_t p_w = pos[i2 + ne2 * 2];
                const int64_t p_e = pos[i2 + ne2 * 3];
                ggml_mrope_cache_init(
                    p_t, p_h, p_w, p_e, sections, is_vision,
                    freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
            }

            for (int64_t i1 = 0; i1 < ne1; i1++) {
            for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;

                if (!is_neox) {
                if (is_neox || is_mrope) {
                    if (is_vision){
                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
                            const int64_t ic = i0/2;

                            const float cos_theta = cache[i0 + 0];
                            const float sin_theta = cache[i0 + 1];

                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
                            float * dst_data  = (float *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
                            float * dst_data  = (float *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

                            const float x0 = src[0];
                            const float x1 = src[1];
                            const float x1 = src[n_dims];

                            dst_data[0] = x0*cos_theta - x1*sin_theta;
                            dst_data[1] = x0*sin_theta + x1*cos_theta;
                            dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
                        }
                    } else {
                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
@ -9246,7 +9329,40 @@ static void ggml_compute_forward_rope_f32(
                            dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
                        }
                    }
                } else {
                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
                        const float cos_theta = cache[i0 + 0];
                        const float sin_theta = cache[i0 + 1];

                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                        float * dst_data  = (float *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

                        const float x0 = src[0];
                        const float x1 = src[1];

                        dst_data[0] = x0*cos_theta - x1*sin_theta;
                        dst_data[1] = x0*sin_theta + x1*cos_theta;
                    }
                }

                if (is_vision) {
                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
                        const int64_t ic = i0/2;

                        const float cos_theta = cache[i0 + 0];
                        const float sin_theta = cache[i0 + 1];

                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
                        float * dst_data  = (float *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

                        const float x0 = src[0];
                        const float x1 = src[n_dims];

                        dst_data[0]      = x0*cos_theta - x1*sin_theta;
                        dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
                    }
                } else {
                    // fill the remaining channels with data from the src tensor
                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                        float * dst_data  = (float *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@ -9258,6 +9374,7 @@ static void ggml_compute_forward_rope_f32(
                    }
                }
            }
        }
    }
}

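Two pairing schemes are in play above: the neox path rotates channel pairs (i0, i0 + n_dims/2), while the vision path pairs (ic, ic + n_dims) and, unlike the other modes, keeps rotating through the second half of the channels instead of copying them through unchanged.
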
// TODO: deduplicate f16/f32 code
|
// TODO: deduplicate f16/f32 code
|
||||||
static void ggml_compute_forward_rope_f16(
|
static void ggml_compute_forward_rope_f16(
|
||||||
|
@ -9270,6 +9387,7 @@ static void ggml_compute_forward_rope_f16(
|
||||||
const struct ggml_tensor * src2 = dst->src[2];
|
const struct ggml_tensor * src2 = dst->src[2];
|
||||||
|
|
||||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||||
|
int sections[4];
|
||||||
|
|
||||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
||||||
|
@ -9282,6 +9400,8 @@ static void ggml_compute_forward_rope_f16(
|
||||||
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
||||||
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
||||||
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
||||||
|
memcpy(§ions, (int32_t *) dst->op_params + 11, sizeof(int)*4);
|
||||||
|
|
||||||
|
|
||||||
GGML_TENSOR_UNARY_OP_LOCALS
|
GGML_TENSOR_UNARY_OP_LOCALS
|
||||||
|
|
||||||
|
@ -9314,6 +9434,16 @@ static void ggml_compute_forward_rope_f16(
|
||||||
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
||||||
|
|
||||||
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
|
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
|
||||||
|
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
|
||||||
|
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
|
||||||
|
|
||||||
|
if (is_mrope) {
|
||||||
|
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is_vision) {
|
||||||
|
GGML_ASSERT(n_dims == ne0/2);
|
||||||
|
}
|
||||||
|
|
||||||
const float * freq_factors = NULL;
|
const float * freq_factors = NULL;
|
||||||
if (src2 != NULL) {
|
if (src2 != NULL) {
|
||||||
|
@@ -9331,28 +9461,42 @@ static void ggml_compute_forward_rope_f16(

     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];

             float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
+            if (!is_mrope) {
+                const int64_t p = pos[i2];
                 ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }
+            else {
+                const int64_t p_t = pos[i2];
+                const int64_t p_h = pos[i2 + ne2];
+                const int64_t p_w = pos[i2 + ne2 * 2];
+                const int64_t p_e = pos[i2 + ne2 * 3];
+                ggml_mrope_cache_init(
+                    p_t, p_h, p_w, p_e, sections, is_vision,
+                    freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+            }

             for (int64_t i1 = 0; i1 < ne1; i1++) {
                 if (ir++ < ir0) continue;
                 if (ir > ir1) break;

-                if (!is_neox) {
+                if (is_neox || is_mrope) {
+                    if (is_vision) {
                         for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                            const int64_t ic = i0/2;

                             const float cos_theta = cache[i0 + 0];
                             const float sin_theta = cache[i0 + 1];

-                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                            const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
-                            ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+                            ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

                             const float x0 = GGML_FP16_TO_FP32(src[0]);
-                            const float x1 = GGML_FP16_TO_FP32(src[1]);
+                            const float x1 = GGML_FP16_TO_FP32(src[n_dims]);

                             dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                            dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                            dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                         }
                     } else {
                         for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
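A note on the position buffer implied by the reads above: for mrope the `pos` tensor is assumed to carry four concatenated streams of length ne2 (temporal, height, width, extra), so stream k for token i2 lives at pos[i2 + ne2*k]. A minimal sketch of that layout (illustrative names, not ggml API):

// Sketch only: assumed mrope position layout [t | h | w | e], each of length n_tokens.
#include <cstdint>
#include <vector>

struct MRopePos { int64_t t, h, w, e; };

static MRopePos mrope_pos_at(const std::vector<int32_t> & pos, int64_t n_tokens, int64_t i2) {
    // pos.size() == 4*n_tokens; one entry per stream per token
    return {
        pos[i2 + n_tokens*0], // temporal position
        pos[i2 + n_tokens*1], // height position
        pos[i2 + n_tokens*2], // width position
        pos[i2 + n_tokens*3], // extra/unused stream
    };
}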
@@ -9371,7 +9515,39 @@ static void ggml_compute_forward_rope_f16(
                             dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                         }
                     }
+                } else {
+                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
+
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+                        const float x0 = GGML_FP16_TO_FP32(src[0]);
+                        const float x1 = GGML_FP16_TO_FP32(src[1]);
+
+                        dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                    }
+                }
+
+                if (is_vision) {
+                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
+                        const int64_t ic = i0/2;
+
+                        const float cos_theta = cache[i0 + 0];
+                        const float sin_theta = cache[i0 + 1];
+
+                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
+                        ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
+
+                        const float x0 = GGML_FP16_TO_FP32(src[0]);
+                        const float x1 = GGML_FP16_TO_FP32(src[n_dims]);
+
+                        dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
+                        dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
+                    }
+                } else {
                     for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
                         const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                         ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

@@ -9383,6 +9559,7 @@ static void ggml_compute_forward_rope_f16(
                 }
             }
         }
+    }

 static void ggml_compute_forward_rope(
     const struct ggml_compute_params * params,
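The f16 path above mirrors the f32 one, and the only real difference between the three layouts is which two elements form a rotation pair: classic (norm) RoPE rotates adjacent elements, neox-style rotates elements half the rotated width apart, and the vision variant rotates elements a full n_dims apart while indexing through ic = i0/2. A minimal standalone sketch of the pairing on a raw float row (illustrative names, not ggml API):

// Sketch only: which two elements form a rotation pair in each RoPE layout.
enum class RopeLayout { Norm, Neox, Vision };

static void rope_rotate_pair(float * row, int i0, int n_dims,
                             float cos_t, float sin_t, RopeLayout layout) {
    int a = i0, b = i0 + 1;                                              // Norm: adjacent pair
    if (layout == RopeLayout::Neox)   { a = i0/2; b = i0/2 + n_dims/2; } // half-offset pair
    if (layout == RopeLayout::Vision) { a = i0/2; b = i0/2 + n_dims;   } // full-n_dims offset
    const float x0 = row[a];
    const float x1 = row[b];
    row[a] = x0*cos_t - x1*sin_t;
    row[b] = x0*sin_t + x1*cos_t;
}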
@@ -12945,7 +13122,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
 #include "windows.h"

 // TODO: support > 64 CPUs
-bool ggml_thread_apply_affinity(bool * mask) {
+static bool ggml_thread_apply_affinity(bool * mask) {
     HANDLE h = GetCurrentThread();
     uint64_t bitmask = 0ULL;

@@ -94,7 +94,9 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, int n
 }

 // non-contiguous kernel (slow)
-static __global__ void concat_f32_non_cont(
+template <int dim>
+static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
+concat_f32_non_cont(
     const char * src0,
     const char * src1,
     char * dst,

@@ -121,22 +123,28 @@ static __global__ void concat_f32_non_cont(
     uint64_t nb0,
     uint64_t nb1,
     uint64_t nb2,
-    uint64_t nb3,
-    int32_t dim) {
+    uint64_t nb3){
+    static_assert(dim >= 0 && dim <= 3);

     const int64_t i3 = blockIdx.z;
     const int64_t i2 = blockIdx.y;
     const int64_t i1 = blockIdx.x;

-    int64_t o[4] = {0, 0, 0, 0};
-    o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));

     const float * x;

-    for (int i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
+    for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) {
         if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
             x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
         } else {
-            x = (const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
+            if constexpr (dim == 0) {
+                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10);
+            } else if constexpr (dim == 1) {
+                x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10);
+            } else if constexpr (dim == 2) {
+                x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10);
+            } else if constexpr (dim == 3) {
+                x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10);
+            }
         }

         float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

@@ -182,15 +190,32 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         }
     } else {
         dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
-        concat_f32_non_cont<<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
-            (const char *)src0->data,
-            (const char *)src1->data,
-            (      char *)dst->data,
+        auto launch_kernel = [&](auto dim) {
+            concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(
+                (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
                 src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
                 src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                 src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
                 src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3],
                 dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
-                dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
+                dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
+        };
+        switch (dim) {
+            case 0:
+                launch_kernel(std::integral_constant<int, 0>{});
+                break;
+            case 1:
+                launch_kernel(std::integral_constant<int, 1>{});
+                break;
+            case 2:
+                launch_kernel(std::integral_constant<int, 2>{});
+                break;
+            case 3:
+                launch_kernel(std::integral_constant<int, 3>{});
+                break;
+            default:
+                GGML_ABORT("Invalid dim: %d", dim);
+                break;
+        }
     }
 }
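The pattern introduced above, a generic lambda taking std::integral_constant so a runtime `dim` selects a compile-time template parameter, is a reusable idiom: the per-dimension branch inside the kernel folds away at compile time instead of being evaluated per element. A minimal host-only sketch of the same dispatch:

// Sketch only: runtime-to-compile-time dispatch via std::integral_constant.
#include <cstdio>
#include <type_traits>

template <int dim>
static void do_work() {
    // dim is a compile-time constant here, so branches on it can fold away
    std::printf("working along dim %d\n", dim);
}

static void dispatch(int dim) {
    auto launch = [&](auto d) { do_work<decltype(d)::value>(); };
    switch (dim) {
        case 0: launch(std::integral_constant<int, 0>{}); break;
        case 1: launch(std::integral_constant<int, 1>{}); break;
        case 2: launch(std::integral_constant<int, 2>{}); break;
        case 3: launch(std::integral_constant<int, 3>{}); break;
        default: break; // invalid dim: real code should abort here
    }
}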
@@ -4,6 +4,11 @@ struct rope_corr_dims {
     float v[2];
 };

+struct mrope_sections {
+    int v[4];
+};
+
 static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
     const float y = (i0 / 2 - low) / max(0.001f, high - low);
     return 1.0f - min(1.0f, max(0.0f, y));

@@ -108,6 +113,105 @@ static __global__ void rope_neox(
     dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }

+template<typename T, bool has_ff>
+static __global__ void rope_multi(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i0 >= n_dims) {
+        const int i = row*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i = row*ne0 + i0/2;
+    const int i2 = row/p_delta_rows;
+
+    int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
+    int sec_w = sections.v[1] + sections.v[0];
+    int sector = (i0 / 2) % sect_dims;
+
+    float theta_base = 0.0;
+    if (sector < sections.v[0]) {
+        theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sections.v[0] && sector < sec_w) {
+        theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
+        theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w + sections.v[2]) {
+        theta_base = pos[i2 + ne2 * 3]*powf(theta_scale, i0/2.0f);
+    }
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims/2];
+
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
+
+template<typename T, bool has_ff>
+static __global__ void rope_vision(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+    const int i = row*ne0 + i0/2;
+    const int i2 = row/p_delta_rows; // i2-th tokens
+
+    int sect_dims = sections.v[0] + sections.v[1];
+    int sec_w = sections.v[1] + sections.v[0];
+    int sector = (i0 / 2) % sect_dims;
+
+    float theta_base = 0.0;
+    if (sector < sections.v[0]) {
+        const int p = sector;
+        theta_base = pos[i2]*powf(theta_scale, p);
+    }
+    else if (sector >= sections.v[0] && sector < sec_w) {
+        const int p = sector - sections.v[0];
+        theta_base = pos[i2 + ne2]*powf(theta_scale, p);
+    }
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims];
+
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims] = x0*sin_theta + x1*cos_theta;
+}
+
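The sector arithmetic in rope_multi is easier to see in isolation: for each rotary pair index, (i0/2) % sect_dims picks a sector, and the sector decides which of the four position streams feeds theta. A minimal host-side sketch of that mapping (illustrative names, not the CUDA kernel):

// Sketch only: mapping a rotary pair index to one of the mrope position streams.
#include <cassert>

// sections[k] = number of rotary pairs driven by stream k (t, h, w, e)
static int mrope_stream_for_pair(int pair_idx, const int sections[4]) {
    const int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
    assert(sect_dims > 0);
    const int sector = pair_idx % sect_dims;
    if (sector < sections[0])                             return 0; // temporal
    if (sector < sections[0] + sections[1])               return 1; // height
    if (sector < sections[0] + sections[1] + sections[2]) return 2; // width
    return 3;                                                       // extra
}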
 template<typename T>
 static void rope_norm_cuda(
     const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,

@@ -156,6 +260,56 @@ static void rope_neox_cuda(
     }
 }

+template<typename T>
+static void rope_multi_cuda(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nr, n_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors == nullptr) {
+        rope_multi<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, freq_factors, sections
+            );
+    } else {
+        rope_multi<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, freq_factors, sections
+            );
+    }
+}
+
+template<typename T>
+static void rope_vision_cuda(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nr, n_blocks_x, 1);
+    // break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq)
+    // where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors == nullptr) {
+        rope_vision<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, freq_factors, sections
+            );
+    } else {
+        rope_vision<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, freq_factors, sections
+            );
+    }
+}
+
 static void rope_norm_cuda_f16(
     const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
     float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {

@@ -185,6 +339,38 @@ static void rope_neox_cuda_f32(
     rope_neox_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
 }

+static void rope_multi_cuda_f16(
+    const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+    rope_multi_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_multi_cuda_f32(
+    const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+    rope_multi_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_vision_cuda_f16(
+    const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+    rope_vision_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
+static void rope_vision_cuda_f32(
+    const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
+) {
+    rope_vision_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
+}
+
 void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];

@@ -201,8 +387,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
     GGML_ASSERT(src0->type == dst->type);

-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
+    const int64_t ne00 = src0->ne[0]; // head dims
+    const int64_t ne01 = src0->ne[1]; // num heads
+    const int64_t ne02 = src0->ne[2]; // num heads
     const int64_t nr = ggml_nrows(src0);

     //const int n_past = ((int32_t *) dst->op_params)[0];

@@ -210,6 +397,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int mode = ((int32_t *) dst->op_params)[2];
     //const int n_ctx = ((int32_t *) dst->op_params)[3];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+    mrope_sections sections;

     // RoPE alteration for extended context
     float freq_base;

@@ -225,8 +413,19 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
     memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
     memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections.v, (int32_t *) dst->op_params + 11, sizeof(int)*4);

     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne00/2);
+    }

     const int32_t * pos = (const int32_t *) src1_d;

@@ -253,6 +452,34 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         } else {
             GGML_ABORT("fatal error");
         }
+    } else if (is_mrope && !is_vision) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_multi_cuda_f32(
+                (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_multi_cuda_f16(
+                (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    } else if (is_vision) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_vision_cuda_f32(
+                (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_vision_cuda_f16(
+                (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else {
+            GGML_ABORT("fatal error");
+        }
     } else {
         if (src0->type == GGML_TYPE_F32) {
             rope_norm_cuda_f32(
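All of these wrappers share one launch geometry, summarized by the in-code comment: rows (heads * seq) map to blockIdx.x and rotary pairs to the y dimension. A small sketch of the same arithmetic, assuming CUDA_ROPE_BLOCK_SIZE is 256 as defined elsewhere in ggml-cuda (verify against your tree):

// Sketch only: rope launch geometry; CUDA_ROPE_BLOCK_SIZE == 256 is an assumption.
#include <cstdio>

int main() {
    const int CUDA_ROPE_BLOCK_SIZE = 256;
    const int ne0 = 128;    // head_dim
    const int nr  = 32*10;  // heads * seq (number of rows)

    // each thread handles one rotary pair, and pairs advance by 2 along ne0
    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    std::printf("block_dims = (1, %d, 1), block_nums = (%d, %d, 1)\n",
                CUDA_ROPE_BLOCK_SIZE, nr, n_blocks_x);
    return 0;
}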
@@ -74,8 +74,8 @@ static inline int ggml_up(int n, int m) {
 //

 GGML_ATTRIBUTE_FORMAT(2, 3)
-void ggml_log_internal        (enum ggml_log_level level, const char * format, ...);
-void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
+GGML_API void ggml_log_internal        (enum ggml_log_level level, const char * format, ...);
+GGML_API void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);

 #define GGML_LOG(...)      ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
 #define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)

@@ -304,8 +304,8 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);

 // Memory allocation

-void * ggml_aligned_malloc(size_t size);
-void ggml_aligned_free(void * ptr, size_t size);
+GGML_API void * ggml_aligned_malloc(size_t size);
+GGML_API void ggml_aligned_free(void * ptr, size_t size);

 // FP16 to FP32 conversion
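These internal declarations gain GGML_API so the symbols are exported from the ggml shared library; backends built as separate shared objects (such as the new OpenCL backend) need them at link time. For context, a hedged sketch of the usual export/import macro idiom behind such annotations (simplified; the real macro in ggml.h also covers static builds):

// Sketch only: the standard shared-library export/import macro pattern.
#include <cstddef>

#ifdef _WIN32
#    ifdef MYLIB_BUILD // defined while compiling the library itself
#        define MYLIB_API __declspec(dllexport)
#    else
#        define MYLIB_API __declspec(dllimport)
#    endif
#else
#    define MYLIB_API __attribute__((visibility("default")))
#endif

MYLIB_API void * mylib_aligned_malloc(size_t size); // now visible to other shared objects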
@@ -1419,8 +1419,18 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
         case GGML_OP_SOFT_MAX:
         case GGML_OP_RMS_NORM:
         case GGML_OP_NORM:
-        case GGML_OP_ROPE:
             return true;
+        case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return true;
+            }
         case GGML_OP_DUP:
         case GGML_OP_CPY:
         case GGML_OP_CONT:
@@ -1125,8 +1125,18 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
             return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
         case GGML_OP_ARGMAX:
         case GGML_OP_NORM:
-        case GGML_OP_ROPE:
             return true;
+        case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return true;
+            }
         case GGML_OP_IM2COL:
             return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_1D:
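The same gate appears verbatim in both the Kompute and Metal supports_op checks: backends without mrope/vision kernels must refuse the op so the scheduler can fall back to a backend that has them. A hedged sketch of the shared predicate these two hunks duplicate (constant values are assumptions from ggml.h, renamed to avoid clashing with the real macros):

// Sketch only: a shared predicate a backend could use to gate GGML_OP_ROPE support.
#include <cstdint>

constexpr int ROPE_MROPE_BIT  = 8;  // assumed value of GGML_ROPE_TYPE_MROPE
constexpr int ROPE_VISION_VAL = 24; // assumed value of GGML_ROPE_TYPE_VISION

static bool backend_supports_rope_mode(const int32_t * op_params) {
    const int mode = op_params[2]; // rope mode lives at op_params[2]
    if (mode & ROPE_MROPE_BIT) {
        return false; // also rejects vision, whose value includes the mrope bit
    }
    if (mode == ROPE_VISION_VAL) {
        return false; // kept for symmetry with the hunks above
    }
    return true;
}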
@@ -3026,7 +3036,9 @@ static void ggml_metal_encode_node(
             } break;
         case GGML_OP_ROPE:
             {
-                GGML_ASSERT(ne10 == ne02);
+                // make sure we have one or more position id(ne10) per token(ne02)
+                GGML_ASSERT(ne10 % ne02 == 0);
+                GGML_ASSERT(ne10 >= ne02);

                 const int nth = MIN(1024, ne00);
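The relaxed assertion reflects that mrope feeds several position ids per token (four streams for Qwen2-VL-style mrope), so ne10 can be an exact multiple of ne02 rather than equal to it. A small sketch of the invariant:

// Sketch only: position-count invariant for classic RoPE vs mrope.
#include <cassert>

static void check_rope_positions(long ne10 /* # position ids */, long ne02 /* # tokens */) {
    assert(ne02 > 0);
    assert(ne10 % ne02 == 0); // 1 stream for classic rope, 4 for mrope
    assert(ne10 >= ne02);
    const long streams = ne10 / ne02; // 1 or 4 in practice
    (void) streams;
}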
new file: ggml/src/ggml-opencl/CMakeLists.txt (147 lines)
@@ -0,0 +1,147 @@
find_package(OpenCL REQUIRED)
find_package(Python3 REQUIRED)

set(TARGET_NAME ggml-opencl)

ggml_add_backend_library(${TARGET_NAME}
                         ggml-opencl.cpp
                         ../../include/ggml-opencl.h)
target_link_libraries(${TARGET_NAME} PRIVATE ${OpenCL_LIBRARIES})
target_include_directories(${TARGET_NAME} PRIVATE ${OpenCL_INCLUDE_DIRS})

if (GGML_OPENCL_PROFILING)
    message(STATUS "OpenCL profiling enabled (increases CPU overhead)")
    add_compile_definitions(GGML_OPENCL_PROFILING)
endif ()

add_compile_definitions(GGML_OPENCL_SOA_Q)

if (GGML_OPENCL_USE_ADRENO_KERNELS)
    message(STATUS "OpenCL will use matmul kernels optimized for Adreno")
    add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS)
endif ()

if (GGML_OPENCL_EMBED_KERNELS)
    add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)

    set(OPENCL_CL_SOURCE_EMBED     "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl.cl.h")
    set(OPENCL_MM_CL_SOURCE_EMBED  "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mm.cl.h")
    set(OPENCL_CVT_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_cvt.cl.h")

    set(OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED         "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle.cl.h")
    set(OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle_general.cl.h")
    set(OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED      "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h")
    set(OPENCL_TRANSPOSE_16_SOURCE_EMBED           "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_16.cl.h")
    set(OPENCL_TRANSPOSE_32_SOURCE_EMBED           "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32.cl.h")
    set(OPENCL_TRANSPOSE_32_16_SOURCE_EMBED        "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32_16.cl.h")

    set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
    file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")

    include_directories("${CMAKE_BINARY_DIR}/autogenerated")

    # Python must be accessible from command line
    add_custom_command(
        OUTPUT ${OPENCL_CL_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl.cl
                ${OPENCL_CL_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_MM_CL_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mm.cl
                ${OPENCL_MM_CL_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_mm.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_mm.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_CVT_CL_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_cvt.cl
                ${OPENCL_CVT_CL_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_cvt.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_cvt.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle.cl
                ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_gemv_noshuffle.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle_general.cl
                ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_gemv_noshuffle_general.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
                ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_16.cl
                ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_transpose_16.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_transpose_16.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32.cl
                ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_transpose_32.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_transpose_32.cl.h"
    )

    add_custom_command(
        OUTPUT ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
        COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
                ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32_16.cl
                ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
        DEPENDS kernels/ggml-opencl_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT}
        COMMENT "Generate ggml-opencl_transpose_32_16.cl.h"
    )

    target_sources(${TARGET_NAME} PRIVATE
                   ${OPENCL_CL_SOURCE_EMBED}
                   ${OPENCL_MM_CL_SOURCE_EMBED}
                   ${OPENCL_CVT_CL_SOURCE_EMBED}
                   ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
                   ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
                   ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
                   ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
                   ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
                   ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED})
else ()
    # copy ggml-opencl.cl to bin directory
    configure_file(kernels/ggml-opencl.cl     ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl.cl     COPYONLY)
    configure_file(kernels/ggml-opencl_mm.cl  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mm.cl  COPYONLY)
    configure_file(kernels/ggml-opencl_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_cvt.cl COPYONLY)

    configure_file(kernels/ggml-opencl_gemv_noshuffle.cl         ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle.cl         COPYONLY)
    configure_file(kernels/ggml-opencl_gemv_noshuffle_general.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle_general.cl COPYONLY)
    configure_file(kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl      ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mul_mat_Ab_Bi_8x4.cl      COPYONLY)
    configure_file(kernels/ggml-opencl_transpose_16.cl           ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_16.cl           COPYONLY)
    configure_file(kernels/ggml-opencl_transpose_32.cl           ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32.cl           COPYONLY)
    configure_file(kernels/ggml-opencl_transpose_32_16.cl        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32_16.cl        COPYONLY)
endif ()
new file: ggml/src/ggml-opencl/ggml-opencl.cpp (4004 lines)
File diff suppressed because it is too large.
new file: ggml/src/ggml-opencl/kernels/embed_kernel.py (26 lines)
@@ -0,0 +1,26 @@
#

import sys
import logging
logger = logging.getLogger("opencl-embed-kernel")


def main():
    logging.basicConfig(level=logging.INFO)

    if len(sys.argv) != 3:
        logger.info("Usage: python embed_kernel.py <input_file> <output_file>")
        sys.exit(1)

    ifile = open(sys.argv[1], "r")
    ofile = open(sys.argv[2], "w")

    for i in ifile:
        ofile.write('R"({})"\n'.format(i))

    ifile.close()
    ofile.close()


if __name__ == "__main__":
    main()
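Each input line is wrapped in its own C++ raw string literal chunk (the line's trailing newline stays inside the literal), and adjacent literals concatenate, so the generated header can be dropped straight into a string initializer. A hedged sketch of the consuming side (illustrative, not necessarily the exact ggml-opencl.cpp code):

// Sketch only: consuming a header full of adjacent R"(...)" chunks.
#include <string>

static const std::string kernel_src {
#ifdef GGML_OPENCL_EMBED_KERNELS
    #include "ggml-opencl.cl.h"   // expands to many adjacent raw string literals
#else
    ""                            // non-embedded builds load the .cl file at runtime
#endif
};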
new file: ggml/src/ggml-opencl/kernels/ggml-opencl.cl (2683 lines)
File diff suppressed because it is too large.
new file: ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl (106 lines)
@@ -0,0 +1,106 @@
//------------------------------------------------------------------------------
// This file contains additional kernels for data conversion.
// These kernels are used when loading the model, so their performance is less
// important.
//------------------------------------------------------------------------------
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#elif defined(cl_amd_fp16)
#pragma OPENCL EXTENSION cl_amd_fp16 : enable
#else
#error "Half precision floating point not supported by OpenCL implementation on your device."
#endif

#ifdef cl_khr_subgroups
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#elif defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#error "Subgroup not supported on your device."
#endif

#ifdef cl_intel_required_subgroup_size
// Always use subgroup size of 32 on Intel.
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
// Always use subgroup size of 64 on Adreno.
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#else
// TODO: do not know how to choose subgroup size on other GPUs.
#error "Selecting subgroup size is not supported on your device."
#endif

#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QR4_1 2
#define QK5_0 32
#define QR5_0 2
#define QK5_1 32
#define QR5_1 2
#define QK8_0 32
#define QR8_0 1
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2

typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;

//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
    half d;
    uint8_t qs[QK4_0 / 2];
};

//------------------------------------------------------------------------------
// mul_vec_q_n_f32_flat_noshuffle
//
// This variation uses flat arrays (struct of arrays, SOA) representation for
// quant tensors. It also uses a non-shuffled bit order for weights.
//
// The shuffled version is kept in the original file because moving it here
// seems to result in worse performance for Adreno.
//------------------------------------------------------------------------------

kernel void kernel_convert_block_q4_0_noshuffle(
    global struct block_q4_0 * src0,
    global uchar * dst_q,
    global half * dst_d
) {
    global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
    global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
    global half * d = (global half *) dst_d + get_global_id(0);

    *d = b->d;
    for (int i = 0; i < QK4_0/4; ++i) {
        uchar x0 = b->qs[2*i + 0];
        uchar x1 = b->qs[2*i + 1];

        q[i + 0      ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4);
        q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0);

#ifdef ADRENO_GPU
        // Workaround for Adreno - must have the following printf statement for
        // the kernel to work properly. Otherwise it produces an incorrect result.
        // convert_uchar above also seems necessary.
        // Compare against a large number so that it does not print anything.
        // get_sub_group_local_id() also works.
        if (get_global_id(0) == 65536*4096) {
            printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0));
        }
#endif
    }
}
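The bit layout this conversion produces is easier to verify on the host: for each q4_0 block, the kernel packs the low nibbles of each input byte pair into the first half of the 16-byte run and the high nibbles into the second half. A hedged C++ re-implementation of the same transform (plain loop, no OpenCL types):

// Sketch only: host-side equivalent of kernel_convert_block_q4_0_noshuffle's bit shuffle.
#include <cstdint>

constexpr int QK4_0 = 32; // 32 weights per q4_0 block -> 16 packed bytes

// qs: 16 input bytes of one block; out: 16 output bytes (low nibbles, then high nibbles)
static void convert_q4_0_noshuffle(const uint8_t qs[QK4_0/2], uint8_t out[QK4_0/2]) {
    for (int i = 0; i < QK4_0/4; ++i) {
        const uint8_t x0 = qs[2*i + 0];
        const uint8_t x1 = qs[2*i + 1];
        out[i          ] = (uint8_t)((x0 & 0x0F) | ((x1 & 0x0F) << 4)); // low nibbles
        out[i + QK4_0/4] = (uint8_t)(((x0 & 0xF0) >> 4) | (x1 & 0xF0)); // high nibbles
    }
}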
new file: ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl (265 lines)
@@ -0,0 +1,265 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable

// assume
#define QK4_0 32
#define N_SIMDGROUP 4

#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
    float shared_y; \
    shared_y = sub_group_broadcast(y.s0, 0); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 0); \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 0); \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 0); \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 0); \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 0); \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 0); \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 0); \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s0, 1); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 1); \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 1); \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 1); \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 1); \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 1); \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 1); \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 1); \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \


#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
    shared_y = sub_group_broadcast(y.s0, 2); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 2); \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 2); \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 2); \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 2); \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 2); \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 2); \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 2); \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s0, 3); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 3); \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 3); \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 3); \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 3); \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 3); \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 3); \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 3); \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \


#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
    float8 shared_y; \
    shared_y = sub_group_broadcast(y, 0); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
    shared_y = sub_group_broadcast(y, 1); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \


#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
    shared_y = sub_group_broadcast(y, 2); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
    shared_y = sub_group_broadcast(y, 3); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \


__attribute__((qcom_reqd_sub_group_size("full")))
__kernel void kernel_gemv_noshuffle(
        __read_only image1d_buffer_t src0_q,  // quantized A
        global half2 * src0_d,                // A scales
        __read_only image1d_buffer_t src1,    // B
        ulong offset1,                        // offset to B (0)
        global float * dst,                   // C
        ulong offsetd,                        // offset to C (0)
        uint K,                               // K
        int ne01,                             // M
        int ne02,                             // 1
        int ne10,                             // K
        int ne12,                             // 1
        int ne0,                              // M
        int ne1,                              // N
        int r2,                               // 1
        int r3)
{
    uint groupId = get_local_id(1);
    uint gid     = get_global_id(0);
    ushort slid  = get_sub_group_local_id();

    __private uint4  regA;
    __private half2  regS;
    __private float8 regB;

    __private float2 totalSum = (float2)(0.0f);

    // loop along K in block granularity, skip 4 blocks every iter
    for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
        regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
        // first 4 fibers in each wave load 8 B values to its private scope
        if (slid < 4) {
            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
        }

        // load half weights for two blocks in consecutive rows
        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
        dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
#else
        dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT

        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
        dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
#else
        dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
|
||||||
|
#endif // VECTOR_SUB_GROUP_BROADCAT
|
||||||
|
}
|
||||||
|
|
||||||
|
// reduction in local memory, assumes #wave=4
|
||||||
|
__local float2 reduceLM[SIMDGROUP_WIDTH * 3];
|
||||||
|
if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
|
||||||
|
if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
|
||||||
|
if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
|
||||||
|
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
|
||||||
|
if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];
|
||||||
|
|
||||||
|
// 2 outputs per fiber in wave 0
|
||||||
|
if (groupId == 0) {
|
||||||
|
dst = (global float*)((global char*)dst + offsetd);
|
||||||
|
vstore2(totalSum, 0, &(dst[gid * 2]));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
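The macros above unroll the Q4_0 decode: each 4-bit field is an unsigned value in [0, 15], recentered by subtracting 8 and weighted by the block's scale before accumulating against the broadcast activations. A minimal scalar C sketch of the same per-nibble decode (function and names are illustrative, not part of this patch):

#include <stdint.h>

/* Accumulate four Q4_0 nibbles packed in one ushort against a block scale;
 * the scalar equivalent of one group of macro lines above. */
static inline float q4_0_accum(uint16_t bits4, float scale, const float y[4]) {
    float sum = 0.0f;
    sum += (( bits4 & 0x000F)        - 8) * scale * y[0];
    sum += (((bits4 & 0x00F0) >>  4) - 8) * scale * y[1];
    sum += (((bits4 & 0x0F00) >>  8) - 8) * scale * y[2];
    sum += (((bits4 & 0xF000) >> 12) - 8) * scale * y[3];
    return sum;
}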
@ -0,0 +1,271 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable

// assume
#define QK4_0 32
#define N_SIMDGROUP 4

#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
    float shared_y; \
    shared_y = sub_group_broadcast(y.s0, 0); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 0); \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 0); \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 0); \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 0); \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 0); \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 0); \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 0); \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s0, 1); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 1); \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 1); \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 1); \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 1); \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 1); \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 1); \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 1); \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \


#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
    shared_y = sub_group_broadcast(y.s0, 2); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 2); \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 2); \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 2); \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 2); \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 2); \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 2); \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 2); \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s0, 3); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 3); \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 3); \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 3); \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 3); \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 3); \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 3); \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 3); \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \


#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
    float8 shared_y; \
    shared_y = sub_group_broadcast(y, 0); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
    shared_y = sub_group_broadcast(y, 1); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \


#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
    shared_y = sub_group_broadcast(y, 2); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
    shared_y = sub_group_broadcast(y, 3); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \


__attribute__((qcom_reqd_sub_group_size("full")))
__kernel void kernel_gemv_noshuffle(
        __read_only  image1d_buffer_t src0_q,  // quantized A
        global half2 * src0_d,                 // A scales
        __read_only  image1d_buffer_t src1,    // B
        ulong offset1,                         // offset to B (0)
        global float * dst,                    // C
        ulong offsetd,                         // offset to C (0)
        int ne00,                              // K
        int ne01,                              // M
        int ne02,                              // 1
        int ne10,                              // K
        int ne12,                              // 1
        int ne0,                               // M
        int ne1,                               // N
        int r2,                                // 1
        int r3)
{
    uint groupId = get_local_id(1);
    uint gid     = get_global_id(0);
    ushort slid  = get_sub_group_local_id();

    uint K = ne00;
    uint M = ne01;

    uint LINE_STRIDE_A  = M / 2;
    uint BLOCK_STRIDE_A = N_SIMDGROUP * M;

    __private uint4  regA;
    __private half2  regS;
    __private float8 regB;

    __private float2 totalSum = (float2)(0.0f);

    // loop along K in block granularity, skip 4 blocks every iter
    for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
        regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows
        // first 4 fibers in each wave load 8 B values to its private scope
        if (slid < 4) {
            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
        }

        // load half weights for two blocks in consecutive rows
        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
        dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
#else
        dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT

        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
        dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
#else
        dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
    }

    // reduction in local memory, assumes #wave=4
    __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];

    // 2 outputs per fiber in wave 0
    if (groupId == 0) {
        dst = (global float*)((global char*)dst + offsetd);
        vstore2(totalSum, 0, &(dst[gid * 2]));
    }

}
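This variant splits the K loop across N_SIMDGROUP = 4 waves, so each wave finishes with a partial dot product per fiber; waves 1 to 3 stage their partials in reduceLM and wave 0 folds them in after the barrier. A host-side C model of that reduction, useful for reasoning about the layout (array sizes are illustrative; WIDTH stands in for SIMDGROUP_WIDTH):

#include <stddef.h>

enum { N_WAVES = 4, WIDTH = 64 };

/* Wave 0 keeps its own partial sum and accumulates the partials the other
 * three waves staged in local memory, one slot per lane. */
void reduce_waves(const float partial[N_WAVES][WIDTH], float out[WIDTH]) {
    for (size_t lane = 0; lane < WIDTH; ++lane) {
        float acc = partial[0][lane];
        for (size_t w = 1; w < N_WAVES; ++w) {
            acc += partial[w][lane];
        }
        out[lane] = acc;
    }
}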
1225
ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
Normal file
File diff suppressed because it is too large
130
ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
Normal file
@ -0,0 +1,130 @@
// src0_q, src0_d, src1 are transposed as a preprocessing step
// 4-bit weights are transposed in groups of 4 (unsigned short int)
// consider weights originally "next to each other", now "on top of each other"
// each fiber computes an 8x4 tile of output elements
// using unshuffled weights

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable

__attribute__((qcom_reqd_sub_group_size("full")))
kernel void kernel_mul_mat_Ab_Bi_8x4(
        global const ushort * src0_q,       // quantized A
        global const half   * src0_d,       // A scales
        __read_only image1d_buffer_t src1,  // B (1d image)
        global float * dst,                 // C
        int m,                              // M
        int n,                              // N with padding
        int k,                              // K
        int n_no_padding                    // N without padding
) {

    int m_4 = m >> 2;
    int n_4 = n >> 2;

    int gy = get_global_id(0);
    int gx = get_global_id(1);
    int gx_2 = gx << 2;

    half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // 8x4 output elements
    half8 B;                              // registers for activations
    half4 dequantized_weights;            // registers for dequantized weights
    __global const ushort * weight_ptr = src0_q + gx_2; // pointer for weights
    __global const half   * scale_ptr  = src0_d + gx_2; // pointer for scales

    for (int i = 0; i < k; i += 4) { // loop through the K dimension

        B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4));
        B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1);

        // keep (i/4) and (i/32) in parentheses; integer division rounds down
        // load 4 consecutive groups of 4 weights
        ushort4 bits4 = vload4(0, weight_ptr + (i/4)*(m)); // (i/4) because weights are grouped in 4s

        // load 4 consecutive scales
        half4 scale = vload4(0, scale_ptr + (i/32)*(m)); // (i/32) because there is 1 scale per 32 elements

        // j=0
        dequantized_weights.s0 = ((bits4.s0 & (0x000F)) - 8) * scale.s0; // dequantize a row of the 16 weights
        dequantized_weights.s1 = ((bits4.s1 & (0x000F)) - 8) * scale.s1;
        dequantized_weights.s2 = ((bits4.s2 & (0x000F)) - 8) * scale.s2;
        dequantized_weights.s3 = ((bits4.s3 & (0x000F)) - 8) * scale.s3;
        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
        c1 += B * dequantized_weights.s1;
        c2 += B * dequantized_weights.s2;
        c3 += B * dequantized_weights.s3;

        // j=1
        B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4));
        B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1);
        dequantized_weights.s0 = (((bits4.s0 & (0x00F0)) >> 4) - 8) * scale.s0; // dequantize a row of the 16 weights
        dequantized_weights.s1 = (((bits4.s1 & (0x00F0)) >> 4) - 8) * scale.s1;
        dequantized_weights.s2 = (((bits4.s2 & (0x00F0)) >> 4) - 8) * scale.s2;
        dequantized_weights.s3 = (((bits4.s3 & (0x00F0)) >> 4) - 8) * scale.s3;
        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
        c1 += B * dequantized_weights.s1;
        c2 += B * dequantized_weights.s2;
        c3 += B * dequantized_weights.s3;

        // j=2
        B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4));
        B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1);
        dequantized_weights.s0 = (((bits4.s0 & (0x0F00)) >> 8) - 8) * scale.s0; // dequantize a row of the 16 weights
        dequantized_weights.s1 = (((bits4.s1 & (0x0F00)) >> 8) - 8) * scale.s1;
        dequantized_weights.s2 = (((bits4.s2 & (0x0F00)) >> 8) - 8) * scale.s2;
        dequantized_weights.s3 = (((bits4.s3 & (0x0F00)) >> 8) - 8) * scale.s3;
        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
        c1 += B * dequantized_weights.s1;
        c2 += B * dequantized_weights.s2;
        c3 += B * dequantized_weights.s3;

        // j=3
        B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4));
        B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1);
        dequantized_weights.s0 = (((bits4.s0 & (0xF000)) >> 12) - 8) * scale.s0; // dequantize a row of the 16 weights
        dequantized_weights.s1 = (((bits4.s1 & (0xF000)) >> 12) - 8) * scale.s1;
        dequantized_weights.s2 = (((bits4.s2 & (0xF000)) >> 12) - 8) * scale.s2;
        dequantized_weights.s3 = (((bits4.s3 & (0xF000)) >> 12) - 8) * scale.s3;
        c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate
        c1 += B * dequantized_weights.s1;
        c2 += B * dequantized_weights.s2;
        c3 += B * dequantized_weights.s3;
    }

    int idx = (gy<<3)*m + (gx<<2); // vectorized store 16 elements

    // conditional check if the store is to a valid location; required when N is not a multiple of 8
    // the if statements allow registers to be reused for each store,
    // which reduces the register footprint and increases the number of concurrent waves
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx);
        idx += m;
    }
    if (idx+3 < m*n_no_padding) {
        vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx);
    }
}
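Each fiber writes its 8x4 tile of C starting from idx = (gy<<3)*m + (gx<<2): dst is laid out with the m (weight-row) dimension contiguous, each vstore4 covers the 4 weight rows, and idx += m steps to the next activation row. A small C helper that models this addressing (hypothetical, for illustration only, not part of the kernel):

/* Flattened dst offset of row `row_in_tile` of the 8x4 tile owned by
 * fiber (gx, gy), assuming dst is m-contiguous as in the kernel above. */
static inline int tile_offset(int gx, int gy, int m, int row_in_tile) {
    return (gy << 3) * m    /* first of the 8 activation rows */
         + (gx << 2)        /* 4 consecutive weight rows */
         + row_in_tile * m; /* one activation row per store */
}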
32
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
Normal file
@ -0,0 +1,32 @@
// 16-bit transpose, loading/storing an 8x8 tile of elements

kernel void kernel_transpose_16(
    __read_only  image1d_buffer_t input,
    __write_only image1d_buffer_t output,
    const uint rows,
    const uint cols
) {

    const int i = get_global_id(0);
    const int j = get_global_id(1);
    const int i_3 = i<<3;
    const int j_3 = j<<3;

    ushort8 temp0 = as_ushort8(read_imagef(input, (j_3+0)*cols+i));
    ushort8 temp1 = as_ushort8(read_imagef(input, (j_3+1)*cols+i));
    ushort8 temp2 = as_ushort8(read_imagef(input, (j_3+2)*cols+i));
    ushort8 temp3 = as_ushort8(read_imagef(input, (j_3+3)*cols+i));
    ushort8 temp4 = as_ushort8(read_imagef(input, (j_3+4)*cols+i));
    ushort8 temp5 = as_ushort8(read_imagef(input, (j_3+5)*cols+i));
    ushort8 temp6 = as_ushort8(read_imagef(input, (j_3+6)*cols+i));
    ushort8 temp7 = as_ushort8(read_imagef(input, (j_3+7)*cols+i));

    write_imagef(output, (i_3+0)*rows+j, as_float4((ushort8)(temp0.s0, temp1.s0, temp2.s0, temp3.s0, temp4.s0, temp5.s0, temp6.s0, temp7.s0)));
    write_imagef(output, (i_3+1)*rows+j, as_float4((ushort8)(temp0.s1, temp1.s1, temp2.s1, temp3.s1, temp4.s1, temp5.s1, temp6.s1, temp7.s1)));
    write_imagef(output, (i_3+2)*rows+j, as_float4((ushort8)(temp0.s2, temp1.s2, temp2.s2, temp3.s2, temp4.s2, temp5.s2, temp6.s2, temp7.s2)));
    write_imagef(output, (i_3+3)*rows+j, as_float4((ushort8)(temp0.s3, temp1.s3, temp2.s3, temp3.s3, temp4.s3, temp5.s3, temp6.s3, temp7.s3)));
    write_imagef(output, (i_3+4)*rows+j, as_float4((ushort8)(temp0.s4, temp1.s4, temp2.s4, temp3.s4, temp4.s4, temp5.s4, temp6.s4, temp7.s4)));
    write_imagef(output, (i_3+5)*rows+j, as_float4((ushort8)(temp0.s5, temp1.s5, temp2.s5, temp3.s5, temp4.s5, temp5.s5, temp6.s5, temp7.s5)));
    write_imagef(output, (i_3+6)*rows+j, as_float4((ushort8)(temp0.s6, temp1.s6, temp2.s6, temp3.s6, temp4.s6, temp5.s6, temp6.s6, temp7.s6)));
    write_imagef(output, (i_3+7)*rows+j, as_float4((ushort8)(temp0.s7, temp1.s7, temp2.s7, temp3.s7, temp4.s7, temp5.s7, temp6.s7, temp7.s7)));
}
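The kernel above is a blocked transpose: each work-item moves an 8x8 tile of 16-bit elements through registers, reading eight vectorized rows and writing eight vectorized columns. A plain scalar reference in C, handy as a correctness oracle against the tiled version (rows and cols here count individual elements, not the vectorized image coordinates the kernel uses):

#include <stdint.h>

/* Scalar reference: out[c][r] = in[r][c] for a row-major rows x cols matrix. */
void transpose_u16_ref(const uint16_t *in, uint16_t *out,
                       unsigned rows, unsigned cols) {
    for (unsigned r = 0; r < rows; ++r) {
        for (unsigned c = 0; c < cols; ++c) {
            out[c * rows + r] = in[r * cols + c];
        }
    }
}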
25
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
Normal file
@ -0,0 +1,25 @@
// 32-bit transpose, loading/storing a 4x4 tile of elements

kernel void kernel_transpose_32(
    __read_only  image1d_buffer_t input,
    __write_only image1d_buffer_t output,
    const uint rows,
    const uint cols
) {

    const int i = get_global_id(0);
    const int j = get_global_id(1);
    const int i_2 = i<<2;
    const int j_2 = j<<2;

    float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
    float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
    float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
    float4 temp3 = read_imagef(input, (j_2+3)*cols+i);

    write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
    write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
    write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
    write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, tem3 == 0 ? temp3.s3 : temp3.s3));

}
35
ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
Normal file
@ -0,0 +1,35 @@
// 32-bit transpose, loading/storing a 4x4 tile of elements
// Only used for activations
// converts to FP16
// also adds zero padding for non-multiple-of-8 prompt lengths
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

kernel void kernel_transpose_32_16(
    __read_only  image1d_buffer_t input,
    __write_only image1d_buffer_t output,
    const uint rows,
    const uint cols,
    const uint padded_rows
) {

    const int i = get_global_id(0);
    const int j = get_global_id(1);
    const int i_2 = i<<2;
    const int j_2 = j<<2;
    half4 temp0 = {0,0,0,0}; // initialize outputs to 0
    half4 temp1 = {0,0,0,0};
    half4 temp2 = {0,0,0,0};
    half4 temp3 = {0,0,0,0};

    if ((j_2+0)*cols+i*4+3 < rows*cols*16) { // only load from a valid location. Otherwise keep register data as 0
        temp0 = read_imageh(input, (j_2+0)*cols+i);
    }
    if ((j_2+1)*cols+i*4+3 < rows*cols*16) {
        temp1 = read_imageh(input, (j_2+1)*cols+i);
    }
    if ((j_2+2)*cols+i*4+3 < rows*cols*16) {
        temp2 = read_imageh(input, (j_2+2)*cols+i);
    }
    if ((j_2+3)*cols+i*4+3 < rows*cols*16) {
        temp3 = read_imageh(input, (j_2+3)*cols+i);
    }

    write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
    write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
    write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
    write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}
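kernel_transpose_32_16 combines three things: transpose, FP32-to-FP16 conversion, and zero padding of the row dimension up to padded_rows so later kernels can assume a multiple-of-8 prompt length. The loads are guarded so out-of-range tiles keep their zero-initialized registers, while the stores are unconditional because the padded region is valid output. The same guard pattern in scalar C (a sketch; n_elems stands in for the kernel's rows*cols*16 bound):

/* Guarded gather into a zero-initialized tile: out-of-range reads yield 0,
 * mirroring the "only load from a valid location" checks above. */
static inline float load_or_zero(const float *src, long idx, long n_elems) {
    return (idx >= 0 && idx < n_elems) ? src[idx] : 0.0f;
}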
@ -11,6 +11,7 @@
//

#include "common.hpp"
#include "ggml-impl.h"

int get_current_device_id() {
    return dpct::dev_mgr::instance().current_device_id();
@ -28,11 +29,7 @@ void* ggml_sycl_host_malloc(size_t size) try {
    if (err != 0) {
        // clear the error
        fprintf(
            stderr,
            "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
            size / 1024.0 / 1024.0,
            "syclGetErrorString is not supported");
        GGML_LOG_ERROR("WARNING: failed to allocate %.2f MB of pinned memory: %s\n", size / 1024.0 / 1024.0, "syclGetErrorString is not supported");
        return nullptr;
    }
@ -66,18 +63,12 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                          const ggml_tensor *src1, ggml_tensor *dst,
                          const ggml_sycl_op_flatten_t op) try {
    const int64_t nrows0 = ggml_nrows(src0);

    const bool use_src1 = src1 != nullptr;
    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;

    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
    GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);

    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
    ggml_tensor_extra_gpu * dst_extra  = (ggml_tensor_extra_gpu *) dst->extra;

    // dd = data device
    float * src0_ddf = (float *) src0->data;
    float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
@ -626,6 +626,7 @@ struct bin_bcast_sycl {
            });
        }
    }
    GGML_UNUSED(ctx);
}
};
|
@ -47,7 +47,7 @@ static void concat_f32_dim1(const float *x, const float *y, float *dst,
|
||||||
// operation
|
// operation
|
||||||
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
|
int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
|
||||||
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
|
||||||
if (item_ct1.get_group(1) < ne01) { // src0
|
if (item_ct1.get_group(1) < (size_t) ne01) { // src0
|
||||||
int offset_src =
|
int offset_src =
|
||||||
nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01;
|
nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01;
|
||||||
dst[offset_dst] = x[offset_src];
|
dst[offset_dst] = x[offset_src];
|
||||||
|
@ -70,7 +70,7 @@ static void concat_f32_dim2(const float *x, const float *y, float *dst,
    // operation
    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
    if (item_ct1.get_group(0) < ne02) { // src0
    if (item_ct1.get_group(0) < (size_t) ne02) { // src0
        int offset_src = nidx + item_ct1.get_group(1) * ne0 +
                         item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
        dst[offset_dst] = x[offset_src];
@ -424,7 +424,7 @@ static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y,
    const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);

    // make each work-item deal with more elements since sycl global range can not exceed max int
    const src_t * x = (src_t *) vx;
    const src_t * x = (const src_t *) vx;
    for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
        y[i] = x[i];
    }
@ -1015,9 +1015,9 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
            break;
    }

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_ddq_i;
    GGML_UNUSED(src1_ddq_i);
    (void) src1_ncols;
    GGML_UNUSED(src1_ncols);
    (void) src1_padded_row_size;
    GGML_UNUSED(src1_padded_row_size);
}
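This hunk and the ones below swap the bare `(void) x;` casts for GGML_UNUSED(x), which reads as intent rather than idiom; both forms silence unused-parameter warnings without generating code. A sketch of the usual definition (ggml's actual macro body is not shown in this diff, so treat this as the conventional form):

/* Typical unused-parameter suppression; ggml's own definition may differ. */
#define GGML_UNUSED(x) (void)(x)

static int op_stub(int used, int unused_arg) {
    GGML_UNUSED(unused_arg); /* no code generated, warning suppressed */
    return used * 2;
}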
@ -1237,7 +1237,7 @@ namespace dpct
    std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
    {
        auto it = m_map.upper_bound((byte_t *)ptr);
        auto it = m_map.upper_bound(const_cast<byte_t *>(reinterpret_cast<const byte_t *>(ptr)));
        if (it == m_map.end())
        {
            // Not a virtual pointer.
@ -237,7 +237,7 @@ void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
        int i02 = i12 / sf2;
        int i03 = i13 / sf3;

        dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
        dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
}

void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
@ -251,8 +251,7 @@ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const i
    // operation
    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
    if (nidx < ne00 && item_ct1.get_group(1) < ne01 &&
        item_ct1.get_group(0) < ne02) {
    if (nidx < ne00 && item_ct1.get_group(1) < (size_t) ne01 && item_ct1.get_group(0) < (size_t) ne02) {
        int offset_src = nidx + item_ct1.get_group(1) * ne00 +
                         item_ct1.get_group(0) * ne00 * ne01;
        dst[offset_dst] = x[offset_src];
@ -520,9 +519,10 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor
    silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,

@ -535,9 +535,10 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor

    gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                    const ggml_tensor *src1, ggml_tensor *dst,

@ -550,9 +551,10 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_

    gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,

@ -564,9 +566,10 @@ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,

@ -579,9 +582,10 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor

    relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,

@ -595,9 +599,10 @@ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml

    hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,

@ -610,9 +615,10 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_t

    hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,

@ -625,9 +631,10 @@ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor

    exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,

@ -640,9 +647,10 @@ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor

    log_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,

@ -655,9 +663,10 @@ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_ten

    sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,

@ -670,9 +679,10 @@ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor

    sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,

@ -685,9 +695,10 @@ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor

    sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,

@ -700,9 +711,10 @@ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor

    cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,

@ -715,9 +727,10 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor

    step_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,

@ -730,9 +743,10 @@ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor

    neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,

@ -749,9 +763,10 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_

    leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,

@ -764,9 +779,10 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor

    sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,

@ -787,9 +803,10 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_ten
                     dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
                     main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,

@ -805,9 +822,10 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor
                 src0->ne[0], src0->ne[1], src0->ne[2],
                 dst->ne[0], dst->ne[1], dst->ne[2], main_stream);

    (void) src1;
    GGML_UNUSED(src1);
    (void) dst;
    GGML_UNUSED(dst);
    (void) src1_dd;
    GGML_UNUSED(src1_dd);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,

@ -827,7 +845,8 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor

    acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);

    (void) dst;
    GGML_UNUSED(dst);
    GGML_UNUSED(ctx);
}

inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@ -51,8 +51,8 @@ public:
        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
        const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
        auto a_mem = dnnl::memory(a_in_md, eng, (void*)a);
        auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
        auto b_mem = dnnl::memory(b_in_md, eng, (void*)b);
        auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
        auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
        auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
@ -79,8 +79,8 @@ public:
|
||||||
const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
|
const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
|
||||||
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
|
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
|
||||||
const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
|
const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
|
||||||
auto a_mem = dnnl::memory(a_in_md, eng, (void*)a);
|
auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
|
||||||
auto b_mem = dnnl::memory(b_in_md, eng, (void*)b);
|
auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
|
||||||
auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
|
auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
|
||||||
auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
|
auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
|
||||||
|
|
||||||
|
|
|
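
Note: swapping "(void*)a" for const_cast is not cosmetic. A C-style cast would also silently reinterpret the pointee type, while const_cast can only strip qualifiers, so an accidental type change becomes a compile error. A minimal sketch, assuming a is a const void * as in the wrapper above:

    void consume(void * p);              // stand-in for the dnnl::memory data argument

    void example(const void * a, int * b) {
        consume(const_cast<void *>(a));  // OK: removes const, changes nothing else
        // consume(const_cast<void *>(b)); // error: const_cast cannot turn int* into void*
        consume((void *) b);             // a C-style cast would have accepted this silently
    }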
@@ -47,7 +47,7 @@ static ggml_sycl_device_info ggml_sycl_init() {

     info.device_count = dpct::dev_mgr::instance().device_count();
     if (info.device_count == 0) {
-        GGML_LOG_ERROR("%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
+        GGML_LOG_ERROR("%s: failed to initialize: %s\n", GGML_SYCL_NAME, __func__);
         return info;
     }

@@ -64,7 +64,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
 #else
     GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
 #endif
-    GGML_LOG_INFO("%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
+    GGML_LOG_INFO("%s: found %d %s devices:\n", __func__, info.device_count, GGML_SYCL_NAME);

     for (int i = 0; i < info.device_count; ++i) {
         info.devices[i].vmm = 0;
@@ -137,7 +137,6 @@ void ggml_backend_sycl_print_sycl_devices() {

     for (int id = 0; id < device_count; ++id) {
         sycl::device device = dpct::dev_mgr::instance().get_device(id);
-        sycl::backend backend = device.get_backend();
         std::string backend_type = get_device_backend_and_type(device);
         int type_id = DeviceNums[backend_type]++;
         std::stringstream device_type;
@@ -420,14 +419,12 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
         return true;
     }
     return false;
-}
-catch (sycl::exception const &exc) {
-    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-              << ", line:" << __LINE__ << std::endl;
+    GGML_UNUSED(buffer);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
     std::exit(1);
 }


 static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
                                            uint8_t value) try {
     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
@@ -1092,10 +1089,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
     ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {};
     size_t pool_size = 0;

-    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) :
-        qptr(qptr_),
-        device(device_) {
-    }
+    explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}

     ~ggml_sycl_pool_leg() {
         for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
@@ -1238,7 +1232,7 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
             zeros[i] = 0.f;
             qzeros[i] = 0;
         }
-        const TC xi = ix < kx ? *(TC *)&x[iy * kx + ix] : zeros;
+        const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros;
         float sum = xi[0];
         float amax = sycl::fabs(xi[0]);
 #pragma unroll
@@ -1799,6 +1793,9 @@ static void pool2d_nchw_kernel(
         switch (op) {
             case GGML_OP_POOL_AVG: res = 0; break;
             case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
+            default:
+                res = (To) sycl::nan(uint32_t(0));
+                break;
         }

         for (int i = bh; i < eh; i += 1) {
@@ -1817,6 +1814,9 @@ static void pool2d_nchw_kernel(
                 switch (op) {
                     case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break;
                     case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break;
+                    default:
+                        res = (To) sycl::nan(uint32_t(0));
+                        break;
                 }
             }
         }
@@ -1855,7 +1855,8 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
                               s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
         });

-    (void) dst;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }

 template <typename src0_t>
@@ -1893,10 +1894,10 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
         });
     }

-    (void) dst;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }


 static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                    const int ky, const int kx_padded,
                                    queue_ptr stream) {
@@ -2464,8 +2465,8 @@ static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tens

     ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream);

-    (void) src1;
-    (void) src1_d;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src1_d);
 }


@@ -2484,17 +2485,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
     const int64_t ne00 = src0->ne[0];
     const int64_t ne10 = src1->ne[0];

-    const int64_t ne0 = dst->ne[0];
-
     const int64_t row_diff = row_high - row_low;

     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_id()));
+#if !GGML_SYCL_DNNL
+    const int64_t ne0 = dst->ne[0];
     // the main device has a larger memory buffer to hold the results from all GPUs
     // ldc == nrows of the matrix that cuBLAS writes into
     int ldc = id == ctx.device ? ne0 : row_diff;
+#endif

 #ifdef GGML_SYCL_F16
     bool use_fp16 = true; // TODO(Yu) SYCL capability check
@@ -2531,9 +2533,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
             : src1_as_f16.get();
         ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);

+#if !GGML_SYCL_DNNL
         const sycl::half alpha_f16 = 1.0f;
         const sycl::half beta_f16 = 0.0f;
-#if !GGML_SYCL_DNNL
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
             *stream, oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
@@ -2570,9 +2572,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
         const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
         const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();

+#if !GGML_SYCL_DNNL
         const float alpha = 1.0f;
         const float beta = 0.0f;
-#if !GGML_SYCL_DNNL
 # ifdef GGML_SYCL_NVIDIA
         SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
             oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans,
@@ -2590,9 +2592,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
                 src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
 #endif
     }
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_padded_row_size;
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddq_i);
+    GGML_UNUSED(src1_padded_row_size);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -2638,8 +2640,9 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens
                           item_ct1);
         });

-    (void) src1;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2654,9 +2657,10 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor

     sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2673,9 +2677,10 @@ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_te

     sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2694,9 +2699,10 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_ten

     argsort_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2713,9 +2719,10 @@ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tens

     argmax_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, main_stream);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2735,9 +2742,10 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const gg

     diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -2758,9 +2766,10 @@ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tenso
     */
     SYCL_CHECK(0);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -2783,9 +2792,10 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tenso
     */
     SYCL_CHECK(0);

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

 static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
@@ -2862,7 +2872,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten

     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

     const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src1_is_contiguous = ggml_is_contiguous(src1);
@@ -3289,7 +3298,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,

     GGML_TENSOR_BINARY_OP_LOCALS

-    const int64_t ne_dst = ggml_nelements(dst);

     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();;
@@ -3397,6 +3405,7 @@ catch (sycl::exception const &exc) {

 inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     // TODO: accuracy issues in MMQ
+    GGML_UNUSED(type);
     return false;
 }

@@ -3772,7 +3781,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
             GGML_ABORT("fatal error");
     }

-    (void) dst;
+    GGML_UNUSED(dst);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -3783,7 +3792,7 @@ catch (sycl::exception const &exc) {
 static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     // TODO: why do we pass dst as src1 here?
     ggml_sycl_cpy(ctx, src0, dst, nullptr);
-    (void) src1;
+    GGML_UNUSED(src1);
 }

 static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3828,13 +3837,16 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor
 }

 static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    (void) src0;
-    (void) src1;
-    (void) dst;
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ctx);
 }

 void ggml_sycl_set_main_device(const int main_device) try {
-    if (dpct::get_current_device_id() == main_device) return;
+    if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
+        return;
+    }
     check_allow_gpu_index(main_device);
     dpct::select_device(main_device);

@@ -4202,6 +4214,7 @@ try
 {
     ggml_backend_sycl_context *sycl_ctx =
         (ggml_backend_sycl_context *)backend->context;

     sycl::event *sycl_event = static_cast<sycl::event *>(event->context);

     const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0);
@@ -4216,7 +4229,7 @@ catch (sycl::exception const &exc)
 }

 static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
-    ggml_backend_sycl_context* sycl_ctx = static_cast<ggml_backend_sycl_context*>(backend->context);
     sycl::event* sycl_event = static_cast<sycl::event*>(event->context);

     if (ggml_backend_is_sycl(backend)) {
@@ -4475,7 +4488,16 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_SOFT_MAX:
             return true;
         case GGML_OP_ROPE:
+        {
+            const int mode = ((const int32_t *) op->op_params)[2];
+            if (mode & GGML_ROPE_TYPE_MROPE) {
+                return false;
+            }
+            if (mode & GGML_ROPE_TYPE_VISION) {
+                return false;
+            }
             return ggml_is_contiguous(op->src[0]);
+        }
         case GGML_OP_IM2COL:
             // TODO: add support for the new F32 operations
             return op->src[0]->type == GGML_TYPE_F16;
@@ -4624,6 +4646,7 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons
     // SYCL doesn't support registering host memory, left here for reference
     // "ggml_backend_register_host_buffer"
     // "ggml_backend_unregister_host_buffer"
+    GGML_UNUSED(name);
     return nullptr;
 }

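
Note: the new GGML_OP_ROPE case above gates backend support on the RoPE mode bits stored in op_params. A condensed sketch of the check; GGML_ROPE_TYPE_MROPE and GGML_ROPE_TYPE_VISION are bit flags from ggml.h, and the constant values and standalone function here are illustrative assumptions:

    #include <cstdint>

    // Illustrative stand-ins; the real values are defined in ggml.h.
    constexpr int GGML_ROPE_TYPE_MROPE  = 8;
    constexpr int GGML_ROPE_TYPE_VISION = 24;

    static bool sycl_supports_rope(const int32_t * op_params, bool src0_contiguous) {
        const int mode = op_params[2];        // the mode word is the third op parameter
        if (mode & GGML_ROPE_TYPE_MROPE) {    // multi-section RoPE: not implemented here
            return false;
        }
        if (mode & GGML_ROPE_TYPE_VISION) {   // vision RoPE: not implemented here
            return false;
        }
        return src0_contiguous;
    }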
@@ -120,6 +120,7 @@ void ggml_sycl_op_im2col(
         im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
     }

-    (void) src0;
-    (void) src0_dd;
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src0_dd);
+    GGML_UNUSED(ctx);
 }

@@ -813,7 +813,7 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
     }

-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
+    constexpr int blocks_per_tile_x_row = QI4_K > WARP_SIZE ? 1 : WARP_SIZE / QI4_K; // == 1 if QK_K == 256
     const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256

 #pragma unroll
@@ -961,7 +961,7 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
         x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
     }

-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
+    constexpr int blocks_per_tile_x_row = QI5_K > WARP_SIZE ? 1 : WARP_SIZE / QI5_K; // == 1 if QK_K == 256
     const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256

 #pragma unroll
@@ -1109,7 +1109,7 @@ load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql,
                 dpct::sub_sat());
     }

-    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
+    constexpr int blocks_per_tile_x_row = QI6_K > WARP_SIZE ? 1 : WARP_SIZE / QI6_K; // == 1 if QK_K == 256
     const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256
     float * x_dmf = (float *) x_dm;

@@ -3020,9 +3020,9 @@ void ggml_sycl_op_mul_mat_q(
             break;
     }

-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddf_i);
 }
 catch (sycl::exception const &exc) {
     std::cerr << exc.what() << "Exception caught at file:" << __FILE__

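
Note: the constexpr guard above matters when a quant block spans more ints than the warp is wide: the old expression then evaluated to zero and made the following modulo undefined. A minimal sketch under assumed values (the 16-wide subgroup and the QI4_K value are illustrative):

    // Assumed values: a 16-wide subgroup, 32 ints per q4_K block.
    constexpr int WARP_SIZE = 16;
    constexpr int QI4_K     = 32;

    // Old form: WARP_SIZE / QI4_K == 0 here, so `k % blocks_per_tile_x_row`
    // divides by zero. The guarded form clamps to 1 and stays compile-time:
    constexpr int blocks_per_tile_x_row = QI4_K > WARP_SIZE ? 1 : WARP_SIZE / QI4_K;
    static_assert(blocks_per_tile_x_row >= 1, "modulo below is always well-defined");

    int kbxd_of(int k) { return k % blocks_per_tile_x_row; }

Because the value is constexpr, the compiler can also fold the modulo away entirely in the common blocks_per_tile_x_row == 1 case.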
@@ -753,11 +753,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, QK_WARP_SIZE);
     {

         stream->submit([&](sycl::handler & cgh) {
-            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -780,9 +776,6 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
     {

         stream->submit([&](sycl::handler &cgh) {
-            auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -805,9 +798,6 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
     {

         stream->submit([&](sycl::handler &cgh) {
-            auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -830,8 +820,6 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
     {

         stream->submit([&](sycl::handler &cgh) {
-            auto iq3s_grid_ptr_ct1 = &iq3s_grid[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -854,9 +842,6 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     {

         stream->submit([&](sycl::handler &cgh) {
-            auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0];
-            auto ksigns64_ptr_ct1 = &ksigns64[0];
-
             cgh.parallel_for(
                 sycl::nd_range<3>(block_nums * block_dims, block_dims),
                 [=](sycl::nd_item<3> item_ct1)
@@ -954,7 +939,7 @@ void ggml_sycl_op_mul_mat_vec_q(
     const size_t q8_1_bs = QK8_1;
     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the kernel writes into
-    const int64_t nrows_dst = id == ctx.device ? ne00 : row_diff;
     for (int i = 0; i < src1_ncols; i++)
     {
         const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
@@ -1023,7 +1008,8 @@ void ggml_sycl_op_mul_mat_vec_q(
                 break;
         }
     }
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_ddf_i);
+    GGML_UNUSED(ctx);
 }

@@ -31,7 +31,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
     */
     item_ct1.barrier(sycl::access::fence_space::local_space);
     mean_var = 0.f;
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;
     for (size_t i = 0; i < nreduce; i += 1)
     {
         mean_var += s_sum[lane_id + i * WARP_SIZE];
@@ -55,7 +55,7 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
     const int nthreads = item_ct1.get_local_range(2);
     const int nwarps = nthreads / WARP_SIZE;
     start += item_ct1.get_local_id(2);
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;

     if (end >= ne_elements) {
         end = ne_elements;
@@ -163,7 +163,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
     converged control flow. You may need to adjust the code.
     */
     item_ct1.barrier(sycl::access::fence_space::local_space);
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;
     tmp = 0.f;
     for (size_t i = 0; i < nreduce; i += 1)
     {
@@ -352,6 +352,7 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor*
     (void)src1;
     (void)dst;
     (void)src1_dd;
+    GGML_UNUSED(ctx);
 }

 void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0,
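
Note: the nreduce retyping here (and again in soft_max_f32 below) aligns the bound with the size_t loop counter, silencing -Wsign-compare without changing the computed value. A minimal sketch of the pattern; the function wrapper and parameter names are illustrative:

    #include <cstddef>

    static float sum_partials(const float * s_sum, int lane_id, int nthreads, int warp_size) {
        const int nwarps  = nthreads / warp_size;
        size_t    nreduce = nwarps / warp_size;   // size_t matches the loop counter below
        float acc = 0.f;
        for (size_t i = 0; i < nreduce; i += 1) {
            acc += s_sum[lane_id + i * warp_size];
        }
        return acc;
    }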
@@ -269,7 +269,8 @@ void ggml_sycl_op_rope(
         }
     }

-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
+    GGML_UNUSED(src1);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src1_dd);
+    GGML_UNUSED(ctx);
 }

@@ -16,7 +16,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
     const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
     const int nthreads = block_size;
     const int nwarps = nthreads / WARP_SIZE;
-    int nreduce = nwarps / WARP_SIZE;
+    size_t nreduce = nwarps / WARP_SIZE;
     float slope = 1.0f;

     // ALiBi
@@ -53,9 +53,10 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
     if (block_size > WARP_SIZE) {
         if (warp_id == 0) {
             buf[lane_id] = -INFINITY;
-            for (size_t i = 1; i < nreduce; i += 1)
+            for (size_t i = 1; i < nreduce; i += 1) {
                 buf[lane_id + i * WARP_SIZE] = -INFINITY;
+            }
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);

         if (lane_id == 0) {
@@ -63,8 +64,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);
         max_val = buf[lane_id];
-        for (size_t i = 1; i < nreduce; i += 1)
-        {
+        for (size_t i = 1; i < nreduce; i += 1) {
             max_val = std::max(max_val, buf[lane_id + i * WARP_SIZE]);
         }
         max_val = warp_reduce_max(max_val, item_ct1);
@@ -89,9 +89,10 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         item_ct1.barrier(sycl::access::fence_space::local_space);
         if (warp_id == 0) {
             buf[lane_id] = 0.f;
-            for (size_t i = 1; i < nreduce; i += 1)
+            for (size_t i = 1; i < nreduce; i += 1) {
                 buf[lane_id + i * WARP_SIZE] = 0.f;
+            }
         }
         item_ct1.barrier(sycl::access::fence_space::local_space);

         if (lane_id == 0) {
@@ -100,8 +101,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         item_ct1.barrier(sycl::access::fence_space::local_space);

         tmp = buf[lane_id];
-        for (size_t i = 1; i < nreduce; i += 1)
-        {
+        for (size_t i = 1; i < nreduce; i += 1) {
             tmp += buf[lane_id + i * WARP_SIZE];
         }
         tmp = warp_reduce_sum(tmp, item_ct1);

@@ -68,4 +68,5 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, const ggml
     const int max_period = dst->op_params[1];

     timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
+    GGML_UNUSED(src1);
 }

@@ -59,7 +59,7 @@ static void rwkv_wkv_f32_kernel(
     float y = 0;

     // Process in chunks of 4 for better vectorization
-    sycl::float4 k4, r4, tf4, td4, s4, kv4;
+    sycl::float4 k4, r4, tf4, td4, s4;
 #pragma unroll
     for (int j = 0; j < head_size; j += 4) {
         // Load data in vec4 chunks
@@ -135,4 +135,7 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, const ggml_tensor* s
             );
         });
     });
+
+    GGML_UNUSED(src0);
+    GGML_UNUSED(src1);
 }

@@ -1,11 +1,13 @@
 #pragma once

+#include "ggml.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif

-void ggml_critical_section_start(void);
-void ggml_critical_section_end(void);
+GGML_API void ggml_critical_section_start(void);
+GGML_API void ggml_critical_section_end(void);

 #ifdef __cplusplus
 }

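
Note: adding GGML_API (and the ggml.h include that defines it) exports the critical-section functions from the ggml shared library; without the annotation they are invisible to other binaries on Windows or under hidden-symbol builds. Such export macros typically expand along these lines; this is an illustrative sketch, not the verbatim ggml.h definition:

    #ifdef GGML_SHARED
    #    if defined(_WIN32)
    #        ifdef GGML_BUILD
    #            define GGML_API __declspec(dllexport)   // building the DLL itself
    #        else
    #            define GGML_API __declspec(dllimport)   // consuming the DLL
    #        endif
    #    else
    #        define GGML_API __attribute__((visibility("default")))
    #    endif
    #else
    #    define GGML_API                                  // static builds need no annotation
    #endif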
@ -163,7 +163,11 @@ struct vk_device_struct {
|
||||||
uint32_t shader_core_count;
|
uint32_t shader_core_count;
|
||||||
bool uma;
|
bool uma;
|
||||||
bool float_controls_rte_fp16;
|
bool float_controls_rte_fp16;
|
||||||
bool coopmat2;
|
|
||||||
|
bool subgroup_size_control;
|
||||||
|
uint32_t subgroup_min_size;
|
||||||
|
uint32_t subgroup_max_size;
|
||||||
|
bool subgroup_require_full_support;
|
||||||
|
|
||||||
bool coopmat_support;
|
bool coopmat_support;
|
||||||
bool coopmat_acc_f32_support;
|
bool coopmat_acc_f32_support;
|
||||||
|
@ -171,6 +175,7 @@ struct vk_device_struct {
|
||||||
uint32_t coopmat_m;
|
uint32_t coopmat_m;
|
||||||
uint32_t coopmat_n;
|
uint32_t coopmat_n;
|
||||||
uint32_t coopmat_k;
|
uint32_t coopmat_k;
|
||||||
|
bool coopmat2;
|
||||||
|
|
||||||
size_t idx;
|
size_t idx;
|
||||||
|
|
||||||
|
@ -757,8 +762,12 @@ static uint32_t compile_count = 0;
|
||||||
static std::mutex compile_count_mutex;
|
static std::mutex compile_count_mutex;
|
||||||
static std::condition_variable compile_count_cond;
|
static std::condition_variable compile_count_cond;
|
||||||
|
|
||||||
static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align, bool disable_robustness) {
|
static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint,
|
||||||
VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
|
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants,
|
||||||
|
uint32_t align, bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
|
||||||
|
VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size <<
|
||||||
|
", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align <<
|
||||||
|
", " << disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
|
||||||
GGML_ASSERT(parameter_count > 0);
|
GGML_ASSERT(parameter_count > 0);
|
||||||
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
||||||
|
|
||||||
|
@ -817,14 +826,28 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
||||||
specialization_constants.data()
|
specialization_constants.data()
|
||||||
);
|
);
|
||||||
|
|
||||||
|
vk::PipelineShaderStageCreateFlags pipeline_shader_stage_create_flags{};
|
||||||
|
|
||||||
|
if (device->subgroup_require_full_support && require_full_subgroups) {
|
||||||
|
pipeline_shader_stage_create_flags |= vk::PipelineShaderStageCreateFlagBits::eRequireFullSubgroupsEXT;
|
||||||
|
}
|
||||||
|
|
||||||
vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
|
vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
|
||||||
vk::PipelineShaderStageCreateFlags(),
|
pipeline_shader_stage_create_flags,
|
||||||
vk::ShaderStageFlagBits::eCompute,
|
vk::ShaderStageFlagBits::eCompute,
|
||||||
pipeline->shader_module,
|
pipeline->shader_module,
|
||||||
entrypoint.c_str(),
|
entrypoint.c_str(),
|
||||||
&specialization_info);
|
&specialization_info);
|
||||||
|
|
||||||
|
vk::PipelineShaderStageRequiredSubgroupSizeCreateInfoEXT pipeline_shader_stage_required_subgroup_size_create_info;
|
||||||
|
pipeline_shader_stage_required_subgroup_size_create_info.requiredSubgroupSize = required_subgroup_size;
|
||||||
|
if (device->subgroup_size_control && required_subgroup_size > 0) {
|
||||||
|
GGML_ASSERT(device->subgroup_min_size <= required_subgroup_size && required_subgroup_size <= device->subgroup_max_size);
|
||||||
|
pipeline_shader_create_info.setPNext(&pipeline_shader_stage_required_subgroup_size_create_info);
|
||||||
|
}
|
||||||
|
|
||||||
vk::ComputePipelineCreateInfo compute_pipeline_create_info(
|
vk::ComputePipelineCreateInfo compute_pipeline_create_info(
|
||||||
vk::PipelineCreateFlags(),
|
vk::PipelineCreateFlags{},
|
||||||
pipeline_shader_create_info,
|
pipeline_shader_create_info,
|
||||||
pipeline->layout);
|
pipeline->layout);
|
||||||
|
|
||||||
|
@ -1504,7 +1527,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();
|
||||||
|
|
||||||
std::vector<std::future<void>> compiles;
|
std::vector<std::future<void>> compiles;
|
||||||
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants, uint32_t align, bool disable_robustness = false) {
|
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
|
||||||
|
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
|
||||||
|
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
|
||||||
{
|
{
|
||||||
// wait until fewer than N compiles are in progress
|
// wait until fewer than N compiles are in progress
|
||||||
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
||||||
|
@ -1514,7 +1539,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
}
|
}
|
||||||
compile_count++;
|
compile_count++;
|
||||||
}
|
}
|
||||||
compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness));
|
compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint,
|
||||||
|
parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness, require_full_subgroups, required_subgroup_size));
|
||||||
};
|
};
|
||||||
|
|
||||||
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||||
|
@ -1620,28 +1646,33 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
// Create 6 variants, {s,m,l}x{unaligned,aligned}
|
||||||
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||||
if (device->mul_mat ## ID ## _l) \
|
if (device->mul_mat ## ID ## _l) \
|
||||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \
|
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \
|
||||||
if (device->mul_mat ## ID ## _m) \
|
if (device->mul_mat ## ID ## _m) \
|
||||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \
|
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \
|
||||||
if (device->mul_mat ## ID ## _s) \
|
if (device->mul_mat ## ID ## _s) \
|
||||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \
|
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \
|
||||||
if (device->mul_mat ## ID ## _l) \
|
if (device->mul_mat ## ID ## _l) \
|
||||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \
|
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \
|
||||||
if (device->mul_mat ## ID ## _m) \
|
if (device->mul_mat ## ID ## _m) \
|
||||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \
|
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \
|
||||||
if (device->mul_mat ## ID ## _s) \
|
if (device->mul_mat ## ID ## _s) \
|
||||||
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \
|
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \
|
||||||
|
|
||||||
// Create 2 variants, {f16,f32} accumulator
|
// Create 2 variants, {f16,f32} accumulator
|
||||||
#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||||
|
if (device->coopmat_acc_f16_support) { \
|
||||||
CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||||
|
} \
|
||||||
|
if (device->coopmat_acc_f32_support) { \
|
||||||
CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
|
||||||
|
} \
|
||||||
|
|
||||||
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
|
||||||
|
|
||||||
|
if (device->coopmat_acc_f16_support) {
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
@ -1654,6 +1685,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
} else {
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
|
||||||
|
}
|
||||||
|
|
||||||
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
|
||||||
if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
|
if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) {
|
||||||
|
@ -1661,6 +1706,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);

if (device->coopmat_acc_f16_support) {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
@ -1673,7 +1719,22 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
} else {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);

CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
}
}
}
#undef CREATE_MM2
#undef CREATE_MM
} else if (device->fp16) {
// Create 6 variants, {s,m,l}x{unaligned,aligned}
@ -1691,6 +1752,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
if (device->mul_mat ## ID ## _s) \
ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \

// Create 2 variants, {f16,f32} accumulator
#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
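// CREATE_MM2 instantiates both accumulator precisions for a pipeline pair; for
// example, CREATE_MM2(pipeline_matmul_f16, matmul_f16, ...) expands to
//   CREATE_MM(pipeline_matmul_f16.f16acc, matmul_f16, _f16acc, ...)
//   CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , ...)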
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
@ -1728,6 +1794,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
}
#undef CREATE_MM2
#undef CREATE_MM
} else {
// Create 6 variants, {s,m,l}x{unaligned,aligned}
@ -1782,53 +1849,58 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
}
#undef CREATE_MM2
#undef CREATE_MM
}

// mul mat vec
// computing two rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
// AMD GCN and Intel graphics cards perform best when the number of rows per shader is doubled
uint32_t rm = 1;
if ((device->vendor_id == VK_VENDOR_ID_AMD && device->subgroup_min_size == 64 && device->subgroup_max_size == 64) || device->vendor_id == VK_VENDOR_ID_INTEL)
    rm = 2;

// computing additional rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0.
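// rm feeds into the pipeline creation below: the quantized mul_mat_vec variants
// dispatch {2*rm, 1, 1} workgroup denominators (1*rm for Q8_0) and pass 2*rm (or
// 1*rm) as the rows-per-workgroup specialization constant, so the affected GPUs
// cover twice as many output rows per workgroup.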
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);

ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);

ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true);

// dequant shaders
ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
@ -2008,6 +2080,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
amd_shader_core_properties2 = true;
} else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) {
pipeline_robustness = true;
} else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
device->subgroup_size_control = true;
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_COOPMAT")) {
device->coopmat_support = true;
@ -2028,6 +2102,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props;
vk::PhysicalDeviceVulkan12Properties vk12_props;
vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;

props2.pNext = &props3;
props3.pNext = &subgroup_props;
subgroup_props.pNext = &driver_props;
@ -2047,6 +2123,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
}
if (device->subgroup_size_control) {
last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_props;
last_struct = (VkBaseOutStructure *)&subgroup_size_control_props;
}

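// Standard Vulkan pNext chaining: each optional *Properties struct is appended at
// the tail tracked by last_struct, so a single properties2 query fills every struct
// that was linked in, including subgroup_size_control_props when the extension was
// detected above.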
#if defined(VK_NV_cooperative_matrix2)
vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props;
@ -2085,7 +2165,7 @@ static vk_device ggml_vk_get_device(size_t idx) {

device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;

if (device->vendor_id == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
if (device->vendor_id == VK_VENDOR_ID_INTEL || (device->vendor_id == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
// Intel drivers don't support coopmat properly yet
// Only RADV supports coopmat properly on AMD
device->coopmat_support = false;
@ -2141,6 +2221,17 @@ static vk_device ggml_vk_get_device(size_t idx) {
device_extensions.push_back("VK_EXT_pipeline_robustness");
}

VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features;
subgroup_size_control_features.pNext = nullptr;
subgroup_size_control_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT;
subgroup_size_control_features.computeFullSubgroups = false;
subgroup_size_control_features.subgroupSizeControl = false;

if (device->subgroup_size_control) {
last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_features;
last_struct = (VkBaseOutStructure *)&subgroup_size_control_features;
}

VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
coopmat_features.pNext = nullptr;
coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
@ -2168,6 +2259,20 @@ static vk_device ggml_vk_get_device(size_t idx) {

device->pipeline_robustness = pl_robustness_features.pipelineRobustness;

if (device->subgroup_size_control) {
device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize;
device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize;
}

device->subgroup_size_control = device->subgroup_size_control &&
(subgroup_size_control_props.requiredSubgroupSizeStages & vk::ShaderStageFlagBits::eCompute) &&
subgroup_size_control_features.subgroupSizeControl;

if (device->subgroup_size_control) {
device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups;
device_extensions.push_back("VK_EXT_subgroup_size_control");
}

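// Three conditions gate subgroup size control: the extension was detected, the
// driver allows a required subgroup size on compute stages, and the feature query
// reported subgroupSizeControl; only when all hold is VK_EXT_subgroup_size_control
// actually requested for the device.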
device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;

if (coopmat2_support) {
@ -2317,7 +2422,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
}
}

if (device->coopmat_m == 0) {
if (device->coopmat_m == 0 || !device->coopmat_acc_f32_support) {
// No suitable matmul mode found
GGML_LOG_DEBUG("ggml_vulkan: WARNING: No suitable matrix core mode found. Disabling matrix cores.\n");
device->coopmat_support = false;
@ -2450,7 +2555,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
}
}

if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource))) {
// Intel drivers don't support coopmat properly yet
// Only RADV supports coopmat properly on AMD
coopmat_support = false;
@ -2737,7 +2842,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
return ctx->device->pipeline_matmul_f32_f16;
}
if (prec == GGML_PREC_DEFAULT && ctx->device->fp16) {
if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) {
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
return ctx->device->pipeline_matmul_f16_f32.f16acc;
}
@ -2812,7 +2917,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
return ctx->device->pipeline_matmul_id_f32;
}
if (prec == GGML_PREC_DEFAULT && ctx->device->fp16) {
if (prec == GGML_PREC_DEFAULT && ctx->device->fp16 && !(ctx->device->coopmat_support && !ctx->device->coopmat_acc_f16_support)) {
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
return ctx->device->pipeline_matmul_id_f16_f32.f16acc;
}
@ -7696,7 +7801,16 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_REPEAT:
return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
case GGML_OP_ROPE:
{
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
return false;
}
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
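// op_params[2] carries the RoPE mode bit field; the multi-section (MROPE) and
// vision variants are reported as unsupported here, since the Vulkan RoPE shaders
// at this point only implement the standard and NeoX rotations.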
return ggml_is_contiguous(op->src[0]);
}
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
@ -25,92 +25,94 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {

#if defined(DATA_A_Q4_0)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d;
return (vec2(vui & 0xF, vui >> 4) - 8.0f);
}
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a_packed16[a_offset + ib].d);
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) - 8.0f) * d;
return (vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12) - 8.0f);
}
#endif

#if defined(DATA_A_Q4_1)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
const float m = float(data_a[a_offset + ib].m);
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return vec2(vui & 0xF, vui >> 4) * d + m;
return vec2(vui & 0xF, vui >> 4);
}
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a_packed16[a_offset + ib].d);
const float m = float(data_a_packed16[a_offset + ib].m);
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, (vui >> 12) & 0xF) * d + m;
return vec4(vui & 0xF, (vui >> 4) & 0xF, (vui >> 8) & 0xF, vui >> 12);
}
#endif

#if defined(DATA_A_Q5_0)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
const uint uint_qh = uint(data_a[a_offset + ib].qh[1]) << 16 | data_a[a_offset + ib].qh[0];
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d;
return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f);
}
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a_packed16[a_offset + ib].d);
const uint uint_qh = uint(data_a_packed16[a_offset + ib].qh[1]) << 16 | data_a_packed16[a_offset + ib].qh[0];
const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
return (vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) - 16.0f) * d;
return (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f);
}
#endif

#if defined(DATA_A_Q5_1)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
const float m = float(data_a[a_offset + ib].m);
const uint uint_qh = data_a[a_offset + ib].qh;
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m;
return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y);
}
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a_packed16[a_offset + ib].d);
const float m = float(data_a_packed16[a_offset + ib].m);
const uint uint_qh = data_a_packed16[a_offset + ib].qh;
const ivec2 qh0 = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
const ivec2 qh1 = ivec2(((uint_qh >> (iqs + 1)) << 4) & 0x10, (uint_qh >> (iqs + 13)) & 0x10);
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
return vec4(((vui >> 0) & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, ((vui >> 12) & 0xF) | qh1.y) * d + m;
return vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y);
}
#endif

#if defined(DATA_A_Q8_0)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1]));
return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
}
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a_packed16[a_offset + ib].d);
uint32_t v0 = data_a_packed16[a_offset + ib].qs[iqs/2];
uint32_t v1 = data_a_packed16[a_offset + ib].qs[iqs/2 + 1];
return vec4(int8_t(v0 & 0xFF), int8_t((v0 >> 8) & 0xFF), int8_t(v1 & 0xFF), int8_t((v1 >> 8) & 0xFF)) * d;
return vec4(int8_t(v0 & 0xFF), int8_t(v0 >> 8), int8_t(v1 & 0xFF), int8_t(v1 >> 8));
}
#endif

#if defined(DATA_A_IQ4_NL)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a[a_offset + ib].d);
const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]);
}
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
const float d = float(data_a_packed16[a_offset + ib].d);
const uint vui = uint(data_a_packed16[a_offset + ib].qs[iqs/2]);
return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[(vui >> 12) & 0xF]) * d;
return vec4(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[(vui >> 4) & 0xF], kvalues_iq4nl[(vui >> 8) & 0xF], kvalues_iq4nl[vui >> 12]);
}
#endif

#if defined(DATA_A_F32) || defined(DATA_A_F16)
vec2 get_dm(uint ib, uint a_offset) {
return vec2(0, 0);
}
#endif

#if defined(DATA_A_Q4_0) || defined(DATA_A_Q5_0) || defined(DATA_A_Q8_0) || defined(DATA_A_IQ4_NL)
vec2 get_dm(uint ib, uint a_offset) {
return vec2(float(data_a[a_offset + ib].d), 0);
}
#endif

#if defined(DATA_A_Q4_1) || defined(DATA_A_Q5_1)
vec2 get_dm(uint ib, uint a_offset) {
return vec2(float(data_a[a_offset + ib].d), float(data_a[a_offset + ib].m));
}
#endif
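// With this change the dequantize()/dequantize4() helpers return raw (unscaled)
// quantized values, and the per-block scale d and offset m are exposed once through
// get_dm(); callers apply them as v = v * dm.x + dm.y (see the dequant main() hunk
// below) instead of multiplying d and m into every dequantized pair.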
@ -9,8 +9,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};

void main() {
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
const uint i = gl_WorkGroupID.x * 256 + wgy;
const uint ib = gl_WorkGroupID.x * 256 + wgy;
if (i >= p.M * p.K / QUANT_K) {
if (ib >= p.M * p.K / QUANT_K) {
return;
}

@ -20,37 +20,49 @@ void main() {
const uint is = 2 * il;
const uint n = 4;

const FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);

const uint y_idx = i * QUANT_K + 64 * il + n * ir;
const uint y_idx = ib * QUANT_K + 64 * il + n * ir;
const uint qs_idx = 32*il + n * ir;

uint8_t sc;
uint8_t m;
if (is < 4) {
sc = uint8_t(data_a[i].scales[is] & 63);
m = uint8_t(data_a[i].scales[is + 4] & 63);
} else {
sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
m = uint8_t((data_a[i].scales[is + 4] >> 4) | ((data_a[i].scales[is ] >> 6) << 4));
}
const FLOAT_TYPE d1 = dall * sc;
const FLOAT_TYPE m1 = dmin * m;
uint scidx0 = (is < 4) ? is : (is + 4);
uint scidx1 = (is < 4) ? is : (is - 4);
uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
uint scidxshift1 = (is < 4) ? 0 : 2;
uint mbidx0 = is + 4;
uint mbidx1 = (is < 4) ? is + 4 : is;
uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
uint mbidxshift0 = (is < 4) ? 0 : 4;
uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
uint mbidxshift1 = (is < 4) ? 0 : 2;

uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));

const FLOAT_TYPE d1 = dall * sc;
const FLOAT_TYPE m1 = dmin * mbyte;

scidx0 = (is < 4) ? is + 1 : (is + 5);
scidx1 = (is < 4) ? is + 1 : (is - 3);
scidxmask1 = (is < 4) ? 0x30 : 0xC0;
scidxshift1 = (is < 4) ? 0 : 2;
mbidx0 = is + 5;
mbidx1 = (is < 4) ? is + 5 : is + 1;
mbidxmask0 = (is < 4) ? 0xF : 0xF0;
mbidxshift0 = (is < 4) ? 0 : 4;
mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
mbidxshift1 = (is < 4) ? 0 : 2;

sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
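// The K-quant scale layout packs twelve bytes of 6-bit (scale, min) pairs: entries
// 0-3 live directly in the low 6 bits of scales[0..7], while entries 4-7 split into
// a low nibble in scales[8..11] plus two high bits borrowed from the top of
// scales[0..7]. The index/mask/shift values above encode that layout branchlessly,
// replacing the old if (is < 4) selection.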

if (is < 4) {
sc = uint8_t(data_a[i].scales[is + 1] & 63);
m = uint8_t(data_a[i].scales[is + 5] & 63);
} else {
sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
m = uint8_t((data_a[i].scales[is + 5] >> 4) | ((data_a[i].scales[is + 1] >> 6) << 4));
}
const FLOAT_TYPE d2 = dall * sc;
const FLOAT_TYPE m2 = dmin * m;
const FLOAT_TYPE m2 = dmin * mbyte;

[[unroll]] for (uint l = 0; l < n; ++l) {
data_b[y_idx + l     ] = D_TYPE(d1 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] & 0xF) - m1);
data_b[y_idx + l     ] = D_TYPE(d1 * FLOAT_TYPE(data_a[ib].qs[qs_idx + l] & 0xF) - m1);
data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[i].qs[qs_idx + l] >> 4) - m2);
data_b[y_idx + l + 32] = D_TYPE(d2 * FLOAT_TYPE(data_a[ib].qs[qs_idx + l] >> 4) - m2);
}
}
}
@ -9,8 +9,8 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};

void main() {
[[unroll]] for (uint wgy = 0; wgy < 256; wgy++) {
const uint i = gl_WorkGroupID.x * 256 + wgy;
const uint ib = gl_WorkGroupID.x * 256 + wgy;
if (i >= p.M * p.K / QUANT_K) {
if (ib >= p.M * p.K / QUANT_K) {
return;
}

@@ -19,40 +19,52 @@ void main() {
const uint ir = tid % 16;
const uint is = 2 * il;

-const FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
-const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
+const FLOAT_TYPE dall = FLOAT_TYPE(data_a[ib].d.x);
+const FLOAT_TYPE dmin = FLOAT_TYPE(data_a[ib].d.y);

-const uint y_idx = i * QUANT_K + 64 * il + 2 * ir;
+const uint y_idx = ib * QUANT_K + 64 * il + 2 * ir;
const uint qs_idx = 32*il + 2 * ir;
const uint qh_idx = 2 * ir;

-uint8_t sc;
-uint8_t m;
-if (is < 4) {
-sc = uint8_t(data_a[i].scales[is] & 63);
-m = uint8_t(data_a[i].scales[is + 4] & 63);
-} else {
-sc = uint8_t((data_a[i].scales[is + 4] & 0xF) | ((data_a[i].scales[is - 4] >> 6) << 4));
-m = uint8_t((data_a[i].scales[is + 4] >> 4) | ((data_a[i].scales[is ] >> 6) << 4));
-}
-const FLOAT_TYPE d1 = dall * sc;
-const FLOAT_TYPE m1 = dmin * m;
+uint scidx0 = (is < 4) ? is : (is + 4);
+uint scidx1 = (is < 4) ? is : (is - 4);
+uint scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+uint scidxshift1 = (is < 4) ? 0 : 2;
+uint mbidx0 = is + 4;
+uint mbidx1 = (is < 4) ? is + 4 : is;
+uint mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+uint mbidxshift0 = (is < 4) ? 0 : 4;
+uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+uint mbidxshift1 = (is < 4) ? 0 : 2;
+
+uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
+
+const FLOAT_TYPE d1 = dall * sc;
+const FLOAT_TYPE m1 = dmin * mbyte;
+
+scidx0 = (is < 4) ? is + 1 : (is + 5);
+scidx1 = (is < 4) ? is + 1 : (is - 3);
+scidxmask1 = (is < 4) ? 0x30 : 0xC0;
+scidxshift1 = (is < 4) ? 0 : 2;
+mbidx0 = is + 5;
+mbidx1 = (is < 4) ? is + 5 : is + 1;
+mbidxmask0 = (is < 4) ? 0xF : 0xF0;
+mbidxshift0 = (is < 4) ? 0 : 4;
+mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
+mbidxshift1 = (is < 4) ? 0 : 2;
+
+sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1));
+mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1));
-if (is < 4) {
-sc = uint8_t(data_a[i].scales[is + 1] & 63);
-m = uint8_t(data_a[i].scales[is + 5] & 63);
-} else {
-sc = uint8_t((data_a[i].scales[is + 5] & 0xF) | ((data_a[i].scales[is - 3] >> 6) << 4));
-m = uint8_t((data_a[i].scales[is + 5] >> 4) | ((data_a[i].scales[is + 1] >> 6) << 4));
-}
+
const FLOAT_TYPE d2 = dall * sc;
-const FLOAT_TYPE m2 = dmin * m;
+const FLOAT_TYPE m2 = dmin * mbyte;

const uint8_t hm1 = uint8_t(1 << (2 * il ));
const uint8_t hm2 = uint8_t(1 << (2 * il + 1));
-data_b[y_idx ] = D_TYPE(d1 * FLOAT_TYPE((data_a[i].qs[qs_idx ] & 0xF) + (((data_a[i].qh[qh_idx ] & hm1) != 0) ? 16 : 0)) - m1);
-data_b[y_idx + 1] = D_TYPE(d1 * FLOAT_TYPE((data_a[i].qs[qs_idx + 1] & 0xF) + (((data_a[i].qh[qh_idx + 1] & hm1) != 0) ? 16 : 0)) - m1);
-data_b[y_idx + 32] = D_TYPE(d2 * FLOAT_TYPE((data_a[i].qs[qs_idx ] >> 4) + (((data_a[i].qh[qh_idx ] & hm2) != 0) ? 16 : 0)) - m2);
-data_b[y_idx + 33] = D_TYPE(d2 * FLOAT_TYPE((data_a[i].qs[qs_idx + 1] >> 4) + (((data_a[i].qh[qh_idx + 1] & hm2) != 0) ? 16 : 0)) - m2);
+data_b[y_idx ] = D_TYPE(d1 * FLOAT_TYPE((data_a[ib].qs[qs_idx ] & 0xF) + (((data_a[ib].qh[qh_idx ] & hm1) != 0) ? 16 : 0)) - m1);
+data_b[y_idx + 1] = D_TYPE(d1 * FLOAT_TYPE((data_a[ib].qs[qs_idx + 1] & 0xF) + (((data_a[ib].qh[qh_idx + 1] & hm1) != 0) ? 16 : 0)) - m1);
+data_b[y_idx + 32] = D_TYPE(d2 * FLOAT_TYPE((data_a[ib].qs[qs_idx ] >> 4) + (((data_a[ib].qh[qh_idx ] & hm2) != 0) ? 16 : 0)) - m2);
+data_b[y_idx + 33] = D_TYPE(d2 * FLOAT_TYPE((data_a[ib].qs[qs_idx + 1] >> 4) + (((data_a[ib].qh[qh_idx + 1] & hm2) != 0) ? 16 : 0)) - m2);
}
}
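The rewrite above replaces the divergent if/else scale lookup with index, mask, and shift values computed via ternaries, which GPU compilers can lower to select instructions instead of branches. The following standalone C++ check is not part of the patch; it assumes the standard 12-byte K-quant scales packing and verifies that the branchy and branchless extractions agree for every sub-block index:

#include <cstdint>
#include <cstdio>

// original branchy form of the 6-bit scale/min extraction
static void branchy(const uint8_t scales[12], int is, uint8_t &sc, uint8_t &m) {
    if (is < 4) {
        sc = scales[is] & 63;
        m  = scales[is + 4] & 63;
    } else {
        sc = (scales[is + 4] & 0xF) | ((scales[is - 4] >> 6) << 4);
        m  = (scales[is + 4] >>  4) | ((scales[is    ] >> 6) << 4);
    }
}

// branchless form: precomputed indices/masks/shifts, as in the shader above
static void branchless(const uint8_t scales[12], int is, uint8_t &sc, uint8_t &m) {
    const int scidx0      = (is < 4) ? is : is + 4;
    const int scidx1      = (is < 4) ? is : is - 4;
    const int scidxmask1  = (is < 4) ? 0x30 : 0xC0;
    const int scidxshift1 = (is < 4) ? 0 : 2;
    const int mbidx0      = is + 4;
    const int mbidx1      = (is < 4) ? is + 4 : is;
    const int mbidxmask0  = (is < 4) ? 0xF : 0xF0;
    const int mbidxshift0 = (is < 4) ? 0 : 4;
    const int mbidxmask1  = (is < 4) ? 0x30 : 0xC0;
    const int mbidxshift1 = (is < 4) ? 0 : 2;
    sc = uint8_t((scales[scidx0] & 0xF) | ((scales[scidx1] & scidxmask1) >> scidxshift1));
    m  = uint8_t((scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((scales[mbidx1] & mbidxmask1) >> mbidxshift1));
}

int main() {
    uint8_t scales[12];
    for (uint32_t seed = 1; seed <= 10000; ++seed) {
        // fill the scales bytes with pseudo-random data
        uint32_t x = seed * 2654435761u;
        for (int i = 0; i < 12; ++i) { x = x * 1664525u + 1013904223u; scales[i] = uint8_t(x >> 24); }
        for (int is = 0; is < 8; ++is) {
            uint8_t s0, m0, s1, m1;
            branchy(scales, is, s0, m0);
            branchless(scales, is, s1, m1);
            if (s0 != s1 || m0 != m1) { printf("mismatch at is=%d\n", is); return 1; }
        }
    }
    printf("branchy and branchless extraction agree\n");
    return 0;
}

In the shader the same formulas are reused for the second pair by substituting is + 1, which is why the indices shift to is + 1, is + 5, and is - 3 in the second block.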
@@ -31,6 +31,8 @@ void main() {
const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2;

vec2 v = dequantize(ib, iqs, 0);
+const vec2 dm = get_dm(ib, 0);
+v = v * dm.x + dm.y;

data_d[d_offset + iybs + iqs ] = D_TYPE(v.x);
data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y);
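The dequantize step above now returns raw quant values, with the per-block scale and min applied separately via get_dm. A minimal scalar sketch of that affine step, assuming a simplified hypothetical block layout with scale d and min m (not the actual ggml block definition):

#include <cstdint>
#include <cstdio>

struct block { float d; float m; uint8_t qs[16]; }; // hypothetical simplified block

// mirrors the shader's v = v * dm.x + dm.y, i.e. y = d * q + m
static float dequant(const block &b, int i) {
    const int q = (i < 16) ? (b.qs[i] & 0xF) : (b.qs[i - 16] >> 4);
    return b.d * float(q) + b.m;
}

int main() {
    block b = { 0.1f, -0.8f, { 0x3A } };              // remaining qs default to 0
    printf("%f %f\n", dequant(b, 0), dequant(b, 16)); // low nibble 0xA, high nibble 0x3
    return 0;
}

For quant types without a min component, get_dm would return a zero second component and the add is a no-op.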
@@ -31,27 +31,13 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_

#if K_PER_ITER == 8
#if QUANT_R == 2
-B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4];
-B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4];
-FLOAT_TYPE b0 = FLOAT_TYPE(bv02.x);
-FLOAT_TYPE b1 = FLOAT_TYPE(bv13.x);
-FLOAT_TYPE b2 = FLOAT_TYPE(bv02.y);
-FLOAT_TYPE b3 = FLOAT_TYPE(bv13.y);
-FLOAT_TYPE b4 = FLOAT_TYPE(bv02.z);
-FLOAT_TYPE b5 = FLOAT_TYPE(bv13.z);
-FLOAT_TYPE b6 = FLOAT_TYPE(bv02.w);
-FLOAT_TYPE b7 = FLOAT_TYPE(bv13.w);
+const B_TYPE_VEC4 bv02 = data_b_v4[(b_offset + iybs + iqs) / 4];
+const B_TYPE_VEC4 bv13 = data_b_v4[(b_offset + iybs + iqs + y_offset) / 4];
+const vec4 bv0 = vec4(bv02.x, bv13.x, bv02.y, bv13.y);
+const vec4 bv1 = vec4(bv02.z, bv13.z, bv02.w, bv13.w);
#else
-B_TYPE_VEC4 bv0 = data_b_v4[(b_offset + iybs + iqs) / 4];
-B_TYPE_VEC4 bv1 = data_b_v4[(b_offset + iybs + iqs) / 4 + 1];
-FLOAT_TYPE b0 = FLOAT_TYPE(bv0.x);
-FLOAT_TYPE b1 = FLOAT_TYPE(bv0.y);
-FLOAT_TYPE b2 = FLOAT_TYPE(bv0.z);
-FLOAT_TYPE b3 = FLOAT_TYPE(bv0.w);
-FLOAT_TYPE b4 = FLOAT_TYPE(bv1.x);
-FLOAT_TYPE b5 = FLOAT_TYPE(bv1.y);
-FLOAT_TYPE b6 = FLOAT_TYPE(bv1.z);
-FLOAT_TYPE b7 = FLOAT_TYPE(bv1.w);
+const vec4 bv0 = vec4(data_b_v4[(b_offset + iybs + iqs) / 4]);
+const vec4 bv1 = vec4(data_b_v4[(b_offset + iybs + iqs) / 4 + 1]);
#endif
#else
// Check if the second of the pair of elements is OOB, and don't fetch B or

@@ -67,22 +53,29 @@ void iter(inout FLOAT_TYPE temp[NUM_ROWS], const uint first_row, const uint num_
b1 = FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]);
}
#endif
+uint ibi = first_row*p.ncols;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
-const uint ib = ((first_row + n)*p.ncols + col)/QUANT_K; // block index
+const uint ib = (ibi + col)/QUANT_K; // block index
+ibi += p.ncols;

#if K_PER_ITER == 8
-const vec4 v = dequantize4(ib, iqs, a_offset);
-const vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset);
+vec4 v = dequantize4(ib, iqs, a_offset);
+vec4 v2 = dequantize4(ib, iqs+(4/QUANT_R), a_offset);

+const vec2 dm = get_dm(ib, a_offset);
+if (dm.y != 0) { // quant has min component
+v = v * dm.x + dm.y;
+v2 = v2 * dm.x + dm.y;
+}
+
// matrix multiplication
-temp[n] = fma(FLOAT_TYPE(v.x), b0, temp[n]);
-temp[n] = fma(FLOAT_TYPE(v.y), b1, temp[n]);
-temp[n] = fma(FLOAT_TYPE(v.z), b2, temp[n]);
-temp[n] = fma(FLOAT_TYPE(v.w), b3, temp[n]);
-temp[n] = fma(FLOAT_TYPE(v2.x), b4, temp[n]);
-temp[n] = fma(FLOAT_TYPE(v2.y), b5, temp[n]);
-temp[n] = fma(FLOAT_TYPE(v2.z), b6, temp[n]);
-temp[n] = fma(FLOAT_TYPE(v2.w), b7, temp[n]);
+FLOAT_TYPE rowtmp = dot(bv0, v);
+rowtmp += dot(bv1, v2);
+
+if (dm.y == 0)
+rowtmp *= dm.x;
+
+temp[n] += rowtmp;
#else
const vec2 v = dequantize(ib, iqs, a_offset);
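The accumulation rewrite above folds eight scalar fma() calls into two 4-wide dot products, and when the quant has no min component (dm.y == 0) it defers the scale multiply until after the dots, since d * sum(q_k * b_k) equals sum((d * q_k) * b_k). A scalar C++ illustration of the equivalence, with made-up values chosen to be exact in binary floating point:

#include <cstdio>

// 4-wide dot product, standing in for GLSL dot(vec4, vec4)
static float dot4(const float *x, const float *y) {
    return x[0]*y[0] + x[1]*y[1] + x[2]*y[2] + x[3]*y[3];
}

int main() {
    const float b[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };          // activations (bv0, bv1)
    const float q[8] = { 0.5f, -1, 2, 0.25f, 3, -2, 1, 4 }; // raw quant values
    const float d = 0.125f;                                 // per-block scale, no min

    // old form: scale first, then one fma-style accumulation per element
    float acc = 0.0f;
    for (int k = 0; k < 8; ++k) acc += (q[k] * d) * b[k];

    // new form: two 4-wide dots on the raw values, one deferred scale
    float rowtmp = dot4(b, q) + dot4(b + 4, q + 4);
    rowtmp *= d; // valid by distributivity of the scale over the sum

    printf("fma form: %g  dot form: %g\n", acc, rowtmp);
    return 0;
}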
@@ -3517,15 +3517,18 @@ static struct ggml_tensor * ggml_rope_impl(
GGML_ASSERT(c->ne[0] >= n_dims / 2);
}

+int sections[4] = {0, 0, 0, 0};
+
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

-int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
memcpy(params + 5, &freq_base, sizeof(float));
memcpy(params + 6, &freq_scale, sizeof(float));
memcpy(params + 7, &ext_factor, sizeof(float));
memcpy(params + 8, &attn_factor, sizeof(float));
memcpy(params + 9, &beta_fast, sizeof(float));
memcpy(params + 10, &beta_slow, sizeof(float));
+memcpy(params + 11, &sections, sizeof(int)*4);
ggml_set_op_params(result, params, sizeof(params));

result->op = GGML_OP_ROPE;
@@ -3547,6 +3550,53 @@ struct ggml_tensor * ggml_rope(
);
}

+struct ggml_tensor * ggml_rope_multi(
+struct ggml_context * ctx,
+struct ggml_tensor * a,
+struct ggml_tensor * b,
+struct ggml_tensor * c,
+int n_dims,
+int sections[4],
+int mode,
+int n_ctx_orig,
+float freq_base,
+float freq_scale,
+float ext_factor,
+float attn_factor,
+float beta_fast,
+float beta_slow) {
+// Multimodal Rotary Position Embedding
+GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+
+GGML_ASSERT(ggml_is_vector(b));
+GGML_ASSERT(b->type == GGML_TYPE_I32);
+GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
+
+if (c) {
+GGML_ASSERT(c->type == GGML_TYPE_F32);
+GGML_ASSERT(c->ne[0] >= n_dims / 2);
+}
+
+struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+memcpy(params + 5, &freq_base, sizeof(float));
+memcpy(params + 6, &freq_scale, sizeof(float));
+memcpy(params + 7, &ext_factor, sizeof(float));
+memcpy(params + 8, &attn_factor, sizeof(float));
+memcpy(params + 9, &beta_fast, sizeof(float));
+memcpy(params + 10, &beta_slow, sizeof(float));
+memcpy(&params[11], sections, sizeof(int)*4);
+ggml_set_op_params(result, params, sizeof(params));
+
+result->op = GGML_OP_ROPE;
+result->src[0] = a;
+result->src[1] = b;
+result->src[2] = c;
+
+return result;
+}
+
struct ggml_tensor * ggml_rope_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
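For reference, a minimal hypothetical driver for the new ggml_rope_multi API. The section split {16, 24, 24, 0} and the frequency parameters mirror the updated test-rope.cpp further below; tensor contents are left unset since only graph construction is shown:

#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { /*mem_size*/ 64*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    const int n_dims = 128, n_head = 4, n_tokens = 8;
    struct ggml_tensor * x   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_dims, n_head, n_tokens);
    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens * 4); // 4 ids per token

    int sections[4] = {16, 24, 24, 0}; // per-axis rotary dims; the fourth section is unused here
    struct ggml_tensor * out = ggml_rope_multi(ctx, x, pos, /*freq factors*/ NULL,
                                               n_dims, sections, GGML_ROPE_TYPE_MROPE,
                                               /*n_ctx_orig*/ 32768, /*freq_base*/ 1000000,
                                               /*freq_scale*/ 1, /*ext_factor*/ 0,
                                               /*attn_factor*/ 1, /*beta_fast*/ 32, /*beta_slow*/ 1);
    (void) out;
    ggml_free(ctx);
    return 0;
}

Note the assert in the new function: the position tensor must hold a->ne[2] * 4 ids regardless of how many sections are actually active.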
@@ -131,6 +131,7 @@ class Keys:

class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
+DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
FREQ_BASE = "{arch}.rope.freq_base"
SCALING_TYPE = "{arch}.rope.scaling.type"
SCALING_FACTOR = "{arch}.rope.scaling.factor"
@@ -226,6 +227,7 @@ class MODEL_ARCH(IntEnum):
QWEN = auto()
QWEN2 = auto()
QWEN2MOE = auto()
+QWEN2VL = auto()
PHI2 = auto()
PHI3 = auto()
PLAMO = auto()
@@ -388,6 +390,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.QWEN2: "qwen2",
MODEL_ARCH.QWEN2MOE: "qwen2moe",
+MODEL_ARCH.QWEN2VL: "qwen2vl",
MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PHI3: "phi3",
MODEL_ARCH.PLAMO: "plamo",
@@ -772,6 +775,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
+MODEL_ARCH.QWEN2VL: [
+MODEL_TENSOR.TOKEN_EMBD,
+MODEL_TENSOR.OUTPUT_NORM,
+MODEL_TENSOR.OUTPUT,
+MODEL_TENSOR.ATTN_NORM,
+MODEL_TENSOR.ATTN_Q,
+MODEL_TENSOR.ATTN_K,
+MODEL_TENSOR.ATTN_V,
+MODEL_TENSOR.ATTN_OUT,
+MODEL_TENSOR.FFN_NORM,
+MODEL_TENSOR.FFN_GATE,
+MODEL_TENSOR.FFN_DOWN,
+MODEL_TENSOR.FFN_UP,
+],
MODEL_ARCH.QWEN2MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@@ -145,11 +145,10 @@ class GGUFReader:
count = int(count)
itemsize = int(np.empty([], dtype = dtype).itemsize)
end_offs = offset + itemsize * count
-return (
-self.data[offset:end_offs]
-.view(dtype = dtype)[:count]
-.newbyteorder(override_order or self.byte_order)
-)
+arr = self.data[offset:end_offs].view(dtype=dtype)[:count]
+if override_order is None:
+return arr
+return arr.view(arr.dtype.newbyteorder(override_order))

def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
if field.name in self.fields:
@@ -751,6 +751,9 @@ class GGUFWriter:
def add_rope_dimension_count(self, count: int) -> None:
self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)

+def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:
+self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims)
+
def add_rope_freq_base(self, value: float) -> None:
self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value)
@@ -111,6 +111,8 @@ extern "C" {
LLAMA_ROPE_TYPE_NONE = -1,
LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
+LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
+LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
};

enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -1,10 +1,3 @@
-# TODO: should not use this
-if (WIN32)
-if (BUILD_SHARED_LIBS)
-set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-endif()
-endif()
-
llama_add_compile_flags()

#
src/llama.cpp (174 changed lines)
@@ -163,6 +163,7 @@ enum llm_arch {
LLM_ARCH_QWEN,
LLM_ARCH_QWEN2,
LLM_ARCH_QWEN2MOE,
+LLM_ARCH_QWEN2VL,
LLM_ARCH_PHI2,
LLM_ARCH_PHI3,
LLM_ARCH_PLAMO,
@@ -217,6 +218,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_QWEN2MOE, "qwen2moe" },
+{ LLM_ARCH_QWEN2VL, "qwen2vl" },
{ LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PLAMO, "plamo" },
@@ -308,6 +310,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SCALE,

LLM_KV_ROPE_DIMENSION_COUNT,
+LLM_KV_ROPE_DIMENSION_SECTIONS,
LLM_KV_ROPE_FREQ_BASE,
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
@@ -424,6 +427,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
@@ -898,6 +902,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
+{
+LLM_ARCH_QWEN2VL,
+{
+{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+},
+},
{
LLM_ARCH_QWEN2MOE,
{
@@ -1794,7 +1815,7 @@ private:
DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
if (!bufLen) {
-ret = format("Win32 error code: %s", error_code);
+ret = format("Win32 error code: %lx", error_code);
} else {
ret = lpMsgBuf;
LocalFree(lpMsgBuf);
@@ -2132,7 +2153,7 @@ struct llama_mmap {
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");

// may fail on pre-Windows 8 systems
-pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
+pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");

if (pPrefetchVirtualMemory) {
// advise the kernel to preload the mapped memory
@@ -2479,6 +2500,7 @@ struct llama_hparams {
float rope_freq_scale_train;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul;
+int rope_sections[4];

// for State Space Models
uint32_t ssm_d_conv = 0;
@@ -2535,6 +2557,9 @@ struct llama_hparams {

if (this->rope_finetuned != other.rope_finetuned) return true;
if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
+if (!std::equal(std::begin(this->rope_sections),
+std::end(this->rope_sections),
+std::begin(other.rope_sections))) return true;

if (this->ssm_d_conv != other.ssm_d_conv) return true;
if (this->ssm_d_inner != other.ssm_d_inner) return true;
@@ -3378,6 +3403,11 @@ struct llama_context {
// whether we are computing encoder output or decoder output
bool is_encoding = false;

+// TODO: find a better way to accommodate multi-dimension position encoding methods
+// number of position ids each token gets, 1 per token in most cases.
+// when using m-rope, it will be 3 position ids per token to represent a 3-dimensional coordinate.
+int n_pos_per_token = 1;
+
// output of the encoder part of the encoder-decoder models
std::vector<float> embd_enc;
std::vector<std::set<llama_seq_id>> seq_ids_enc;
@@ -5747,6 +5777,13 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+case LLM_ARCH_QWEN2VL:
+{
+std::array<int, 4> section_dims;
+ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
+std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
+}
+// fall through
case LLM_ARCH_QWEN2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -8167,6 +8204,7 @@ static bool llm_load_tensors(
}
} break;
case LLM_ARCH_QWEN2:
+case LLM_ARCH_QWEN2VL:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -12556,6 +12594,124 @@ struct llm_build_context {
return gf;
}

+struct ggml_cgraph * build_qwen2vl() {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+const int64_t n_embd_head = hparams.n_embd_head_v;
+GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+struct ggml_tensor * cur;
+struct ggml_tensor * inpL;
+
+inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+// inp_pos - contains the positions
+lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
+cb(lctx.inp_pos, "inp_pos", -1);
+ggml_set_input(lctx.inp_pos);
+struct ggml_tensor * inp_pos = lctx.inp_pos;
+
+// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+int sections[4];
+std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+for (int il = 0; il < n_layer; ++il) {
+struct ggml_tensor * inpSA = inpL;
+
+// norm
+cur = llm_build_norm(ctx0, inpL, hparams,
+model.layers[il].attn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "attn_norm", il);
+
+// self-attention
+{
+// compute Q and K and RoPE them
+struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+cb(Qcur, "Qcur", il);
+Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+cb(Qcur, "Qcur", il);
+
+struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+cb(Kcur, "Kcur", il);
+Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+cb(Kcur, "Kcur", il);
+
+struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+cb(Vcur, "Vcur", il);
+Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+cb(Vcur, "Vcur", il);
+
+Qcur = ggml_rope_multi(
+ctx0,
+ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Qcur, "Qcur", il);
+
+Kcur = ggml_rope_multi(
+ctx0,
+ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+ext_factor, attn_factor, beta_fast, beta_slow
+);
+cb(Kcur, "Kcur", il);
+
+cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+model.layers[il].wo, model.layers[il].bo,
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+}
+
+if (il == n_layer - 1) {
+// skip computing output for unused tokens
+struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+}
+
+struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+cb(ffn_inp, "ffn_inp", il);
+
+// feed-forward network
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+model.layers[il].ffn_norm, NULL,
+LLM_NORM_RMS, cb, il);
+cb(cur, "ffn_norm", il);
+
+cur = llm_build_ffn(ctx0, lctx, cur,
+model.layers[il].ffn_up, NULL, NULL,
+model.layers[il].ffn_gate, NULL, NULL,
+model.layers[il].ffn_down, NULL, NULL,
+NULL,
+LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+cb(cur, "ffn_out", il);
+
+cur = ggml_add(ctx0, cur, ffn_inp);
+cur = lctx.cvec.apply_to(ctx0, cur, il);
+cb(cur, "l_out", il);
+
+// input for next layer
+inpL = cur;
+}
+
+cur = inpL;
+
+cur = llm_build_norm(ctx0, cur, hparams,
+model.output_norm, NULL,
+LLM_NORM_RMS, cb, -1);
+cb(cur, "result_norm", -1);
+
+// lm_head
+cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+cb(cur, "result_output", -1);
+
+ggml_build_forward_expand(gf, cur);
+
+return gf;
+}
+
struct ggml_cgraph * build_qwen2moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@@ -16657,6 +16813,11 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_qwen2();
} break;
+case LLM_ARCH_QWEN2VL:
+{
+lctx.n_pos_per_token = 4;
+result = llm.build_qwen2vl();
+} break;
case LLM_ARCH_QWEN2MOE:
{
result = llm.build_qwen2moe();
@@ -16875,8 +17036,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)

if (ubatch.pos && lctx.inp_pos) {
const int64_t n_tokens = ubatch.n_tokens;
-ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
+auto n_pos = lctx.n_pos_per_token;
+ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
}

if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
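The copy above transfers n_tokens * n_pos position ids in one call, so the batch must pack all ids for a token's four streams contiguously. A small sketch of that packing in the dimension-major layout the updated test-rope.cpp below uses (section j's id for token i lives at pos[j * n_tokens + i]); zeroing the fourth stream is an assumption based on the unused fourth section:

#include <cstdint>
#include <cstdio>

static void pack_mrope_positions(int32_t * pos, int n_tokens,
                                 const int32_t * t, const int32_t * h, const int32_t * w) {
    for (int i = 0; i < n_tokens; ++i) {
        pos[0 * n_tokens + i] = t[i]; // temporal position
        pos[1 * n_tokens + i] = h[i]; // height position
        pos[2 * n_tokens + i] = w[i]; // width position
        pos[3 * n_tokens + i] = 0;    // fourth section, unused here
    }
}

int main() {
    int32_t t[2] = {0, 1}, h[2] = {0, 0}, w[2] = {0, 1};
    int32_t pos[8];
    pack_mrope_positions(pos, 2, t, h, w);
    for (int k = 0; k < 8; ++k) printf("%d ", pos[k]);
    printf("\n");
    return 0;
}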
@@ -20009,6 +20170,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_MINICPM3:
return LLAMA_ROPE_TYPE_NEOX;

+case LLM_ARCH_QWEN2VL:
+return LLAMA_ROPE_TYPE_MROPE;
+
// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture");
@@ -21577,7 +21741,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
}
} else if ((size_t) i >= ctx->output_ids.size()) {
-throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
+throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
} else {
j = ctx->output_ids[i];
}
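Two changes in this file are printf-format corrections rather than M-RoPE work. A short standalone illustration of why they matter: DWORD is an unsigned long, so %s would have treated the error code as a pointer, and size_t needs %zu because it is wider than long on 64-bit Windows:

#include <cstdio>
#include <vector>

int main() {
    unsigned long error_code = 0x5;      // stand-in for a Win32 DWORD
    std::vector<int> output_ids(128);
    std::printf("Win32 error code: %lx\n", error_code);           // %lx matches unsigned long
    std::printf("out of range [0, %zu)\n", output_ids.size());    // %zu matches size_t
    return 0;
}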
@@ -84,6 +84,20 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)

+if (NOT WIN32)
+# these tests are disabled on Windows because they use internal functions not exported with LLAMA_API
+llama_target_and_test(test-sampling.cpp)
+llama_target_and_test(test-grammar-parser.cpp)
+llama_target_and_test(test-grammar-integration.cpp)
+llama_target_and_test(test-llama-grammar.cpp)
+# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
+if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
+endif()
+

# build test-tokenizer-1-bpe target once and add many tests
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
@@ -108,14 +122,12 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)

# llama_target_and_test(test-double-float.cpp) # SLOW
+endif()
+
llama_target_and_test(test-log.cpp)
llama_target_and_test(test-arg-parser.cpp)
-llama_target_and_test(test-sampling.cpp)
llama_target_and_test(test-chat-template.cpp)

-llama_target_and_test(test-grammar-parser.cpp)
-llama_target_and_test(test-grammar-integration.cpp)
-llama_target_and_test(test-llama-grammar.cpp)
# llama_target_and_test(test-opt.cpp) # SLOW
llama_target_and_test(test-backend-ops.cpp)

@@ -130,11 +142,6 @@ if (NOT GGML_BACKEND_DL)
llama_target_and_test(test-rope.cpp)
endif()

-# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
-if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
-llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
-target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
-endif()
-
# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)
@@ -2201,7 +2201,15 @@ struct test_rope : public test_case {
ggml_set_name(a, "a");
}

-ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
+const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
+const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+ggml_tensor * pos;
+if (is_mrope || is_vision) {
+pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2] * 4);
+} else {
+pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
+}
ggml_set_name(pos, "pos");

ggml_tensor * freq = nullptr;

@@ -2210,7 +2218,20 @@ struct test_rope : public test_case {
ggml_set_name(freq, "freq");
}

-ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+ggml_tensor * out;
+if (is_mrope) {
+if (is_vision) {
+GGML_ASSERT(n_dims/4 > 0);
+int rope_sections[4] = {n_dims/4, n_dims/4, 0, 0}; // Vision-RoPE only uses the first two sections, for the image (x, y) coordinate
+out = ggml_rope_multi(ctx, a, pos, freq, n_dims/2, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+} else {
+GGML_ASSERT(n_dims/3 > 0);
+int rope_sections[4] = {n_dims/3, n_dims/3, n_dims/3, 0};
+out = ggml_rope_multi(ctx, a, pos, freq, n_dims, rope_sections, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+}
+} else {
+out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
+}
ggml_set_name(out, "out");

return out;

@@ -2220,11 +2241,12 @@ struct test_rope : public test_case {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->type == GGML_TYPE_I32) {
// pos
-std::vector<int> data(ne_a[2]);
-for (int i = 0; i < ne_a[2]; i++) {
+const int num_pos_ids = (mode & GGML_ROPE_TYPE_MROPE) ? ne_a[2] * 4 : ne_a[2];
+std::vector<int> data(num_pos_ids);
+for (int i = 0; i < num_pos_ids; i++) {
data[i] = rand() % n_ctx;
}
-ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
+ggml_backend_tensor_set(t, data.data(), 0, num_pos_ids * sizeof(int));
} else {
if (t->ne[0] == n_dims/2) {
// frequency factors in the range [0.9f, 1.1f]

@@ -3813,6 +3835,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_rope(type, { 80, 32, 2, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
}

+if (all) {
+test_cases.emplace_back(new test_rope(type, {128, 12, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 2B)
+test_cases.emplace_back(new test_rope(type, {128, 28, 2, 1}, 128, GGML_ROPE_TYPE_MROPE, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl 7B)
+test_cases.emplace_back(new test_rope(type, { 80, 16, 2, 1}, 80, GGML_ROPE_TYPE_VISION, 512, fs, ef, af, ff, v)); // rope_multi,m-rope (qwen2vl ViT)
+}
+
test_cases.emplace_back(new test_rope(type, { 64, 128, 2, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
}
}
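The section widths in these test cases follow directly from n_dims. A small worked check (plain arithmetic, not from the patch): regular M-RoPE splits n_dims across three axes, while the vision variant rotates only n_dims/2 dimensions split across two image axes:

#include <cstdio>

int main() {
    const int n_dims = 128;
    int mrope_sections[4]  = {n_dims/3, n_dims/3, n_dims/3, 0}; // {42, 42, 42, 0}
    int vision_sections[4] = {n_dims/4, n_dims/4, 0, 0};        // {32, 32, 0, 0}, used with n_dims/2 = 64
    printf("mrope:  %d %d %d %d\n", mrope_sections[0], mrope_sections[1], mrope_sections[2], mrope_sections[3]);
    printf("vision: %d %d %d %d\n", vision_sections[0], vision_sections[1], vision_sections[2], vision_sections[3]);
    return 0;
}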
@@ -138,7 +138,7 @@ int main(int /*argc*/, const char ** /*argv*/) {
struct ggml_tensor * x;

// rope f32
-for (int m = 0; m < 3; ++m) {
+for (int m = 0; m < 5; ++m) {
const int ndims = 4;

const int64_t n_rot = 128;

@@ -147,6 +147,13 @@ int main(int /*argc*/, const char ** /*argv*/) {
const int n_past_0 = 100;
const int n_past_2 = 33;

+struct ggml_tensor * r0;
+struct ggml_tensor * r1;
+struct ggml_tensor * r2;
+x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
+int mode = -1;
+
+if (m < 3) {
struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);

@@ -156,19 +163,53 @@ int main(int /*argc*/, const char ** /*argv*/) {
((int32_t *) p1->data)[i] = n_past_2 - n_past_0;
((int32_t *) p2->data)[i] = n_past_2 + i;
}

// test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
-const int mode = m == 0 ? 0 : m == 1 ? 2 : 4;
+mode = m == 0 ? 0 : m == 1 ? 2 : 4;

-x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

// 100, 101, 102, ..., 172
-struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
+r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
// -67, -67, -67, ..., -67
-struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens

// 33, 34, 35, ..., 105
-struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
+r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
+} else {
+// testing multi-dimension rope position embedding mode
+struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2] * 4);
+
+int sections[4] = {16, 24, 24, 0};
+mode = (m == 3) ? GGML_ROPE_TYPE_MROPE : GGML_ROPE_TYPE_VISION;
+
+for (int i = 0; i < ne[2]; ++i) {
+for (int j = 0; j < 4; ++j) {
+((int32_t *) p0->data)[i + ne[2] * j] = n_past_0 + i + j;
+((int32_t *) p1->data)[i + ne[2] * j] = n_past_2 - n_past_0;
+((int32_t *) p2->data)[i + ne[2] * j] = n_past_2 + i + j;
+}
+}
+
+// [[100, 101, 102, ..., 172],
+//  [101, 102, 103, ..., 173],
+//  [102, 103, 104, ..., 174]]
+r0 = ggml_rope_multi(
+ctx0, x, p0, nullptr,
+n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+// [[-67, -67, -67, ..., -67]
+//  [-67, -67, -67, ..., -67]
+//  [-67, -67, -67, ..., -67]]
+r1 = ggml_rope_multi(
+ctx0, r0, p1, nullptr,
+n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+
+// [[33, 34, 35, ..., 105]
+//  [34, 35, 36, ..., 106]
+//  [35, 36, 37, ..., 107]]
+r2 = ggml_rope_multi(
+ctx0, x, p2, nullptr,
+n_rot, sections, mode, 32768, 1000000, 1, 0, 1, 32, 1);
+}

ggml_cgraph * gf = ggml_new_graph(ctx0);