Merge branch 'master' into hkvc_chaton_v3

Merge upstream as of 20240515IST11XY

commit 7a3ac0cc15
76 changed files with 6731 additions and 2112 deletions
.github/workflows/build.yml (vendored): 47 changes
@@ -340,6 +340,36 @@ jobs:
           cd build
           ctest -L main --verbose
 
+  ubuntu-latest-cmake-rpc:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake -DLLAMA_RPC=ON ..
+          cmake --build . --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose
+
   ubuntu-22-cmake-vulkan:
     runs-on: ubuntu-22.04
@@ -663,6 +693,8 @@ jobs:
     strategy:
       matrix:
         include:
+          - build: 'rpc'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'noavx'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx2'
@@ -898,9 +930,9 @@ jobs:
     shell: bash
 
     env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
       WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
     steps:
       - name: Clone
         id: checkout
@@ -932,6 +964,17 @@ jobs:
         id: pack_artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
+          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+          echo "cp oneAPI running time dll files to ./build/bin done"
           7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
 
       - name: Upload artifacts
CMakeLists.txt

@@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
+option(LLAMA_RPC "llama: use RPC" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
@@ -296,7 +297,7 @@ if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
-    if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
         set(BLA_SIZEOF_INTEGER 8)
     endif()
@@ -494,6 +495,17 @@ if (LLAMA_MPI)
     endif()
 endif()
 
+if (LLAMA_RPC)
+    add_compile_definitions(GGML_USE_RPC)
+
+    if (WIN32)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
+    endif()
+
+    set(GGML_HEADERS_RPC ggml-rpc.h)
+    set(GGML_SOURCES_RPC ggml-rpc.cpp)
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
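For reference, a minimal local-build sketch with the new option enabled, mirroring the commands the ubuntu-latest-cmake-rpc CI job above runs (nothing here beyond what the diff itself shows):

    # configure and build llama.cpp with the RPC backend enabled
    mkdir build
    cd build
    cmake -DLLAMA_RPC=ON ..
    cmake --build . --config Release -j $(nproc)
    # run the main test label, as the CI job does
    ctest -L main --verbose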
@@ -1178,6 +1190,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
             ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
             ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
             ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
@@ -1283,17 +1296,6 @@ install(
         WORLD_READ
         WORLD_EXECUTE
     DESTINATION ${CMAKE_INSTALL_BINDIR})
-install(
-    FILES convert-lora-to-ggml.py
-    PERMISSIONS
-        OWNER_READ
-        OWNER_WRITE
-        OWNER_EXECUTE
-        GROUP_READ
-        GROUP_EXECUTE
-        WORLD_READ
-        WORLD_EXECUTE
-    DESTINATION ${CMAKE_INSTALL_BINDIR})
 if (LLAMA_METAL)
     install(
         FILES ggml-metal.metal
ci/run.sh: 95 changes
@@ -365,47 +365,6 @@ function gg_run_open_llama_3b_v2 {
 
     cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
 
-    # lora
-    function compare_ppl {
-        qnt="$1"
-        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
-            printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
-            return 20
-        fi
-
-        printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
-        return 0
-    }
-
-    path_lora="../models-mnt/open-llama/3B-v2/lora"
-    path_shakespeare="../models-mnt/shakespeare"
-
-    shakespeare="${path_shakespeare}/shakespeare.txt"
-    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
-
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
-    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
-
-    python3 ../convert-lora-to-ggml.py ${path_lora}
-
-    # f16
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
-    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # q8_0
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
-    compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # q8_0 + f16 lora-base
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
-    compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
     set +e
 }
@@ -416,7 +375,6 @@ function gg_sum_open_llama_3b_v2 {
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
     gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
     gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
     gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
     gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -429,11 +387,6 @@ function gg_sum_open_llama_3b_v2 {
     gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
     gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
     gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
-    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
-    gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
-    gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
-    gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }
 
 # open_llama_7b_v2
@@ -549,48 +502,6 @@ function gg_run_open_llama_7b_v2 {
 
     cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
 
-    # lora
-    function compare_ppl {
-        qnt="$1"
-        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
-            printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
-            return 20
-        fi
-
-        printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
-        return 0
-    }
-
-    path_lora="../models-mnt/open-llama/7B-v2/lora"
-    path_shakespeare="../models-mnt/shakespeare"
-
-    shakespeare="${path_shakespeare}/shakespeare.txt"
-    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
-
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
-    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
-
-    python3 ../convert-lora-to-ggml.py ${path_lora}
-
-    # f16
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
-    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # currently not supported by the CUDA backend
-    # q8_0
-    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
-    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
-    #compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # q8_0 + f16 lora-base
-    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
-    #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
     set +e
 }
@@ -601,7 +512,6 @@ function gg_sum_open_llama_7b_v2 {
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
     gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
     gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
     gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
     gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -614,11 +524,6 @@ function gg_sum_open_llama_7b_v2 {
     gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
     gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
     gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
-    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
-    #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
-    #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
-    #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }
 
 # bge-small
common/common.cpp

@@ -901,6 +901,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.interactive = true;
         return true;
     }
+    if (arg == "--interactive-specials") {
+        params.interactive_specials = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
@@ -1078,6 +1082,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
         return true;
     }
+    if (arg == "--rpc") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.rpc_servers = argv[i];
+        return true;
+    }
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;
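A hedged usage sketch for the new flag: the help text added further below describes --rpc as a comma separated list of RPC servers, so an invocation would look roughly like the following. The host:port values and model path are illustrative assumptions, not taken from this diff:

    # offload work to two hypothetical RPC servers; addresses and model path are placeholders
    ./main -m models/7B/ggml-model-q4_0.gguf -p "Hello" -n 64 \
        --rpc 192.168.1.10:50052,192.168.1.11:50052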
@@ -1389,15 +1401,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
 
         if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
             throw std::invalid_argument("error: unknown argument: " + arg);
         }
-    }
-
         if (invalid_param) {
             throw std::invalid_argument("error: invalid parameter for argument: " + arg);
         }
+    }
 
     if (params.prompt_cache_all &&
         (params.interactive || params.interactive_first ||
@@ -1444,6 +1454,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -h, --help            show this help message and exit\n");
     printf("  --version             show version and build info\n");
     printf("  -i, --interactive     run in interactive mode\n");
+    printf("  --interactive-specials allow special tokens in user text, in interactive mode\n");
     printf("  --interactive-first   run in interactive mode and wait for input right away\n");
     printf("  -cnv, --conversation  run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
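A short illustrative invocation of the new switch; the model path is a placeholder:

    # interactive session that lets special tokens in user input pass through
    ./main -m models/7B/ggml-model-q4_0.gguf -i --interactive-specials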
@@ -1580,6 +1591,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
     printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
     }
+    printf("  --rpc SERVERS         comma separated list of RPC servers\n");
     printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf("  -gan N, --grp-attn-n N\n");
@@ -1853,6 +1865,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+    mparams.rpc_servers  = params.rpc_servers.c_str();
     mparams.main_gpu     = params.main_gpu;
     mparams.split_mode   = params.split_mode;
     mparams.tensor_split = params.tensor_split;
@@ -2678,6 +2691,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
     fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
common/common.h

@@ -82,6 +82,7 @@ struct gpt_params {
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
+    std::string rpc_servers = ""; // comma separated list of RPC servers
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
@@ -140,6 +141,7 @@ struct gpt_params {
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
+    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool chaton = false; // whether chaton is enabled or disabled
common/grammar-parser.cpp

@@ -142,6 +142,9 @@ namespace grammar_parser {
             pos++;
             last_sym_start = out_elements.size();
             while (*pos != '"') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -156,6 +159,9 @@ namespace grammar_parser {
             }
             last_sym_start = out_elements.size();
             while (*pos != ']') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 enum llama_gretype type = last_sym_start < out_elements.size()
@@ -164,6 +170,9 @@ namespace grammar_parser {
 
                 out_elements.push_back({type, char_pair.first});
                 if (pos[0] == '-' && pos[1] != ']') {
+                    if (!pos[1]) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
                     auto endchar_pair = parse_char(pos + 1);
                     pos = endchar_pair.second;
                     out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
common/sampling.cpp

@@ -35,7 +35,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
     result->prev.resize(params.n_prev);
 
-    result->n_considered = 0;
+    result->n_valid = 0;
 
     llama_sampling_set_rng_seed(result, params.seed);
 
@@ -66,7 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
 
     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
     ctx->cur.clear();
-    ctx->n_considered = 0;
+    ctx->n_valid = 0;
 }
 
 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@@ -256,7 +256,7 @@ static llama_token llama_sampling_sample_impl(
         }
     }
 
-    ctx_sampling->n_considered = cur_p.size;
+    ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
 
     return id;
 }
common/sampling.h

@@ -81,7 +81,7 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token> prev;
    std::vector<llama_token_data> cur;
-    size_t n_considered;
+    size_t n_valid; // Number of correct top tokens with correct probabilities.
 
     std::mt19937 rng;
 };
convert-hf-to-gguf-update.py

@@ -74,6 +74,9 @@ models = [
     {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
     {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
     {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
+    {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", },  # WPM!
+    {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
+    {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
 ]
 
 # make directory "models/tokenizers" if it doesn't exist
@@ -142,8 +145,17 @@ for model in models:
     if tokt == TOKENIZER_TYPE.SPM:
         continue
 
+    # Skip if the tokenizer folder does not exist or there are other download issues previously
+    if not os.path.exists(f"models/tokenizers/{name}"):
+        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
+        continue
+
     # create the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    except OSError as e:
+        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
+        continue  # Skip to the next model if the tokenizer can't be loaded
 
     chktok = tokenizer.encode(chktxt)
     chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -161,6 +173,8 @@ for model in models:
     logger.info("normalizer: " + json.dumps(normalizer, indent=4))
     pre_tokenizer = cfg["pre_tokenizer"]
     logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+    if "ignore_merges" in cfg["model"]:
+        logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
 
     logger.info("")
 
@@ -282,8 +296,17 @@ for model in models:
     name = model["name"]
     tokt = model["tokt"]
 
+    # Skip if the tokenizer folder does not exist or there are other download issues previously
+    if not os.path.exists(f"models/tokenizers/{name}"):
+        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
+        continue
+
     # create the tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+    except OSError as e:
+        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
+        continue  # Skip this model and continue with the next one in the loop
 
     with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
         for text in tests:
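Assuming the script keeps its existing interface of taking a Hugging Face token as its only argument, re-running it after this change would also fetch the three newly listed jina-v2 tokenizers; the token value below is a placeholder:

    # downloads the tokenizers in `models` (now including jina-v2-*) and
    # regenerates the pre-tokenizer hashes used by convert-hf-to-gguf.py
    python3 convert-hf-to-gguf-update.py hf_XXXXXXXXXXXXXXXX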
convert-hf-to-gguf.py

@@ -12,7 +12,7 @@ import sys
 from enum import IntEnum
 from pathlib import Path
 from hashlib import sha256
-from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast, overload
+from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
 
 import numpy as np
 import torch
@@ -48,7 +48,6 @@ class Model:
 
     dir_model: Path
     ftype: int
-    fname_out: Path
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool
@@ -56,20 +55,20 @@ class Model:
     part_names: list[str]
     is_safetensors: bool
     hparams: dict[str, Any]
-    gguf_writer: gguf.GGUFWriter
     block_count: int
     tensor_map: gguf.TensorNameMap
     tensor_names: set[str] | None
+    fname_out: Path
+    gguf_writer: gguf.GGUFWriter
 
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
 
-    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
-        if self.__class__ == Model:
-            raise TypeError(f"{self.__class__.__name__!r} should not be directly instantiated")
+    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
+        if type(self) is Model:
+            raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
         self.dir_model = dir_model
         self.ftype = ftype
-        self.fname_out = fname_out
         self.is_big_endian = is_big_endian
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
@@ -79,10 +78,23 @@ class Model:
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
+        if self.ftype == gguf.LlamaFileType.GUESSED:
+            # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
+            _, first_tensor = next(self.get_tensors())
+            if first_tensor.dtype == torch.float16:
+                logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
+                self.ftype = gguf.LlamaFileType.MOSTLY_F16
+            else:
+                logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
+                self.ftype = gguf.LlamaFileType.MOSTLY_BF16
+        ftype_up: str = self.ftype.name.partition("_")[2].upper()
+        ftype_lw: str = ftype_up.lower()
+        # allow templating the file name with the output ftype, useful with the "auto" ftype
+        self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
+        self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
 
     @classmethod
     def __init_subclass__(cls):
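Assuming the converter exposes the GUESSED file type as an auto choice for --outtype (as the comment above suggests), the new filename templating could be exercised roughly like this; the model directory and output name are placeholders:

    # {ftype} in the output name is filled in with the ftype actually chosen,
    # e.g. f16 or bf16 when the auto/GUESSED mode inspects the first tensor
    python3 convert-hf-to-gguf.py ./models/MyModel --outtype auto \
        --outfile './models/MyModel-{ftype}.gguf'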
@@ -142,14 +154,27 @@ class Model:
             raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
 
     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
-        name: str = gguf.TENSOR_NAMES[key]
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
             raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
+        name: str = gguf.TENSOR_NAMES[key]
         if "{bid}" in name:
             assert bid is not None
             name = name.format(bid=bid)
         return name + suffix
 
+    def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
+        if key not in gguf.MODEL_TENSORS[self.model_arch]:
+            return False
+        key_name: str = gguf.TENSOR_NAMES[key]
+        if "{bid}" in key_name:
+            if bid is None:
+                return False
+            key_name = key_name.format(bid=bid)
+        else:
+            if bid is not None:
+                return False
+        return name == (key_name + suffix)
+
     def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
         new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
         if new_name is None:
@@ -239,35 +264,64 @@ class Model:
                 data: np.ndarray = data  # type hint
                 n_dims = len(data.shape)
                 data_dtype = data.dtype
+                data_qtype: gguf.GGMLQuantizationType | None = None
 
-                # if f32 desired, convert any float16 to float32
-                if self.ftype == 0 and data_dtype == np.float16:
-                    data = data.astype(np.float32)
-
                 # when both are True, f32 should win
                 extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
                 extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
 
                 # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-                extra_f32 = extra_f32 or n_dims == 1 or new_name.endswith("_norm.weight")
+                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
+                extra_f32 = any(cond for cond in (
+                    extra_f32,
+                    n_dims == 1,
+                    new_name.endswith("_norm.weight"),
+                ))
+
+                # Some tensor types are always in float32
+                extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
+                    gguf.MODEL_TENSOR.FFN_GATE_INP,
+                    gguf.MODEL_TENSOR.POS_EMBD,
+                    gguf.MODEL_TENSOR.TOKEN_TYPES,
+                ))
 
                 # if f16 desired, convert any float32 2-dim weight tensors to float16
-                extra_f16 = extra_f16 or (name.endswith(".weight") and n_dims >= 2)
+                extra_f16 = any(cond for cond in (
+                    extra_f16,
+                    (name.endswith(".weight") and n_dims >= 2),
+                ))
 
-                # when both extra_f32 and extra_f16 are False, convert to float32 by default
-                if self.ftype == 1 and data_dtype == np.float16 and (extra_f32 or not extra_f16):
-                    data = data.astype(np.float32)
+                if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
+                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data = gguf.quantize_bf16(data)
+                        assert data.dtype == np.int16
+                        data_qtype = gguf.GGMLQuantizationType.BF16
 
-                if self.ftype == 1 and data_dtype == np.float32 and extra_f16 and not extra_f32:
-                    data = data.astype(np.float16)
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
+                        data = gguf.quantize_q8_0(data)
+                        assert data.dtype == np.uint8
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
 
+                    else:  # default to float16 for quantized tensors
+                        if data_dtype != np.float16:
+                            data = data.astype(np.float16)
+                        data_qtype = gguf.GGMLQuantizationType.F16
+
+                if data_qtype is None:  # by default, convert to float32
+                    if data_dtype != np.float32:
+                        data = data.astype(np.float32)
+                    data_qtype = gguf.GGMLQuantizationType.F32
+
+                block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
                 # reverse shape to make it similar to the internal ggml dimension order
-                shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
+                shape_str = f"""{{{', '.join(str(n) for n in reversed(
+                    (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
+                )}}}"""
 
                 # n_dims is implicit in the shape
-                logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data.dtype}, shape = {shape_str}")
+                logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
 
-                self.gguf_writer.add_tensor(new_name, data)
+                self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)
 
     def write(self):
         self.write_tensors()
@@ -404,8 +458,17 @@ class Model:
             # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
             res = "olmo"
         if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
-            # ref: https://huggingface.co/databricks/dbrx-instruct
+            # ref: https://huggingface.co/databricks/dbrx-base
             res = "dbrx"
+        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
+            res = "jina-v2-en"
+        if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
+            res = "jina-v2-es"
+        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
+            res = "jina-v2-de"
 
         if res is None:
             logger.warning("\n")
@@ -783,6 +846,7 @@ class BaichuanModel(Model):
         self.gguf_writer.add_head_count(head_count)
         self.gguf_writer.add_head_count_kv(head_count_kv)
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
@@ -905,6 +969,7 @@ class XverseModel(Model):
         self.gguf_writer.add_head_count(head_count)
         self.gguf_writer.add_head_count_kv(head_count_kv)
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
             if self.hparams["rope_scaling"].get("type") == "linear":
@@ -1013,6 +1078,18 @@ class StarCoderModel(Model):
 class RefactModel(Model):
     model_arch = gguf.MODEL_ARCH.REFACT
 
+    def set_vocab(self):
+        super().set_vocab()
+
+        # TODO: how to determine special FIM tokens automatically?
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
+        special_vocab._set_special_token("prefix", 1)
+        special_vocab._set_special_token("suffix", 3)
+        special_vocab._set_special_token("middle", 2)
+        special_vocab._set_special_token("fsep", 4)  # is this correct?
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def set_gguf_parameters(self):
         hidden_dim = self.hparams["n_embd"]
         inner_dim = 4 * hidden_dim
@@ -1127,6 +1204,7 @@ class StableLMModel(Model):
         self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
         self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
+        self.gguf_writer.add_file_type(self.ftype)
 
     _q_norms: list[dict[str, Tensor]] | None = None
     _k_norms: list[dict[str, Tensor]] | None = None
@@ -1503,6 +1581,7 @@ class QwenModel(Model):
         self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
         self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
 
 
 @Model.register("Qwen2ForCausalLM")
@@ -1740,6 +1819,7 @@ class PlamoModel(Model):
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
         self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"]) is wrong
         self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        self.gguf_writer.add_file_type(self.ftype)
 
     def shuffle_attn_q_weight(self, data_torch):
         assert data_torch.size() == (5120, 5120)
@@ -1919,6 +1999,7 @@ in chat mode so that the conversation can end normally.")
         self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_file_type(self.ftype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
@@ -2023,12 +2104,6 @@ class BertModel(Model):
 
         return [(self.map_tensor_name(name), data_torch)]
 
-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del new_name, bid, n_dims  # unused
-
-        # not used with get_rows, must be F32
-        return name == "embeddings.token_type_embeddings.weight"
-
 
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
@@ -2277,96 +2352,71 @@ class OlmoModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("JinaBertModel", "JinaBertForMaskedLM")
+class JinaBertV2Model(BertModel):
+    model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.intermediate_size = self.hparams["intermediate_size"]
+
+    def get_tensors(self):
+        for name, data in super().get_tensors():
+            if 'gated_layers' in name:
+                d1 = data[:self.intermediate_size, :]
+                name1 = name.replace('gated_layers', 'gated_layers_w')
+                d2 = data[self.intermediate_size:, :]
+                name2 = name.replace('gated_layers', 'gated_layers_v')
+                yield name1, d1
+                yield name2, d2
+                continue
+
+            yield name, data
+
+    def set_vocab(self, *args, **kwargs):
+        tokenizer_class = 'BertTokenizer'
+        with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_class = json.load(f)['tokenizer_class']
+
+        if tokenizer_class == 'BertTokenizer':
+            super().set_vocab()
+        elif tokenizer_class == 'RobertaTokenizer':
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_token_type_count(2)
+        else:
+            raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(True)
+
+
 ###### CONVERSION LOGIC ######
 
 
 # tree of lazy tensors
-class LazyTorchTensor:
-    _meta: Tensor
-    _data: Tensor | None
-    _args: tuple
-    _func: Callable[[tuple], Tensor] | None
-
-    def __init__(self, *, meta: Tensor, data: Tensor | None = None, args: tuple = (), func: Callable[[tuple], Tensor] | None = None):
-        self._meta = meta
-        self._data = data
-        self._args = args
-        self._func = func
-
-    @staticmethod
-    def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
-        # TODO: dict and set
-        if isinstance(o, (list, tuple)):
-            L = []
-            for item in o:
-                L.append(LazyTorchTensor._recurse_apply(item, fn))
-            if isinstance(o, tuple):
-                L = tuple(L)
-            return L
-        elif isinstance(o, LazyTorchTensor):
-            return fn(o)
-        else:
-            return o
-
-    def _wrap_fn(self, fn: Callable, use_self: bool = False) -> Callable[[Any], LazyTorchTensor]:
-        def wrapped_fn(*args, **kwargs):
-            if kwargs is None:
-                kwargs = {}
-            args = ((self,) if use_self else ()) + args
-
-            meta_args = LazyTorchTensor._recurse_apply(args, lambda t: t._meta)
-
-            return LazyTorchTensor(meta=fn(*meta_args, **kwargs), args=args, func=lambda a: fn(*a, **kwargs))
-        return wrapped_fn
-
-    def __getattr__(self, __name: str) -> Any:
-        meta_attr = getattr(self._meta, __name)
-        if callable(meta_attr):
-            return self._wrap_fn(getattr(torch.Tensor, __name), use_self=True)
-        elif isinstance(meta_attr, torch.Tensor):
-            # for things like self.T
-            return self._wrap_fn(lambda s: getattr(s, __name))(self)
-        else:
-            return meta_attr
-
+class LazyTorchTensor(gguf.LazyBase):
+    _tensor_type = torch.Tensor
+    # to keep the type-checker happy
+    dtype: torch.dtype
+    shape: torch.Size
+
+    # only used when converting a torch.Tensor to a np.ndarray
     _dtype_map: dict[torch.dtype, type] = {
         torch.float16: np.float16,
         torch.float32: np.float32,
     }
 
-    def numpy(self) -> gguf.LazyTensor:
+    def numpy(self) -> gguf.LazyNumpyTensor:
         dtype = self._dtype_map[self.dtype]
-        return gguf.LazyTensor(lambda: LazyTorchTensor.to_eager(self).numpy(), dtype=dtype, shape=self.shape)
+        return gguf.LazyNumpyTensor(
+            meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
+            lazy=self._lazy,
+            args=(self,),
+            func=(lambda s: s[0].numpy())
+        )
 
-    @overload
-    @staticmethod
-    def to_eager(t: Tensor | LazyTorchTensor) -> Tensor: ...
-
-    @overload
-    @staticmethod
-    def to_eager(t: tuple) -> tuple: ...
-
-    @staticmethod
-    def to_eager(t: Any) -> Any:
-        def simple_to_eager(_t: LazyTorchTensor) -> Tensor:
-            # wake up the lazy tensor
-            if _t._data is None and _t._func is not None:
-                # recurse into its arguments
-                _t._args = LazyTorchTensor.to_eager(_t._args)
-                _t._data = _t._func(_t._args)
-            if _t._data is not None:
-                return _t._data
-            else:
-                raise ValueError(f"Could not compute lazy tensor {_t!r} with args {_t._args!r}")
-
-        # recurse into lists and/or tuples, keeping their structure
-        return LazyTorchTensor._recurse_apply(t, simple_to_eager)
-
-    @staticmethod
-    def from_eager(t: Tensor) -> Tensor:
-        if (t.__class__ == LazyTorchTensor):
-            return t
-        return LazyTorchTensor(meta=t.detach().to("meta"), data=t)  # type: ignore
+    @classmethod
+    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
+        return torch.empty(size=shape, dtype=dtype, device="meta")
 
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
@ -2377,28 +2427,8 @@ class LazyTorchTensor:
|
||||||
|
|
||||||
if func is torch.Tensor.numpy:
|
if func is torch.Tensor.numpy:
|
||||||
return args[0].numpy()
|
return args[0].numpy()
|
||||||
if func is torch.equal:
|
|
||||||
eager_args = LazyTorchTensor.to_eager(args)
|
|
||||||
return func(*eager_args, **kwargs)
|
|
||||||
|
|
||||||
return LazyTorchTensor._wrap_fn(args[0], func)(*args, **kwargs)
|
return LazyTorchTensor._wrap_fn(func)(*args, **kwargs)
|
||||||
|
|
||||||
# special methods bypass __getattr__, so they need to be added manually
|
|
||||||
# ref: https://docs.python.org/3/reference/datamodel.html#special-lookup
|
|
||||||
# NOTE: LazyTorchTensor can't be a subclass of Tensor (and then be used
|
|
||||||
# as self._meta is currently used), because then the following
|
|
||||||
# operations would by default not be wrapped, and so not propagated
|
|
||||||
# when the tensor is made eager.
|
|
||||||
# It's better to get non-silent errors for not-yet-supported operators.
|
|
||||||
# TODO: add more when needed to avoid clutter, or find a more concise way
|
|
||||||
def __neg__(self, *args): # mamba
|
|
||||||
return self._wrap_fn(torch.Tensor.__neg__)(self, *args)
|
|
||||||
|
|
||||||
def __add__(self, *args): # gemma
|
|
||||||
return self._wrap_fn(torch.Tensor.__add__)(self, *args)
|
|
||||||
|
|
||||||
def __getitem__(self, *args): # bloom falcon refact internlm2
|
|
||||||
return self._wrap_fn(torch.Tensor.__getitem__)(self, *args)
|
|
||||||
|
|
||||||
|
|
||||||
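The rewritten class above hands the bookkeeping to `gguf.LazyBase` and leans on PyTorch "meta" tensors: shape and dtype are tracked without allocating data, and the real computation is deferred until a result is actually needed. A minimal standalone illustration of that idea (not the actual `gguf` lazy machinery):

```python
import torch

# A meta tensor records dtype and shape only; no storage is allocated.
meta = torch.empty(size=(4096, 4096), dtype=torch.float16, device="meta")
print(meta.shape, meta.dtype, meta.device)

# Deferred evaluation in miniature: keep the function and its arguments,
# and only run them when the eager value is requested.
pending = (torch.neg, (torch.ones(2, 2),))
eager = pending[0](*pending[1])
print(eager)
```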
def parse_args() -> argparse.Namespace:

@@ -2414,11 +2444,11 @@ def parse_args() -> argparse.Namespace:
    )
    parser.add_argument(
        "--outfile", type=Path,
-        help="path to write to; default: based on input",
+        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16"], default="f16",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
        "--bigendian", action="store_true",

@@ -2472,16 +2502,19 @@ def main() -> None:
        logger.error(f'Error: {args.model} is not a directory')
        sys.exit(1)

-    ftype_map = {
-        "f32": gguf.GGMLQuantizationType.F32,
-        "f16": gguf.GGMLQuantizationType.F16,
+    ftype_map: dict[str, gguf.LlamaFileType] = {
+        "f32": gguf.LlamaFileType.ALL_F32,
+        "f16": gguf.LlamaFileType.MOSTLY_F16,
+        "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "auto": gguf.LlamaFileType.GUESSED,
    }

    if args.outfile is not None:
        fname_out = args.outfile
    else:
        # output in the same directory as the model by default
-        fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
+        fname_out = dir_model / 'ggml-model-{ftype}.gguf'

    logger.info(f"Loading model: {dir_model.name}")

@@ -2497,14 +2530,16 @@ def main() -> None:
    logger.info("Set model tokenizer")
    model_instance.set_vocab()

+    model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

    if args.vocab_only:
-        logger.info(f"Exporting model vocab to '{fname_out}'")
+        logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
        model_instance.write_vocab()
    else:
-        logger.info(f"Exporting model to '{fname_out}'")
+        logger.info(f"Exporting model to '{model_instance.fname_out}'")
        model_instance.write()

-    logger.info(f"Model successfully exported to '{fname_out}'")
+    logger.info(f"Model successfully exported to '{model_instance.fname_out}'")


if __name__ == '__main__':
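As a side note on the hunks above: the default output name now keeps a literal `{ftype}` placeholder that is filled in later from the chosen file type. A minimal sketch of the substitution idea (illustrative only; the converter's actual helper is not shown in this diff):

```python
from pathlib import Path

def fill_ftype(template: Path, ftype_name: str) -> Path:
    # 'ggml-model-{ftype}.gguf' -> 'ggml-model-f16.gguf'
    return template.with_name(template.name.format(ftype=ftype_name.lower()))

print(fill_ftype(Path("ggml-model-{ftype}.gguf"), "F16"))
```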
@@ -1,150 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import logging
-import json
-import os
-import struct
-import sys
-from pathlib import Path
-from typing import Any, BinaryIO, Sequence
-
-import numpy as np
-import torch
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
-import gguf
-
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger("lora-to-gguf")
-
-NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
-
-
-def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
-    fout.write(b"ggla"[::-1])  # magic (ggml lora)
-    fout.write(struct.pack("i", 1))  # file version
-    fout.write(struct.pack("i", params["r"]))
-    # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
-    # but some models ship a float value instead
-    # let's convert to int, but fail if lossless conversion is not possible
-    assert (
-        int(params["lora_alpha"]) == params["lora_alpha"]
-    ), "cannot convert float to int losslessly"
-    fout.write(struct.pack("i", int(params["lora_alpha"])))
-
-
-def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
-    sname = name.encode("utf-8")
-    fout.write(
-        struct.pack(
-            "iii",
-            len(shape),
-            len(sname),
-            NUMPY_TYPE_TO_FTYPE[data_type.name],
-        )
-    )
-    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
-    fout.write(sname)
-    fout.seek((fout.tell() + 31) & -32)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) < 2:
-        logger.info(f"Usage: python {sys.argv[0]} <path> [arch]")
-        logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'")
-        logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
-        sys.exit(1)
-
-    input_json = os.path.join(sys.argv[1], "adapter_config.json")
-    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
-    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
-
-    if os.path.exists(input_model):
-        model = torch.load(input_model, map_location="cpu")
-    else:
-        input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
-        # lazy import load_file only if lora is in safetensors format.
-        from safetensors.torch import load_file
-        model = load_file(input_model, device="cpu")
-
-    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
-
-    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
-        logger.error(f"Error: unsupported architecture {arch_name}")
-        sys.exit(1)
-
-    arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
-    name_map = gguf.TensorNameMap(arch, 200)  # 200 layers ought to be enough for anyone
-
-    with open(input_json, "r") as f:
-        params = json.load(f)
-
-    if params["peft_type"] != "LORA":
-        logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
-        sys.exit(1)
-
-    if params["fan_in_fan_out"] is True:
-        logger.error("Error: param fan_in_fan_out is not supported")
-        sys.exit(1)
-
-    if params["bias"] is not None and params["bias"] != "none":
-        logger.error("Error: param bias is not supported")
-        sys.exit(1)
-
-    # TODO: these seem to be layers that have been trained but without lora.
-    # doesn't seem widely used but eventually should be supported
-    if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
-        logger.error("Error: param modules_to_save is not supported")
-        sys.exit(1)
-
-    with open(output_path, "wb") as fout:
-        fout.truncate()
-
-        write_file_header(fout, params)
-        for k, v in model.items():
-            orig_k = k
-            if k.endswith(".default.weight"):
-                k = k.replace(".default.weight", ".weight")
-            if k in ["llama_proj.weight", "llama_proj.bias"]:
-                continue
-            if k.endswith("lora_A.weight"):
-                if v.dtype != torch.float16 and v.dtype != torch.float32:
-                    v = v.float()
-                v = v.T
-            else:
-                v = v.float()
-
-            t = v.detach().numpy()
-
-            prefix = "base_model.model."
-            if k.startswith(prefix):
-                k = k[len(prefix) :]
-
-            lora_suffixes = (".lora_A.weight", ".lora_B.weight")
-            if k.endswith(lora_suffixes):
-                suffix = k[-len(lora_suffixes[0]):]
-                k = k[: -len(lora_suffixes[0])]
-            else:
-                logger.error(f"Error: unrecognized tensor name {orig_k}")
-                sys.exit(1)
-
-            tname = name_map.get_name(k)
-            if tname is None:
-                logger.error(f"Error: could not map tensor name {orig_k}")
-                logger.error(" Note: the arch parameter must be specified if the model is not llama")
-                sys.exit(1)
-
-            if suffix == ".lora_A.weight":
-                tname += ".weight.loraA"
-            elif suffix == ".lora_B.weight":
-                tname += ".weight.loraB"
-            else:
-                assert False
-
-            logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
-            write_tensor_header(fout, tname, t.shape, t.dtype)
-            t.tofile(fout)
-
-        logger.info(f"Converted {input_json} and {input_model} to {output_path}")
convert.py (178)

@@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional

import numpy as np
from sentencepiece import SentencePieceProcessor

@@ -344,10 +344,47 @@ class Params:
        return params


+@dataclass
+class Metadata:
+    name: Optional[str] = None
+    author: Optional[str] = None
+    version: Optional[str] = None
+    url: Optional[str] = None
+    description: Optional[str] = None
+    licence: Optional[str] = None
+    source_url: Optional[str] = None
+    source_hf_repo: Optional[str] = None
+
+    @staticmethod
+    def load(metadata_path: Path) -> Metadata:
+        if metadata_path is None or not metadata_path.exists():
+            return Metadata()
+
+        with open(metadata_path, 'r') as file:
+            data = json.load(file)
+
+        # Create a new Metadata instance
+        metadata = Metadata()
+
+        # Assigning values to Metadata attributes if they exist in the JSON file
+        # This is based on LLM_KV_NAMES mapping in llama.cpp
+        metadata.name = data.get("general.name")
+        metadata.author = data.get("general.author")
+        metadata.version = data.get("general.version")
+        metadata.url = data.get("general.url")
+        metadata.description = data.get("general.description")
+        metadata.license = data.get("general.license")
+        metadata.source_url = data.get("general.source.url")
+        metadata.source_hf_repo = data.get("general.source.huggingface.repository")
+
+        return metadata
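For reference, `Metadata.load()` above expects a flat JSON object keyed by the `general.*` names listed in the hunk. A hypothetical `metadata.json` written from Python (all field values here are placeholders):

```python
import json

example = {
    "general.name": "MyModel",
    "general.author": "Example Author",
    "general.version": "v1.0",
    "general.url": "https://example.com/model",
    "general.description": "An example model card entry",
    "general.license": "apache-2.0",
    "general.source.url": "https://example.com/source",
    "general.source.huggingface.repository": "example/my-model",
}

with open("metadata.json", "w") as f:
    json.dump(example, f, indent=2)
```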
#
# vocab
#


@runtime_checkable
class BaseVocab(Protocol):
    tokenizer_model: ClassVar[str]

@@ -1066,21 +1103,42 @@ class OutputFile:
    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

-    def add_meta_arch(self, params: Params) -> None:
+    def add_meta_model(self, params: Params, metadata: Metadata) -> None:
+        # Metadata About The Model And Its Provenance
        name = "LLaMA"
+        if metadata is not None and metadata.name is not None:
+            name = metadata.name
-        # TODO: better logic to determine model name
-        if params.n_ctx == 4096:
-            name = "LLaMA v2"
        elif params.path_model is not None:
-            name = str(params.path_model.parent).split('/')[-1]
+            name = str(params.path_model.parent).split("/")[-1]
+        elif params.n_ctx == 4096:
+            # Heuristic detection of LLaMA v2 model
+            name = "LLaMA v2"

-        self.gguf.add_name (name)
+        self.gguf.add_name(name)
-        self.gguf.add_vocab_size (params.n_vocab)
-        self.gguf.add_context_length (params.n_ctx)
-        self.gguf.add_embedding_length (params.n_embd)
-        self.gguf.add_block_count (params.n_layer)
-        self.gguf.add_feed_forward_length (params.n_ff)
+        if metadata is not None:
+            if metadata.author is not None:
+                self.gguf.add_author(metadata.author)
+            if metadata.version is not None:
+                self.gguf.add_version(metadata.version)
+            if metadata.url is not None:
+                self.gguf.add_url(metadata.url)
+            if metadata.description is not None:
+                self.gguf.add_description(metadata.description)
+            if metadata.licence is not None:
+                self.gguf.add_licence(metadata.licence)
+            if metadata.source_url is not None:
+                self.gguf.add_source_url(metadata.source_url)
+            if metadata.source_hf_repo is not None:
+                self.gguf.add_source_hf_repo(metadata.source_hf_repo)
+
+    def add_meta_arch(self, params: Params) -> None:
+        # Metadata About The Neural Architecture Itself
+        self.gguf.add_vocab_size(params.n_vocab)
+        self.gguf.add_context_length(params.n_ctx)
+        self.gguf.add_embedding_length(params.n_embd)
+        self.gguf.add_block_count(params.n_layer)
+        self.gguf.add_feed_forward_length(params.n_ff)
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
        self.gguf.add_head_count (params.n_head)
        self.gguf.add_head_count_kv (params.n_head_kv)

@@ -1183,13 +1241,14 @@ class OutputFile:
    @staticmethod
    def write_vocab_only(
        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

        # meta data
+        of.add_meta_model(params, metadata)
        of.add_meta_arch(params)
        of.add_meta_vocab(vocab)
        of.add_meta_special_vocab(svocab)

@@ -1216,12 +1275,14 @@ class OutputFile:
        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
+        metadata: Metadata = None,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

        # meta data
+        of.add_meta_model(params, metadata)
        of.add_meta_arch(params)
        if isinstance(vocab, Vocab):
            of.add_meta_vocab(vocab)
@@ -1257,6 +1318,37 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
    raise ValueError(f"Unexpected combination of types: {name_to_type}")


+def model_parameter_count(model: LazyModel) -> int:
+    total_model_parameters = 0
+    for i, (name, lazy_tensor) in enumerate(model.items()):
+        sum_weights_in_tensor = 1
+        for dim in lazy_tensor.shape:
+            sum_weights_in_tensor *= dim
+        total_model_parameters += sum_weights_in_tensor
+    return total_model_parameters
+
+
+def model_parameter_count_rounded_notation(model_params_count: int) -> str:
+    if model_params_count > 1e12 :
+        # Trillions Of Parameters
+        scaled_model_params = model_params_count * 1e-12
+        scale_suffix = "T"
+    elif model_params_count > 1e9 :
+        # Billions Of Parameters
+        scaled_model_params = model_params_count * 1e-9
+        scale_suffix = "B"
+    elif model_params_count > 1e6 :
+        # Millions Of Parameters
+        scaled_model_params = model_params_count * 1e-6
+        scale_suffix = "M"
+    else:
+        # Thousands Of Parameters
+        scaled_model_params = model_params_count * 1e-3
+        scale_suffix = "K"
+
+    return f"{round(scaled_model_params)}{scale_suffix}"
+
+
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
            for (name, tensor) in model.items()}
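To illustrate the rounding above: a model with 6,738,415,616 parameters is reported as "7B". A quick self-contained check that mirrors the thresholds of `model_parameter_count_rounded_notation()` (re-stated here, not imported from convert.py):

```python
def rounded_notation(n: int) -> str:
    # Same thresholds and rounding as model_parameter_count_rounded_notation() above.
    if n > 1e12:
        return f"{round(n * 1e-12)}T"
    if n > 1e9:
        return f"{round(n * 1e-9)}B"
    if n > 1e6:
        return f"{round(n * 1e-6)}M"
    return f"{round(n * 1e-3)}K"

print(rounded_notation(6_738_415_616))   # 7B
print(rounded_notation(46_702_792_704))  # 47B
```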
@@ -1436,13 +1528,35 @@ class VocabFactory:
    return vocab, special_vocab


-def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
+def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
-    namestr = {
-        GGMLFileType.AllF32: "f32",
-        GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.MostlyQ8_0:"q8_0",
+    quantization = {
+        GGMLFileType.AllF32: "F32",
+        GGMLFileType.MostlyF16: "F16",
+        GGMLFileType.MostlyQ8_0: "Q8_0",
    }[file_type]
-    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
+
+    parameters = model_parameter_count_rounded_notation(model_params_count)
+
+    expert_count = ""
+    if params.n_experts is not None:
+        expert_count = f"{params.n_experts}x"
+
+    version = ""
+    if metadata is not None and metadata.version is not None:
+        version = f"-{metadata.version}"
+
+    name = "ggml-model"
+    if metadata is not None and metadata.name is not None:
+        name = metadata.name
+    elif params.path_model is not None:
+        name = params.path_model.name
+
+    return f"{name}{version}-{expert_count}{parameters}-{quantization}"
+
+
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
+    default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
+    ret = model_paths[0].parent / f"{default_filename}.gguf"
    if ret in model_paths:
        logger.error(
            f"Error: Default output path ({ret}) would overwrite the input. "
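Putting the convention above together, the default name follows the `{name}{version}-{expert_count}{parameters}-{quantization}` pattern plus the `.gguf` suffix. The values below are made-up examples, only meant to show the shape of the result:

```python
name, version, expert_count, parameters, quantization = "MyMoEModel", "-v0.1", "8x", "47B", "Q8_0"
print(f"{name}{version}-{expert_count}{parameters}-{quantization}.gguf")
# MyMoEModel-v0.1-8x47B-Q8_0.gguf
```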
@@ -1480,17 +1594,30 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
    parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
+    parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
+    parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")

    args = parser.parse_args(args_in)

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
-    elif args.dump_single or args.dump:
+    elif args.dump_single or args.dump or args.get_outfile:
        # Avoid printing anything besides the dump output
        logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.INFO)

+    metadata = Metadata.load(args.metadata)
+
+    if args.get_outfile:
+        model_plus = load_some_model(args.model)
+        params = Params.load(model_plus)
+        model = convert_model_names(model_plus.model, params, args.skip_unknown)
+        model_params_count = model_parameter_count(model_plus.model)
+        ftype = pick_output_type(model, args.outtype)
+        print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
+        return
+
    if args.no_vocab and args.vocab_only:
        raise ValueError("--vocab-only does not make sense with --no-vocab")

@@ -1504,6 +1631,9 @@ def main(args_in: list[str] | None = None) -> None:
    else:
        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)

+    model_params_count = model_parameter_count(model_plus.model)
+    logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
+
    if args.dump:
        do_dump_model(model_plus)
        return

@@ -1557,7 +1687,7 @@ def main(args_in: list[str] | None = None) -> None:
            f_norm_eps = 1e-5,
        )
        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-                                    endianess=endianess, pad_vocab=args.pad_vocab)
+                                    endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
        logger.info(f"Wrote {outfile}")
        return

@@ -1570,13 +1700,13 @@ def main(args_in: list[str] | None = None) -> None:
    model = convert_model_names(model, params, args.skip_unknown)
    ftype = pick_output_type(model, args.outtype)
    model = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)

    params.ftype = ftype
    logger.info(f"Writing {outfile}, format {ftype}")

    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
    logger.info(f"Wrote {outfile}")
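A practical consequence of the `--get-outfile` branch above is that the computed default name can be captured from a script while logging stays at the WARNING level. A hypothetical invocation (paths and names are placeholders):

```python
# Illustrative only: ask convert.py for the default output name it would use.
import subprocess

out = subprocess.run(
    ["python", "convert.py", "models/my-model", "--metadata", "metadata.json", "--get-outfile"],
    capture_output=True, text=True, check=True,
)
print(out.stdout.strip())  # e.g. MyModel-v1.0-7B-F16
```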
docs/debugging-tests.md (new file, 88)

@@ -0,0 +1,88 @@
+# Debugging Tests Tips
+
+## How to run & debug a specific test without anything else to keep the feedback loop short?
+
+There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.
+
+For example, running the following command will output an interactive list from which you can select a test. It takes this form:
+
+`debug-test.sh [OPTION]... <test_regex> <test_number>`
+
+It will then build & run in the debugger for you.
+
+```bash
+./scripts/debug-test.sh test-tokenizer
+
+# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
+>>> b main
+```
+
+For further reference use `debug-test.sh -h` to print help.
+
+### How does the script work?
+If you want to be able to use the concepts contained in the script separately, the important ones are briefly outlined below.
+
+#### Step 1: Reset and Setup folder context
+
+From base of this repository, let's create `build-ci-debug` as our build context.
+
+```bash
+rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
+```
+
+#### Step 2: Setup Build Environment and Compile Test Binaries
+
+Setup and trigger a build under debug mode. You may adapt the arguments as needed, but in this case these are sane defaults.
+
+```bash
+cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
+make -j
+```
+
+#### Step 3.1: Identify Test Command for Debugging
+
+The output of this command will give you the command & arguments needed to run GDB.
+
+* `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
+* `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
+* `-V` : Verbose Mode
+
+```bash
+ctest -R "test-tokenizer" -V -N
+```
+
+This may return output similar to below (focusing on key lines to pay attention to):
+
+```bash
+...
+1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
+1: Working Directory: .
+Labels: main
+  Test #1: test-tokenizer-0-llama-spm
+...
+4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf"
+4: Working Directory: .
+Labels: main
+  Test #4: test-tokenizer-0-falcon
+...
+```
+
+So for test #1 we can tell these two pieces of relevant information:
+* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
+* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
+
+#### Step 3.2: Run GDB on test command
+
+Based on the ctest 'test command' report above we can then run a gdb session via this command below:
+
+```bash
+gdb --args ${Test Binary} ${Test GGUF Model}
+```
+
+Example:
+
+```bash
+gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
+```
@@ -49,4 +49,7 @@ else()
    add_subdirectory(server)
    endif()
    add_subdirectory(export-lora)
+    if (LLAMA_RPC)
+        add_subdirectory(rpc)
+    endif()
endif()
@@ -49,6 +49,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
        }

        float * out = output + batch.seq_id[i][0] * n_embd;
+        //TODO: I would also add a parameter here to enable normalization or not.
+        /*fprintf(stdout, "unnormalized_embedding:");
+        for (int hh = 0; hh < n_embd; hh++) {
+            fprintf(stdout, "%9.6f ", embd[hh]);
+        }
+        fprintf(stdout, "\n");*/
        llama_embd_normalize(embd, out, n_embd);
    }
}

@@ -123,10 +129,12 @@ int main(int argc, char ** argv) {
        inputs.push_back(inp);
    }

-    // add SEP if not present
+    // check if the last token is SEP
+    // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
    for (auto & inp : inputs) {
        if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            inp.push_back(llama_token_sep(model));
+            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
+            fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
        }
    }

@@ -203,6 +211,7 @@ int main(int argc, char ** argv) {

    // clean up
    llama_print_timings(ctx);
+    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
@@ -26,16 +26,21 @@ options:
  -m, --model <filename>               (default: models/7B/ggml-model-q4_0.gguf)
  -p, --n-prompt <n>                   (default: 512)
  -n, --n-gen <n>                      (default: 128)
-  -b, --batch-size <n>                 (default: 512)
-  -ctk <t>, --cache-type-k <t>         (default: f16)
-  -ctv <t>, --cache-type-v <t>         (default: f16)
-  -t, --threads <n>                    (default: 112)
+  -pg <pp,tg>                          (default: 512,128)
+  -b, --batch-size <n>                 (default: 2048)
+  -ub, --ubatch-size <n>               (default: 512)
+  -ctk, --cache-type-k <t>             (default: f16)
+  -ctv, --cache-type-v <t>             (default: f16)
+  -t, --threads <n>                    (default: 16)
  -ngl, --n-gpu-layers <n>             (default: 99)
  -sm, --split-mode <none|layer|row>   (default: layer)
  -mg, --main-gpu <i>                  (default: 0)
  -nkvo, --no-kv-offload <0|1>         (default: 0)
+  -fa, --flash-attn <0|1>              (default: 0)
  -mmp, --mmap <0|1>                   (default: 1)
-  -ts, --tensor_split <ts0/ts1/..>     (default: 0)
+  --numa <distribute|isolate|numactl>  (default: disabled)
+  -embd, --embeddings <0|1>            (default: 0)
+  -ts, --tensor-split <ts0/ts1/..>     (default: 0)
  -r, --repetitions <n>                (default: 5)
  -o, --output <csv|json|md|sql>       (default: md)
  -v, --verbose                        (default: 0)

@@ -43,10 +48,11 @@ options:
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
```

-llama-bench can perform two types of tests:
+llama-bench can perform three types of tests:

- Prompt processing (pp): processing a prompt in batches (`-p`)
- Text generation (tg): generating a sequence of tokens (`-n`)
+- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)

With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).
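In practice the new mode is requested with something like `llama-bench -pg 512,128`, i.e. a 512-token prompt followed by 128 generated tokens in a single test; with the markdown-printer changes further below such a run is labelled `pp512+tg128` in the output table.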
@@ -161,10 +161,17 @@ static const char * split_mode_str(llama_split_mode mode) {
    }
}

+static std::string pair_str(const std::pair<int, int> & p) {
+    static char buf[32];
+    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
+    return buf;
+}
+
struct cmd_params {
    std::vector<std::string> model;
    std::vector<int> n_prompt;
    std::vector<int> n_gen;
+    std::vector<std::pair<int, int>> n_pg;
    std::vector<int> n_batch;
    std::vector<int> n_ubatch;
    std::vector<ggml_type> type_k;

@@ -188,6 +195,7 @@ static const cmd_params cmd_params_defaults = {
    /* model */ {"models/7B/ggml-model-q4_0.gguf"},
    /* n_prompt */ {512},
    /* n_gen */ {128},
+    /* n_pg */ {{512, 128}},
    /* n_batch */ {2048},
    /* n_ubatch */ {512},
    /* type_k */ {GGML_TYPE_F16},

@@ -215,10 +223,11 @@ static void print_usage(int /* argc */, char ** argv) {
    printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
    printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
    printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
    printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf(" -ub N, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
-    printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
    printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
    printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());

@@ -304,6 +313,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+        } else if (arg == "-pg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<std::string>(argv[i], ',');
+            if (p.size() != 2) {
+                invalid_param = true;
+                break;
+            }
+            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;

@@ -493,6 +513,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.model.empty()) { params.model = cmd_params_defaults.model; }
    if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
    if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
+    if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
    if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
    if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
    if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }

@@ -632,6 +653,31 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
        };
        instances.push_back(instance);
    }

+        for (const auto & n_pg : params.n_pg) {
+            if (n_pg.first == 0 && n_pg.second == 0) {
+                continue;
+            }
+            cmd_params_instance instance = {
+                /* .model = */ m,
+                /* .n_prompt = */ n_pg.first,
+                /* .n_gen = */ n_pg.second,
+                /* .n_batch = */ nb,
+                /* .n_ubatch = */ nub,
+                /* .type_k = */ tk,
+                /* .type_v = */ tv,
+                /* .n_threads = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .split_mode = */ sm,
+                /* .main_gpu = */ mg,
+                /* .no_kv_offload= */ nkvo,
+                /* .flash_attn = */ fa,
+                /* .tensor_split = */ ts,
+                /* .use_mmap = */ mmp,
+                /* .embeddings = */ embd,
+            };
+            instances.push_back(instance);
+        }
    }

    return instances;

@@ -965,6 +1011,9 @@ struct markdown_printer : public printer {
        if (field == "n_gpu_layers") {
            return 3;
        }
+        if (field == "test") {
+            return 13;
+        }

        int width = std::max((int)field.length(), 10);

@@ -1091,12 +1140,11 @@ struct markdown_printer : public printer {
                value = test::get_backend();
            } else if (field == "test") {
                if (t.n_prompt > 0 && t.n_gen == 0) {
-                    snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
+                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
                } else if (t.n_gen > 0 && t.n_prompt == 0) {
-                    snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
+                    snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
                } else {
-                    assert(false);
-                    exit(1);
+                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
                }
                value = buf;
            } else if (field == "t/s") {

@@ -1297,6 +1345,7 @@ int main(int argc, char ** argv) {
        llama_kv_cache_clear(ctx);

        uint64_t t_start = get_time_ns();

        if (t.n_prompt > 0) {
            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
        }
@@ -189,6 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
    LOG_TEE("\n");

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    if (!ctx_sampling) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
+
    std::string response = "";
    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);

@@ -295,6 +300,19 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    if (prompt_contains_image(params.prompt)) {
+        auto ctx_llava = llava_init_context(&params, model);
+
+        auto image_embed = load_image(ctx_llava, &params, "");
+
+        // process the prompt
+        process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+        llama_print_timings(ctx_llava->ctx_llama);
+        llava_image_embed_free(image_embed);
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+    } else {
    for (auto & image : params.image) {
        auto ctx_llava = llava_init_context(&params, model);

@@ -312,6 +330,8 @@ int main(int argc, char ** argv) {
        ctx_llava->model = NULL;
        llava_free(ctx_llava);
    }
+    }

    llama_free_model(model);

    return 0;
@@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
    struct {
-        struct ggml_tensor * newline;
        struct ggml_context * ctx;
    } model;

@@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>

    model.ctx = ggml_init(params);

-    ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
-    model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
-    if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
-        if (newline_tmp->buffer == NULL) {
-            LOG_TEE("newline_tmp tensor buffer is NULL\n");
-        }
-        ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
-    } else {
-        model.newline->data = newline_tmp->data;
-        if (model.newline->data == NULL) {
-            LOG_TEE("newline_tmp tensor data is NULL\n");
-        }
-    }
-
    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
    // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
    // fill it with the image embeddings, ignoring the base
@@ -546,6 +546,10 @@ int main(int argc, char ** argv) {
    }

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    if (!ctx_sampling) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }

    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict

@@ -902,7 +906,7 @@ int main(int argc, char ** argv) {
            }

            const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-            const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
+            const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
            const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

            LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@ -7,6 +7,8 @@ Also note that finetunes typically result in a higher perplexity value even thou
|
||||||
|
|
||||||
Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16.
|
Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16.
|
||||||
The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`).
|
The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`).
|
||||||
|
When numbers are listed all command line arguments and compilation options are left at their defaults unless noted otherwise.
|
||||||
|
llama.cpp numbers are **not** directly comparable to those of other projects because the exact values depend strongly on the implementation details.
|
||||||
|
|
||||||
By default only the mean perplexity value and the corresponding uncertainty is calculated.
|
By default only the mean perplexity value and the corresponding uncertainty is calculated.
|
||||||
The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per and then applying error propagation.
|
The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per and then applying error propagation.
|
||||||
|
@@ -32,7 +34,13 @@ In addition to the KL divergence the following statistics are calculated with `--kl-divergence`:
 
 ## LLaMA 3 8b Scoreboard
 
-Results are sorted by Kullback-Leibler divergence relative to FP16.
+| Revision | f364eb6f           |
+|:---------|:-------------------|
+| Backend  | CUDA               |
+| CPU      | AMD Epyc 7742      |
+| GPU      | 1x NVIDIA RTX 4090 |
+
+Results were generated using the CUDA backend and are sorted by Kullback-Leibler divergence relative to FP16.
 The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat).
 
 | Quantization | imatrix | Model size [GiB] | PPL | ΔPPL | KLD | Mean Δp | RMS Δp |
@@ -89,6 +97,12 @@ K-quants score better on mean Δp than the legacy quants than e.g. KL divergence
 
 ## LLaMA 2 vs. LLaMA 3 Quantization comparison
 
+| Revision | f364eb6f           |
+|:---------|:-------------------|
+| Backend  | CUDA               |
+| CPU      | AMD Epyc 7742      |
+| GPU      | 1x NVIDIA RTX 4090 |
+
 | Metric     | L2 7b q2_K          | L3 8b q2_K          | L2 7b q4_K_M        | L3 8b q4_K_M        | L2 7b q6_K          | L3 8b q6_K          | L2 7b q8_0          | L3 8b q8_0          |
 |-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
 | Mean PPL   | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 |
@@ -107,6 +121,50 @@ K-quants score better on mean Δp than the legacy quants than e.g. KL divergence
 | RMS Δp     | 9.762 ± 0.053 %     | 21.421 ± 0.079 %    | 3.252 ± 0.024 %     | 5.519 ± 0.050 %     | 1.339 ± 0.010 %     | 2.295 ± 0.019 %     | 0.618 ± 0.011 %     | 1.198 ± 0.007 %     |
 | Same top p | 85.584 ± 0.086 %    | 71.138 ± 0.119 %    | 94.665 ± 0.055 %    | 91.901 ± 0.072 %    | 97.520 ± 0.038 %    | 96.031 ± 0.051 %    | 98.846 ± 0.026 %    | 97.674 ± 0.040 %    |
 
+## LLaMA 3 BF16 vs. FP16 comparison
+
+| Revision | 83330d8c      |
+|:---------|:--------------|
+| Backend  | CPU           |
+| CPU      | AMD Epyc 7742 |
+| GPU      | N/A           |
+
+Results were calculated with LLaMA 3 8b BF16 as `--kl-divergence-base` and LLaMA 3 8b FP16 as the `--model` for comparison.
+
+| Metric                         | Value                    |
+|--------------------------------|--------------------------|
+| Mean PPL(Q)                    | 6.227711 ± 0.037833      |
+| Mean PPL(base)                 | 6.225194 ± 0.037771      |
+| Cor(ln(PPL(Q)), ln(PPL(base))) | 99.990%                  |
+| Mean ln(PPL(Q)/PPL(base))      | 0.000404 ± 0.000086      |
+| Mean PPL(Q)/PPL(base)          | 1.000404 ± 0.000086      |
+| Mean PPL(Q)-PPL(base)          | 0.002517 ± 0.000536      |
+| Mean KLD                       | 0.00002515 ± 0.00000020  |
+| Maximum KLD                    | 0.012206                 |
+| 99.9% KLD                      | 0.000799                 |
+| 99.0% KLD                      | 0.000222                 |
+| Median KLD                     | 0.000013                 |
+| 10.0% KLD                      | -0.000002                |
+| 5.0% KLD                       | -0.000008                |
+| 1.0% KLD                       | -0.000023                |
+| Minimum KLD                    | -0.000059                |
+| Mean Δp                        | -0.0000745 ± 0.0003952 % |
+| Maximum Δp                     | 4.186%                   |
+| 99.9% Δp                       | 1.049%                   |
+| 99.0% Δp                       | 0.439%                   |
+| 95.0% Δp                       | 0.207%                   |
+| 90.0% Δp                       | 0.125%                   |
+| 75.0% Δp                       | 0.029%                   |
+| Median Δp                      | 0.000%                   |
+| 25.0% Δp                       | -0.030%                  |
+| 10.0% Δp                       | -0.126%                  |
+| 5.0% Δp                        | -0.207%                  |
+| 1.0% Δp                        | -0.434%                  |
+| 0.1% Δp                        | -1.016%                  |
+| Minimum Δp                     | -4.672%                  |
+| RMS Δp                         | 0.150 ± 0.001 %          |
+| Same top p                     | 99.739 ± 0.013 %         |
+
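For reference, a sketch of how these per-token statistics are defined (my paraphrase, not text from this patch): with $p_i$ the base model's distribution and $q_i$ the compared (e.g. quantized) model's distribution over the vocabulary at position $i$, and $t_i$ the token actually observed in the test text,

$$
\mathrm{KLD}_i = \sum_{v} p_i(v)\,\ln\frac{p_i(v)}{q_i(v)},
\qquad
\Delta p_i = q_i(t_i) - p_i(t_i),
$$

the tables then report means, percentiles and RMS values of these quantities over all evaluated tokens, and "Same top p", as I read it, counts how often both models rank the same token highest.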
## Old Numbers
2 examples/rpc/CMakeLists.txt Normal file
@@ -0,0 +1,2 @@
add_executable(rpc-server rpc-server.cpp)
target_link_libraries(rpc-server PRIVATE ggml llama)
74 examples/rpc/README.md Normal file
@@ -0,0 +1,74 @@
## Overview

The `rpc-server` allows running a `ggml` backend on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:

```mermaid
flowchart TD
    rpcb---|TCP|srva
    rpcb---|TCP|srvb
    rpcb-.-|TCP|srvn
    subgraph hostn[Host N]
    srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
    end
    subgraph hostb[Host B]
    srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
    end
    subgraph hosta[Host A]
    srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
    end
    subgraph host[Main Host]
    ggml[llama.cpp]---rpcb[RPC backend]
    end
    style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
```

Each host can run a different backend, e.g. one with CUDA and another with Metal.
You can also run multiple `rpc-server` instances on the same host, each with a different backend.

## Usage

On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
For example, to build the CUDA backend with RPC support:

```bash
mkdir build-rpc-cuda
cd build-rpc-cuda
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
cmake --build . --config Release
```

Then, start the `rpc-server` with the backend:

```bash
$ bin/rpc-server 0.0.0.0 50052
create_backend: using CUDA backend
ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
Starting RPC server on 0.0.0.0:50052
```

When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:

```bash
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
```

This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.

On the main host, build `llama.cpp` with only `-DLLAMA_RPC=ON`:

```bash
mkdir build-rpc
cd build-rpc
cmake .. -DLLAMA_RPC=ON
cmake --build . --config Release
```

Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:

```bash
$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```
70 examples/rpc/rpc-server.cpp Normal file
@@ -0,0 +1,70 @@
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include "ggml-rpc.h"
#include <string>
#include <stdio.h>

static ggml_backend_t create_backend() {
    ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUDA
    fprintf(stderr, "%s: using CUDA backend\n", __func__);
    backend = ggml_backend_cuda_init(0); // init device 0
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
    }
#elif GGML_USE_METAL
    fprintf(stderr, "%s: using Metal backend\n", __func__);
    backend = ggml_backend_metal_init();
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
    }
#endif

    // if there aren't GPU Backends fallback to CPU backend
    if (!backend) {
        fprintf(stderr, "%s: using CPU backend\n", __func__);
        backend = ggml_backend_cpu_init();
    }
    return backend;
}

static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
#ifdef GGML_USE_CUDA
    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
#else
    // TODO: implement for other backends
    *free_mem = 1;
    *total_mem = 1;
#endif
}

int main(int argc, char * argv[]) {
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
        return 1;
    }
    const char * host = argv[1];
    int port = std::stoi(argv[2]);
    if (port <= 0 || port > 65535) {
        fprintf(stderr, "Invalid port number: %d\n", port);
        return 1;
    }
    ggml_backend_t backend = create_backend();
    if (!backend) {
        fprintf(stderr, "Failed to create backend\n");
        return 1;
    }
    printf("Starting RPC server on %s:%d\n", host, port);
    size_t free_mem, total_mem;
    get_backend_memory(&free_mem, &total_mem);
    std::string endpoint = std::string(host) + ":" + std::to_string(port);
    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
    ggml_backend_free(backend);
    return 0;
}
@@ -48,7 +48,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
 - `--path`: Path from which to serve static files. Default: disabled
 - `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
 - `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
-- `--embedding`: Enable embedding extraction. Default: disabled
+- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
 - `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`
 - `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
 - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
@@ -293,13 +293,14 @@ def start_server_background(args):
 
 
 def is_server_listening(server_fqdn, server_port):
-    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-        result = sock.connect_ex((server_fqdn, server_port))
-        _is_server_listening = result == 0
-        if _is_server_listening:
-            print(f"server is listening on {server_fqdn}:{server_port}...")
-        return _is_server_listening
+    try:
+        url = f"{server_fqdn}:{server_port}/health"
+        if not url.startswith("http://"):
+            url = f"http://{url}"
+        result = requests.get(url)
+        return result.status_code == 200
+    except Exception:
+        return False
 
 
 def escape_metric_name(metric_name):
     return re.sub('[^A-Z0-9]', '_', metric_name.upper())
@@ -651,9 +651,6 @@ struct server_context {
     std::string              system_prompt;
     std::vector<llama_token> system_tokens;
 
-    std::string name_user;      // this should be the antiprompt
-    std::string name_assistant;
-
     // slots / clients
     std::vector<server_slot> slots;
     json default_generation_settings_for_props;
@@ -673,6 +670,15 @@ struct server_context {
             llama_free_model(model);
             model = nullptr;
         }
+
+        // Clear any sampling context
+        for (server_slot & slot : slots) {
+            if (slot.ctx_sampling != nullptr) {
+                llama_sampling_free(slot.ctx_sampling);
+            }
+        }
+
+        llama_batch_free(batch);
     }
 
     bool load_model(const gpt_params & params_) {
@@ -1098,15 +1104,11 @@ struct server_context {
         system_need_update = false;
     }
 
-    void system_prompt_set(const json & sys_props) {
-        system_prompt  = sys_props.value("prompt", "");
-        name_user      = sys_props.value("anti_prompt", "");
-        name_assistant = sys_props.value("assistant_name", "");
+    bool system_prompt_set(const std::string & sys_prompt) {
+        system_prompt = sys_prompt;
 
         LOG_VERBOSE("system prompt process", {
             {"system_prompt", system_prompt},
-            {"name_user", name_user},
-            {"name_assistant", name_assistant},
         });
 
         // release all slots
@@ -1115,6 +1117,7 @@ struct server_context {
         }
 
         system_need_update = true;
+        return true;
     }
 
     bool process_token(completion_token_output & result, server_slot & slot) {
@@ -1534,7 +1537,8 @@ struct server_context {
                     }
 
                     if (task.data.contains("system_prompt")) {
-                        system_prompt_set(task.data.at("system_prompt"));
+                        std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
+                        system_prompt_set(sys_prompt);
 
                         for (server_slot & slot : slots) {
                             slot.n_past = 0;
@@ -2270,10 +2274,10 @@ struct server_context {
 
         const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
         if (n_probs > 0) {
-            const size_t n_considered = slot.ctx_sampling->n_considered;
+            const size_t n_valid = slot.ctx_sampling->n_valid;
 
             // Make sure at least n_probs top tokens are at the front of the vector:
-            if (slot.sparams.temp == 0.0f && n_probs > n_considered) {
+            if (slot.sparams.temp == 0.0f && n_probs > n_valid) {
                 llama_sample_top_k(ctx, &cur_p, n_probs, 0);
             }
 
@@ -2289,7 +2293,7 @@ struct server_context {
             for (size_t i = 0; i < n_probs; ++i) {
                 result.probs.push_back({
                     cur_p.data[i].id,
-                    i >= n_considered ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
+                    i >= n_valid      ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
                 });
             }
         }
@@ -2918,7 +2922,7 @@ int main(int argc, char ** argv) {
     server_params_parse(argc, argv, sparams, params);
 
     if (!sparams.system_prompt.empty()) {
-        ctx_server.system_prompt_set(json::parse(sparams.system_prompt));
+        ctx_server.system_prompt_set(sparams.system_prompt);
     }
 
     if (params.model_alias == "unknown") {
@@ -3407,8 +3411,7 @@ int main(int argc, char ** argv) {
     const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
         json data = {
-            { "user_name",                   ctx_server.name_user.c_str() },
-            { "assistant_name",              ctx_server.name_assistant.c_str() },
+            { "system_prompt",               ctx_server.system_prompt.c_str() },
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots",                 ctx_server.params.n_parallel }
         };
@@ -887,6 +887,7 @@ async def oai_chat_completions(user_prompt,
                                base_path,
                                async_client,
                                debug=False,
+                               temperature=None,
                                model=None,
                                n_predict=None,
                                enable_streaming=None,
@@ -913,7 +914,8 @@ async def oai_chat_completions(user_prompt,
             "model": model,
             "max_tokens": n_predict,
             "stream": enable_streaming,
-            "seed": seed
+            "temperature": temperature if temperature is not None else 0.0,
+            "seed": seed,
         }
         if response_format is not None:
             payload['response_format'] = response_format
@@ -978,7 +980,8 @@ async def oai_chat_completions(user_prompt,
                 max_tokens=n_predict,
                 stream=enable_streaming,
                 response_format=payload.get('response_format'),
-                seed=seed
+                seed=seed,
+                temperature=payload['temperature']
             )
         except openai.error.AuthenticationError as e:
             if expect_api_error is not None and expect_api_error:
@@ -371,7 +371,7 @@ static json oaicompat_completion_params_parse(
     llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
     llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
     llama_params["stream"]            = json_value(body, "stream", false);
-    llama_params["temperature"]       = json_value(body, "temperature", 0.0);
+    llama_params["temperature"]       = json_value(body, "temperature", 1.0);
     llama_params["top_p"]             = json_value(body, "top_p", 1.0);
 
     // Apply chat template to the list of messages
@@ -1182,9 +1182,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
 static char * fmt_size(size_t size) {
     static char buffer[128];
     if (size >= 1024*1024) {
-        sprintf(buffer, "%zuM", size/1024/1024);
+        snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
     } else {
-        sprintf(buffer, "%zuK", size/1024);
+        snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
     }
     return buffer;
 }
@@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t
 
     tensor->buffer  = buffer;
    tensor->data    = (char *)tensor->view_src->data + tensor->view_offs;
-    tensor->backend = tensor->view_src->backend;
     ggml_backend_buffer_init_tensor(buffer, tensor);
 }
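The `sprintf` → `snprintf` change above is a bounds-safety fix: the formatted string is now truncated to the static 128-byte buffer instead of potentially writing past it. A minimal, self-contained sketch of the same pattern (hypothetical `fmt_size_sketch` helper, not the project's code):

```cpp
#include <cstdio>
#include <cstddef>

// Mirrors the fmt_size pattern: snprintf never writes more than
// sizeof(buffer) bytes (including the terminating '\0'), so an oversized
// result is truncated rather than overflowing the static buffer.
static const char * fmt_size_sketch(size_t size) {
    static char buffer[128];
    if (size >= 1024*1024) {
        snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
    } else {
        snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
    }
    return buffer; // points at a static buffer: not reentrant / thread-safe
}

int main() {
    printf("%s\n", fmt_size_sketch(3u*1024*1024)); // prints "3M"
    printf("%s\n", fmt_size_sketch(512*1024));     // prints "512K"
}
```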
22 ggml-cuda.cu
@@ -4,7 +4,6 @@
 
 #include "ggml-cuda/common.cuh"
 #include "ggml-cuda/acc.cuh"
-#include "ggml-cuda/alibi.cuh"
 #include "ggml-cuda/arange.cuh"
 #include "ggml-cuda/argsort.cuh"
 #include "ggml-cuda/binbcast.cuh"
@@ -2205,6 +2204,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_UNARY_OP_RELU:
             ggml_cuda_op_relu(ctx, dst);
             break;
+        case GGML_UNARY_OP_SIGMOID:
+            ggml_cuda_op_sigmoid(ctx, dst);
+            break;
         case GGML_UNARY_OP_HARDSIGMOID:
             ggml_cuda_op_hardsigmoid(ctx, dst);
             break;
@@ -2277,9 +2279,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ROPE:
             ggml_cuda_op_rope(ctx, dst);
             break;
-        case GGML_OP_ALIBI:
-            ggml_cuda_op_alibi(ctx, dst);
-            break;
         case GGML_OP_IM2COL:
             ggml_cuda_op_im2col(ctx, dst);
             break;
@@ -2559,7 +2558,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
         }
 
         // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (cuda_graph_update_required) {
+        if (use_cuda_graph && cuda_graph_update_required) {
             cuda_ctx->cuda_graph->number_consecutive_updates++;
         } else {
             cuda_ctx->cuda_graph->number_consecutive_updates = 0;
@@ -2714,12 +2713,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 }
 
 GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                 case GGML_UNARY_OP_HARDSIGMOID:
                 case GGML_UNARY_OP_HARDSWISH:
                 case GGML_UNARY_OP_GELU_QUICK:
@@ -2829,7 +2830,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
@@ -2841,8 +2841,16 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
-        case GGML_OP_FLASH_ATTN_EXT:
             return true;
+        case GGML_OP_FLASH_ATTN_EXT:
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+            return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+#else
+            if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
+                return true;
+            }
+            return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         default:
             return false;
     }
@@ -1,63 +0,0 @@
-#include "alibi.cuh"
-
-static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
-                                 const int n_heads_log2_floor, const float m0, const float m1) {
-    const int col = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i   = row*ncols + col;
-
-    const int k = row/k_rows;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = powf(m0, k + 1);
-    } else {
-        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    dst[i] = col * m_k + x[i];
-}
-
-static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
-                           const int k_rows, const int n_heads_log2_floor, const float m0,
-                           const float m1, cudaStream_t stream) {
-    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
-    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
-    const dim3 block_nums(num_blocks_x, nrows, 1);
-    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
-}
-
-void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    //GGML_ASSERT(ne01 + n_past == ne00);
-    GGML_ASSERT(n_head == ne02);
-
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    alibi_f32_cuda(src0_d, dst_d, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, stream);
-}
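The deleted kernel applied the ALiBi bias as a standalone op; a compact restatement of what it computed (my notation, derived from the removed code above): with $n$ attention heads and $n_{\mathrm{floor}} = 2^{\lfloor \log_2 n \rfloor}$,

$$
m_0 = 2^{-\mathrm{max\_bias}/n_{\mathrm{floor}}},
\qquad
m_1 = 2^{-\mathrm{max\_bias}/(2\,n_{\mathrm{floor}})},
\qquad
m_k =
\begin{cases}
m_0^{\,k+1} & k < n_{\mathrm{floor}},\\
m_1^{\,2(k-n_{\mathrm{floor}})+1} & \text{otherwise},
\end{cases}
$$

and each element of head $k$ gets $m_k \cdot \mathrm{col}$ added to it. After this change the same per-head slope is applied inside the softmax / FlashAttention kernels (note the `max_bias`, `m0`, `m1` parameters of the new `fattn` kernels below) rather than as a separate `GGML_OP_ALIBI` op.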
@@ -1,5 +0,0 @@
-#include "common.cuh"
-
-#define CUDA_ALIBI_BLOCK_SIZE 32
-
-void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -321,6 +321,10 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 
 #define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
 
+static bool fast_fp16_available(const int cc) {
+    return cc >= CC_PASCAL && cc != 610;
+}
+
 static bool fp16_mma_available(const int cc) {
     return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
 }
47 ggml-cuda/fattn-common.cuh Normal file
@@ -0,0 +1,47 @@
#define FATTN_KQ_STRIDE 256
#define HALF_MAX_HALF         __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
#define SOFTMAX_FTZ_THRESHOLD -20.0f                   // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.

template<int D, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_combine_results(
        const float  * __restrict__ VKQ_parts,
        const float2 * __restrict__ VKQ_meta,
        float        * __restrict__ dst) {
    VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
    VKQ_meta  += parallel_blocks   * gridDim.y*blockIdx.x;
    dst       += D                 * gridDim.y*blockIdx.x;

    const int tid = threadIdx.x;
    __builtin_assume(tid < D);

    __shared__ float2 meta[parallel_blocks];
    if (tid < 2*parallel_blocks) {
        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
    }

    __syncthreads();

    float kqmax = meta[0].x;
#pragma unroll
    for (int l = 1; l < parallel_blocks; ++l) {
        kqmax = max(kqmax, meta[l].x);
    }

    float VKQ_numerator   = 0.0f;
    float VKQ_denominator = 0.0f;
#pragma unroll
    for (int l = 0; l < parallel_blocks; ++l) {
        const float diff = meta[l].x - kqmax;
        const float KQ_max_scale = expf(diff);
        const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
        *((uint32_t *) &KQ_max_scale) &= ftz_mask;

        VKQ_numerator   += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
        VKQ_denominator += KQ_max_scale * meta[l].y;
    }

    dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
}
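A compact restatement of what `flash_attn_combine_results` computes (my notation): each of the `parallel_blocks` partial results $l$ carries a running softmax maximum $m_l$, a denominator $s_l$ and an unnormalized output $o_l$ (stored in `VKQ_meta` and `VKQ_parts`); with $m = \max_l m_l$ the merged output per element is

$$
o = \frac{\sum_l e^{\,m_l - m}\, o_l}{\sum_l e^{\,m_l - m}\, s_l},
$$

i.e. the standard numerically stable merge of partial softmax accumulators, with scale factors whose exponent falls below `SOFTMAX_FTZ_THRESHOLD` flushed to zero.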
430 ggml-cuda/fattn-vec-f16.cu Normal file
@ -0,0 +1,430 @@
|
||||||
|
#include "common.cuh"
|
||||||
|
#include "fattn-common.cuh"
|
||||||
|
#include "fattn-vec-f16.cuh"
|
||||||
|
|
||||||
|
template<int D, int ncols, int parallel_blocks> // D == head size
|
||||||
|
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
|
__launch_bounds__(D, 1)
|
||||||
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
|
static __global__ void flash_attn_vec_ext_f16(
|
||||||
|
const char * __restrict__ Q,
|
||||||
|
const char * __restrict__ K,
|
||||||
|
const char * __restrict__ V,
|
||||||
|
const char * __restrict__ mask,
|
||||||
|
float * __restrict__ dst,
|
||||||
|
float2 * __restrict__ dst_meta,
|
||||||
|
const float scale,
|
||||||
|
const float max_bias,
|
||||||
|
const float m0,
|
||||||
|
const float m1,
|
||||||
|
const uint32_t n_head_log2,
|
||||||
|
const int ne00,
|
||||||
|
const int ne01,
|
||||||
|
const int ne02,
|
||||||
|
const int ne03,
|
||||||
|
const int ne10,
|
||||||
|
const int ne11,
|
||||||
|
const int ne12,
|
||||||
|
const int ne13,
|
||||||
|
const int ne31,
|
||||||
|
const int nb31,
|
||||||
|
const int nb01,
|
||||||
|
const int nb02,
|
||||||
|
const int nb03,
|
||||||
|
const int nb11,
|
||||||
|
const int nb12,
|
||||||
|
const int nb13,
|
||||||
|
const int ne0,
|
||||||
|
const int ne1,
|
||||||
|
const int ne2,
|
||||||
|
const int ne3) {
|
||||||
|
#if FP16_AVAILABLE
|
||||||
|
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||||
|
|
||||||
|
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||||
|
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
||||||
|
|
||||||
|
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||||
|
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
|
||||||
|
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
|
||||||
|
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
|
||||||
|
const half * maskh = (const half *) mask + ne11*ic0;
|
||||||
|
|
||||||
|
const int stride_KV = nb11 / sizeof(half);
|
||||||
|
const int stride_KV2 = nb11 / sizeof(half2);
|
||||||
|
|
||||||
|
half slopeh = __float2half(1.0f);
|
||||||
|
|
||||||
|
// ALiBi
|
||||||
|
if (max_bias > 0.0f) {
|
||||||
|
const int h = blockIdx.y;
|
||||||
|
|
||||||
|
const float base = h < n_head_log2 ? m0 : m1;
|
||||||
|
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||||
|
|
||||||
|
slopeh = __float2half(powf(base, exph));
|
||||||
|
}
|
||||||
|
|
||||||
|
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||||
|
constexpr int nwarps = D / WARP_SIZE;
|
||||||
|
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||||
|
__builtin_assume(tid < D);
|
||||||
|
|
||||||
|
__shared__ half KQ[ncols*D];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
KQ[j*D + tid] = -HALF_MAX_HALF;
|
||||||
|
}
|
||||||
|
half2 * KQ2 = (half2 *) KQ;
|
||||||
|
|
||||||
|
half kqmax[ncols];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
kqmax[j] = -HALF_MAX_HALF;
|
||||||
|
}
|
||||||
|
half kqsum[ncols] = {0.0f};
|
||||||
|
|
||||||
|
__shared__ half kqmax_shared[ncols][WARP_SIZE];
|
||||||
|
__shared__ half kqsum_shared[ncols][WARP_SIZE];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
if (threadIdx.y == 0) {
|
||||||
|
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
|
||||||
|
kqsum_shared[j][threadIdx.x] = 0.0f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
// Convert Q to half2 and store in registers:
|
||||||
|
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||||
|
const int i = i0 + threadIdx.x;
|
||||||
|
|
||||||
|
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
|
||||||
|
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
half2 VKQ[ncols] = {{0.0f, 0.0f}};
|
||||||
|
|
||||||
|
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
|
||||||
|
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
|
||||||
|
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||||
|
|
||||||
|
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
|
||||||
|
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
|
||||||
|
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
|
||||||
|
half kqmax_new = kqmax[0];
|
||||||
|
half kqmax_new_arr[ncols];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
kqmax_new_arr[j] = kqmax[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
|
||||||
|
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||||
|
|
||||||
|
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
half2 sum2[ncols] = {{0.0f, 0.0f}};
|
||||||
|
#pragma unroll
|
||||||
|
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||||
|
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||||
|
|
||||||
|
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
sum2[j] = warp_reduce_sum(sum2[j]);
|
||||||
|
half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
|
||||||
|
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
||||||
|
|
||||||
|
if (ncols == 1) {
|
||||||
|
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
|
||||||
|
} else {
|
||||||
|
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
KQ[j*D + i_KQ] = sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
|
||||||
|
|
||||||
|
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
|
||||||
|
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||||
|
|
||||||
|
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
|
||||||
|
kqmax[j] = kqmax_new_j;
|
||||||
|
|
||||||
|
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
|
||||||
|
kqsum[j] = kqsum[j]*KQ_max_scale + val;
|
||||||
|
KQ[j*D + tid] = val;
|
||||||
|
|
||||||
|
VKQ[j] *= __half2half2(KQ_max_scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int k0 = 0; k0 < D; k0 += 2) {
|
||||||
|
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
half2 V_k;
|
||||||
|
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
|
||||||
|
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
kqsum[j] = warp_reduce_sum(kqsum[j]);
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
kqsum_shared[j][threadIdx.y] = kqsum[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||||
|
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
||||||
|
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
||||||
|
|
||||||
|
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
|
||||||
|
if (parallel_blocks == 1) {
|
||||||
|
dst_val /= kqsum[j_VKQ];
|
||||||
|
}
|
||||||
|
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
|
||||||
|
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (parallel_blocks != 1 && tid != 0) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int j = 0; j < ncols; ++j) {
|
||||||
|
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
NO_DEVICE_CODE;
|
||||||
|
#endif // FP16_AVAILABLE
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
|
||||||
|
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
||||||
|
ggml_cuda_pool & pool, cudaStream_t main_stream
|
||||||
|
) {
|
||||||
|
ggml_cuda_pool_alloc<float> dst_tmp(pool);
|
||||||
|
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
|
||||||
|
|
||||||
|
if (parallel_blocks > 1) {
|
||||||
|
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
|
||||||
|
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
|
||||||
|
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
||||||
|
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
|
||||||
|
const int shmem = 0;
|
||||||
|
|
||||||
|
float scale = 1.0f;
|
||||||
|
float max_bias = 0.0f;
|
||||||
|
|
||||||
|
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
||||||
|
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
||||||
|
|
||||||
|
const uint32_t n_head = Q->ne[2];
|
||||||
|
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||||
|
|
||||||
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||||
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||||
|
|
||||||
|
flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
|
||||||
|
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
||||||
|
(const char *) Q->data,
|
||||||
|
(const char *) K->data,
|
||||||
|
(const char *) V->data,
|
||||||
|
mask ? ((const char *) mask->data) : nullptr,
|
||||||
|
parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
|
||||||
|
scale, max_bias, m0, m1, n_head_log2,
|
||||||
|
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||||
|
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||||
|
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||||
|
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||||
|
K->nb[1], K->nb[2], K->nb[3],
|
||||||
|
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||||
|
);
|
||||||
|
CUDA_CHECK(cudaGetLastError());
|
||||||
|
|
||||||
|
if (parallel_blocks == 1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const dim3 block_dim_combine(D, 1, 1);
|
||||||
|
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
|
||||||
|
const int shmem_combine = 0;
|
||||||
|
|
||||||
|
flash_attn_combine_results<D, parallel_blocks>
|
||||||
|
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
|
||||||
|
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
|
||||||
|
CUDA_CHECK(cudaGetLastError());
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
const ggml_tensor * Q = dst->src[0];
|
||||||
|
const ggml_tensor * K = dst->src[1];
|
||||||
|
const ggml_tensor * V = dst->src[2];
|
||||||
|
|
||||||
|
const ggml_tensor * mask = dst->src[3];
|
||||||
|
|
||||||
|
ggml_tensor * KQV = dst;
|
||||||
|
|
||||||
|
const int32_t precision = KQV->op_params[2];
|
||||||
|
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||||
|
|
||||||
|
constexpr int cols_per_block = 1;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 256:
|
||||||
|
launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
const ggml_tensor * Q = dst->src[0];
|
||||||
|
const ggml_tensor * K = dst->src[1];
|
||||||
|
const ggml_tensor * V = dst->src[2];
|
||||||
|
|
||||||
|
const ggml_tensor * mask = dst->src[3];
|
||||||
|
|
||||||
|
ggml_tensor * KQV = dst;
|
||||||
|
|
||||||
|
const int32_t precision = KQV->op_params[2];
|
||||||
|
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||||
|
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||||
|
|
||||||
|
if (Q->ne[1] == 1) {
|
||||||
|
constexpr int cols_per_block = 1;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] == 2) {
|
||||||
|
constexpr int cols_per_block = 2;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] <= 4) {
|
||||||
|
constexpr int cols_per_block = 4;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] <= 8) {
|
||||||
|
constexpr int cols_per_block = 8;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr int cols_per_block = 8;
|
||||||
|
constexpr int parallel_blocks = 1;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
5 ggml-cuda/fattn-vec-f16.cuh Normal file
@@ -0,0 +1,5 @@
#include "common.cuh"

void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
384 ggml-cuda/fattn-vec-f32.cu Normal file
|
@ -0,0 +1,384 @@
|
||||||
|
#include "common.cuh"
|
||||||
|
#include "fattn-common.cuh"
|
||||||
|
#include "fattn-vec-f32.cuh"
|
||||||
|
|
||||||
|
template<int D, int ncols, int parallel_blocks> // D == head size
|
||||||
|
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
|
__launch_bounds__(D, 1)
|
||||||
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
|
static __global__ void flash_attn_vec_ext_f32(
|
||||||
|
const char * __restrict__ Q,
|
||||||
|
const char * __restrict__ K,
|
||||||
|
const char * __restrict__ V,
|
||||||
|
const char * __restrict__ mask,
|
||||||
|
float * __restrict__ dst,
|
||||||
|
float2 * __restrict__ dst_meta,
|
||||||
|
const float scale,
|
||||||
|
const float max_bias,
|
||||||
|
const float m0,
|
||||||
|
const float m1,
|
||||||
|
const uint32_t n_head_log2,
|
||||||
|
const int ne00,
|
||||||
|
const int ne01,
|
||||||
|
const int ne02,
|
||||||
|
const int ne03,
|
||||||
|
const int ne10,
|
||||||
|
const int ne11,
|
||||||
|
const int ne12,
|
||||||
|
const int ne13,
|
||||||
|
const int ne31,
|
||||||
|
const int nb31,
|
||||||
|
const int nb01,
|
||||||
|
const int nb02,
|
||||||
|
const int nb03,
|
||||||
|
const int nb11,
|
||||||
|
const int nb12,
|
||||||
|
const int nb13,
|
||||||
|
const int ne0,
|
||||||
|
const int ne1,
|
||||||
|
const int ne2,
|
||||||
|
const int ne3) {
|
||||||
|
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||||
|
|
||||||
|
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||||
|
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
||||||
|
|
||||||
|
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||||
|
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
|
||||||
|
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
|
||||||
|
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
|
||||||
|
const half * maskh = (const half *) mask + ne11*ic0;
|
||||||
|
|
||||||
|
const int stride_KV = nb11 / sizeof(half);
|
||||||
|
const int stride_KV2 = nb11 / sizeof(half2);
|
||||||
|
|
||||||
|
float slope = 1.0f;
|
||||||
|
|
||||||
|
// ALiBi
|
||||||
|
if (max_bias > 0.0f) {
|
||||||
|
const int h = blockIdx.y;
|
||||||
|
|
||||||
|
const float base = h < n_head_log2 ? m0 : m1;
|
||||||
|
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||||
|
|
||||||
|
slope = powf(base, exph);
|
||||||
|
}
|
||||||
|
|
||||||
|
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||||
|
constexpr int nwarps = D / WARP_SIZE;
|
||||||
|
    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
    __builtin_assume(tid < D);

    __shared__ float KQ[ncols*D];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        KQ[j*D + tid] = -FLT_MAX/2.0f;
    }

    float kqmax[ncols];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqmax[j] = -FLT_MAX/2.0f;
    }
    float kqsum[ncols] = {0.0f};

    __shared__ float kqmax_shared[ncols][WARP_SIZE];
    __shared__ float kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        if (threadIdx.y == 0) {
            kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }
    __syncthreads();

    // Convert Q to half2 and store in registers:
    float2 Q_h2[ncols][D/(2*WARP_SIZE)];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
#pragma unroll
        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;

            Q_h2[j][i0/WARP_SIZE] = Q_f2[j*(nb01/sizeof(float2)) + i];
            Q_h2[j][i0/WARP_SIZE].x *= scale;
            Q_h2[j][i0/WARP_SIZE].y *= scale;
        }
    }

    float VKQ[ncols] = {0.0f};

    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        float kqmax_new_arr[ncols];
#pragma unroll
        for (int j = 0; j < ncols; ++j) {
            kqmax_new_arr[j] = kqmax[j];
        }

#pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
            const int i_KQ = i_KQ_0 + threadIdx.y;

            if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
                break;
            }

            float sum[ncols] = {0.0f};
#pragma unroll
            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
                const int k_KQ = k_KQ_0 + threadIdx.x;

                const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
#pragma unroll
                for (int j = 0; j < ncols; ++j) {
                    sum[j] += __low2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].x;
                    sum[j] += __high2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].y;
                }
            }

#pragma unroll
            for (int j = 0; j < ncols; ++j) {
                sum[j] = warp_reduce_sum(sum[j]);
                sum[j] += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;

                kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum[j]);

                if (threadIdx.x == 0) {
                    KQ[j*D + i_KQ] = sum[j];
                }
            }
        }

#pragma unroll
        for (int j = 0; j < ncols; ++j) {
            float kqmax_new_j = kqmax_new_arr[j];

            kqmax_new_j = warp_reduce_max(kqmax_new_j);
            if (threadIdx.x == 0) {
                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
            }
        }

        __syncthreads();

#pragma unroll
        for (int j = 0; j < ncols; ++j) {
            float kqmax_new_j = kqmax_shared[j][threadIdx.x];
            kqmax_new_j = warp_reduce_max(kqmax_new_j);

            const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
            kqmax[j] = kqmax_new_j;

            const float val = expf(KQ[j*D + tid] - kqmax[j]);
            kqsum[j] = kqsum[j]*KQ_max_scale + val;
            KQ[j*D + tid] = val;

            VKQ[j] *= KQ_max_scale;
        }

        __syncthreads();

#pragma unroll
        for (int k = 0; k < D; ++k) {
            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) {
                break;
            }

            const float V_ki = __half2float(V_h[(k_VKQ_0 + k)*stride_KV + tid]);
#pragma unroll
            for (int j = 0; j < ncols; ++j) {
                VKQ[j] += V_ki*KQ[j*D + k];
            }
        }

        __syncthreads();
    }

#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqsum[j] = warp_reduce_sum(kqsum[j]);
        if (threadIdx.x == 0) {
            kqsum_shared[j][threadIdx.y] = kqsum[j];
        }
    }

    __syncthreads();

#pragma unroll
    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);

        float dst_val = VKQ[j_VKQ];
        if (parallel_blocks == 1) {
            dst_val /= kqsum[j_VKQ];
        }
        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
    }

    if (parallel_blocks != 1 && tid != 0) {
#pragma unroll
        for (int j = 0; j < ncols; ++j) {
            dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
        }
    }
}
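
Editor's aside: the kernel above is an online (streaming) softmax — it keeps a running maximum, a running denominator and a rescaled numerator per query column. A minimal host-side C++ sketch of that update, illustrative only and not part of this commit (all names are made up):

#include <math.h>
#include <stdio.h>
#include <vector>

int main() {
    const std::vector<float> scores = {0.5f, 2.0f, -1.0f, 3.0f}; // KQ values for one query
    const std::vector<float> values = {1.0f, 2.0f,  3.0f, 4.0f}; // matching V entries (head size 1)

    float m   = -INFINITY; // running maximum (kqmax)
    float s   = 0.0f;      // running sum of exponentials (kqsum)
    float acc = 0.0f;      // running V*softmax numerator (VKQ)

    for (size_t i = 0; i < scores.size(); ++i) {
        const float m_new = fmaxf(m, scores[i]);
        const float rescale = expf(m - m_new);    // rescale previously accumulated state
        const float p       = expf(scores[i] - m_new);
        s   = s*rescale + p;
        acc = acc*rescale + p*values[i];
        m   = m_new;
    }
    printf("attention output: %f\n", acc / s); // matches a conventional two-pass softmax
    return 0;
}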

template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f32(
        const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
        ggml_cuda_pool & pool, cudaStream_t main_stream
) {
    ggml_cuda_pool_alloc<float> dst_tmp(pool);
    ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);

    if (parallel_blocks > 1) {
        dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
        dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
    }

    constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
    const dim3 block_dim(WARP_SIZE, nwarps, 1);
    const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
    const int shmem = 0;

    float scale = 1.0f;
    float max_bias = 0.0f;

    memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
    memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));

    const uint32_t n_head = Q->ne[2];
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));

    const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>
        <<<blocks_num, block_dim, shmem, main_stream>>> (
            (const char *) Q->data,
            (const char *) K->data,
            (const char *) V->data,
            mask ? ((const char *) mask->data) : nullptr,
            parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
            scale, max_bias, m0, m1, n_head_log2,
            Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
            K->ne[0], K->ne[1], K->ne[2], K->ne[3],
            mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
            Q->nb[1], Q->nb[2], Q->nb[3],
            K->nb[1], K->nb[2], K->nb[3],
            KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
        );
    CUDA_CHECK(cudaGetLastError());

    if (parallel_blocks == 1) {
        return;
    }

    const dim3 block_dim_combine(D, 1, 1);
    const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
    const int shmem_combine = 0;

    flash_attn_combine_results<D, parallel_blocks>
        <<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
        (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
    CUDA_CHECK(cudaGetLastError());
}
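
Editor's aside: the m0/m1 values computed above feed the per-head ALiBi slope used by the kernels. A small C++ sketch of that mapping, mirroring the base/exph logic in the source (illustrative only, names are made up):

#include <math.h>
#include <stdio.h>

float alibi_slope(int h, int n_head, float max_bias) {
    const int n_head_log2 = 1 << (int) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
    return powf(base, exph); // this factor multiplies the mask values for head h
}

int main() {
    for (int h = 0; h < 8; ++h) {
        printf("head %d: slope %f\n", h, alibi_slope(h, 8, 8.0f));
    }
    return 0;
}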

void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * Q = dst->src[0];
    const ggml_tensor * K = dst->src[1];
    const ggml_tensor * V = dst->src[2];

    const ggml_tensor * mask = dst->src[3];

    ggml_tensor * KQV = dst;

    GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");

    if (Q->ne[1] == 1) {
        constexpr int cols_per_block = 1;
        constexpr int parallel_blocks = 4;
        switch (Q->ne[0]) {
            case 64:
                launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 128:
                launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
        return;
    }

    if (Q->ne[1] == 2) {
        constexpr int cols_per_block = 2;
        constexpr int parallel_blocks = 4;
        switch (Q->ne[0]) {
            case 64:
                launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 128:
                launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
        return;
    }

    if (Q->ne[1] <= 4) {
        constexpr int cols_per_block = 4;
        constexpr int parallel_blocks = 4;
        switch (Q->ne[0]) {
            case 64:
                launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 128:
                launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
        return;
    }

    if (Q->ne[1] <= 8) {
        constexpr int cols_per_block = 8;
        constexpr int parallel_blocks = 4;
        switch (Q->ne[0]) {
            case 64:
                launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 128:
                launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
        return;
    }

    constexpr int cols_per_block = 8;
    constexpr int parallel_blocks = 1;
    switch (Q->ne[0]) {
        case 64:
            launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
            break;
        case 128:
            launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
            break;
        default:
            GGML_ASSERT(false);
            break;
    }
}
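
Editor's aside: both this launcher and the soft_max changes below read float parameters back out of a tensor's int32 op_params array with memcpy. A self-contained C++ sketch of that packing convention, using a stand-in array instead of a ggml_tensor (illustrative only):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

int main() {
    int32_t op_params[4] = {0};

    const float scale = 0.125f, max_bias = 8.0f;
    memcpy((float *) op_params + 0, &scale,    sizeof(float)); // slot 0: scale
    memcpy((float *) op_params + 1, &max_bias, sizeof(float)); // slot 1: max_bias

    float scale_out, max_bias_out;
    memcpy(&scale_out,    (float *) op_params + 0, sizeof(float));
    memcpy(&max_bias_out, (float *) op_params + 1, sizeof(float));
    printf("scale=%f max_bias=%f\n", scale_out, max_bias_out);
    return 0;
}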
3
ggml-cuda/fattn-vec-f32.cuh
Normal file

@ -0,0 +1,3 @@
#include "common.cuh"

void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@ -1,4 +1,7 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-vec-f16.cuh"
#include "fattn-vec-f32.cuh"
#include "fattn.cuh"

#include <cstdint>

@ -7,235 +10,6 @@
#include <mma.h>
#endif

#define FATTN_KQ_STRIDE 256
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.

template<int D, int ncols, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        float * __restrict__ dst,
        float2 * __restrict__ dst_meta,
        const float scale,
        const int ne00,
        const int ne01,
        const int ne02,
        const int ne03,
        const int ne10,
        const int ne11,
        const int ne12,
        const int ne13,
        const int ne31,
        const int nb31,
        const int nb01,
        const int nb02,
        const int nb03,
        const int nb11,
        const int nb12,
        const int nb13,
        const int ne0,
        const int ne1,
        const int ne2,
        const int ne3) {
#if FP16_AVAILABLE
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
    const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
    const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
    const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
    const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
    const half * maskh = (const half *) mask + ne11*ic0;

    const int stride_KV = nb11 / sizeof(half);
    const int stride_KV2 = nb11 / sizeof(half2);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
    constexpr int nwarps = D / WARP_SIZE;
    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
    __builtin_assume(tid < D);

    __shared__ half KQ[ncols*D];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        KQ[j*D + tid] = -HALF_MAX_HALF;
    }
    half2 * KQ2 = (half2 *) KQ;

    half kqmax[ncols];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqmax[j] = -HALF_MAX_HALF;
    }
    half kqsum[ncols] = {0.0f};

    __shared__ half kqmax_shared[ncols][WARP_SIZE];
    __shared__ half kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        if (threadIdx.y == 0) {
            kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
            kqsum_shared[j][threadIdx.x] = 0.0f;
        }
    }
    __syncthreads();

    // Convert Q to half2 and store in registers:
    half2 Q_h2[ncols][D/(2*WARP_SIZE)];
#pragma unroll
    for (int j = 0; j < ncols; ++j) {
#pragma unroll
        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
            const int i = i0 + threadIdx.x;

            const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
            Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
        }
    }

    half2 VKQ[ncols] = {{0.0f, 0.0f}};

    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
        half kqmax_new = kqmax[0];
        half kqmax_new_arr[ncols];
#pragma unroll
        for (int j = 0; j < ncols; ++j) {
            kqmax_new_arr[j] = kqmax[j];
        }

#pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
            const int i_KQ = i_KQ_0 + threadIdx.y;

            if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
                break;
            }

            half2 sum2[ncols] = {{0.0f, 0.0f}};
#pragma unroll
            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
                const int k_KQ = k_KQ_0 + threadIdx.x;

                const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
#pragma unroll
                for (int j = 0; j < ncols; ++j) {
                    sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
                }
            }

#pragma unroll
            for (int j = 0; j < ncols; ++j) {
                sum2[j] = warp_reduce_sum(sum2[j]);
                half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
                sum += mask ? maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);

                if (ncols == 1) {
                    kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
                } else {
                    kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
                }

                if (threadIdx.x == 0) {
                    KQ[j*D + i_KQ] = sum;
                }
            }
        }

#pragma unroll
        for (int j = 0; j < ncols; ++j) {
            half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];

            kqmax_new_j = warp_reduce_max(kqmax_new_j);
            if (threadIdx.x == 0) {
                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
            }
        }

        __syncthreads();

#pragma unroll
        for (int j = 0; j < ncols; ++j) {
            half kqmax_new_j = kqmax_shared[j][threadIdx.x];
            kqmax_new_j = warp_reduce_max(kqmax_new_j);

            const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
            kqmax[j] = kqmax_new_j;

            const half val = hexp(KQ[j*D + tid] - kqmax[j]);
            kqsum[j] = kqsum[j]*KQ_max_scale + val;
            KQ[j*D + tid] = val;

            VKQ[j] *= __half2half2(KQ_max_scale);
        }

        __syncthreads();

#pragma unroll
        for (int k0 = 0; k0 < D; k0 += 2) {
            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
                break;
            }

            half2 V_k;
            reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
            reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
#pragma unroll
            for (int j = 0; j < ncols; ++j) {
                VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
            }
        }

        __syncthreads();
    }

#pragma unroll
    for (int j = 0; j < ncols; ++j) {
        kqsum[j] = warp_reduce_sum(kqsum[j]);
        if (threadIdx.x == 0) {
            kqsum_shared[j][threadIdx.y] = kqsum[j];
        }
    }

    __syncthreads();

#pragma unroll
    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);

        half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
        if (parallel_blocks == 1) {
            dst_val /= kqsum[j_VKQ];
        }
        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
    }

    if (parallel_blocks != 1 && tid != 0) {
#pragma unroll
        for (int j = 0; j < ncols; ++j) {
            dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
        }
    }
#else
    NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}

// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))

@ -249,6 +23,10 @@ static __global__ void flash_attn_ext_f16(
        float * __restrict__ dst,
        float2 * __restrict__ dst_meta,
        const float scale,
        const float max_bias,
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const int ne00,
        const int ne01,
        const int ne02,

@ -305,6 +83,20 @@ static __global__ void flash_attn_ext_f16(
    const int stride_Q = nb01 / sizeof(float);
    const int stride_KV = nb11 / sizeof(half);

    half slopeh = __float2half(1.0f);
    half2 slope2 = make_half2(1.0f, 1.0f);

    // ALiBi
    if (max_bias > 0.0f) {
        const int h = blockIdx.y;

        const float base = h < n_head_log2 ? m0 : m1;
        const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

        slopeh = __float2half(powf(base, exph));
        slope2 = make_half2(slopeh, slopeh);
    }

    frag_b Q_b[D/16][ncols/frag_n];

    // A single buffer for temporarily holding tiles of KQ and VKQ parts:

@ -421,7 +213,7 @@ static __global__ void flash_attn_ext_f16(
            for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
                const int k = k0 + threadIdx.x;

                KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
                KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
                KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/WARP_SIZE]);
            }
            KQ_max_new = warp_reduce_max(KQ_max_new);

@ -464,7 +256,7 @@ static __global__ void flash_attn_ext_f16(
            for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
                const int k = k0 + threadIdx.x;

                KQ2_tmp[k0/WARP_SIZE] += mask ? mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
                KQ2_tmp[k0/WARP_SIZE] += mask ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
                KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]);
            }
            KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));

@ -621,54 +413,6 @@ static __global__ void flash_attn_ext_f16(
#endif // FP16_MMA_AVAILABLE
}

template<int D, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_combine_results(
        const float * __restrict__ VKQ_parts,
        const float2 * __restrict__ VKQ_meta,
        float * __restrict__ dst) {
#if FP16_AVAILABLE
    VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
    VKQ_meta  += parallel_blocks   * gridDim.y*blockIdx.x;
    dst       += D                 * gridDim.y*blockIdx.x;

    const int tid = threadIdx.x;
    __builtin_assume(tid < D);

    __shared__ float2 meta[parallel_blocks];
    if (tid < 2*parallel_blocks) {
        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
    }

    __syncthreads();

    float kqmax = meta[0].x;
#pragma unroll
    for (int l = 1; l < parallel_blocks; ++l) {
        kqmax = max(kqmax, meta[l].x);
    }

    float VKQ_numerator   = 0.0f;
    float VKQ_denominator = 0.0f;
#pragma unroll
    for (int l = 0; l < parallel_blocks; ++l) {
        const float diff = meta[l].x - kqmax;
        const float KQ_max_scale = expf(diff);
        const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
        *((uint32_t *) &KQ_max_scale) &= ftz_mask;

        VKQ_numerator   += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
        VKQ_denominator += KQ_max_scale * meta[l].y;
    }

    dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
#else
    NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
}
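
Editor's aside: the combine kernel removed above (it now lives in the shared flash-attention code) merges the per-block partial results written when parallel_blocks > 1. A host-side C++ sketch of that merge for one output element, illustrative only:

#include <math.h>
#include <stdio.h>

int main() {
    const int parallel_blocks = 2;
    const float part_numer[parallel_blocks] = {3.0f, 5.0f}; // partial VKQ numerators
    const float part_max  [parallel_blocks] = {1.0f, 2.5f}; // per-block running max (meta.x)
    const float part_sum  [parallel_blocks] = {1.7f, 2.2f}; // per-block softmax denominator (meta.y)

    float kqmax = part_max[0];
    for (int l = 1; l < parallel_blocks; ++l) kqmax = fmaxf(kqmax, part_max[l]);

    float numerator = 0.0f, denominator = 0.0f;
    for (int l = 0; l < parallel_blocks; ++l) {
        const float scale = expf(part_max[l] - kqmax); // rescale each part to the global max
        numerator   += scale*part_numer[l];
        denominator += scale*part_sum[l];
    }
    printf("combined output: %f\n", numerator / denominator);
    return 0;
}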

constexpr int get_max_power_of_2(int x) {
    return x % 2 == 0 ? 2*get_max_power_of_2(x/2) : 1;
}

@ -693,57 +437,6 @@ static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed.");
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");

template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
        const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
        ggml_cuda_pool & pool, cudaStream_t main_stream
) {
    ggml_cuda_pool_alloc<float> dst_tmp(pool);
    ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);

    if (parallel_blocks > 1) {
        dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
        dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
    }

    constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
    const dim3 block_dim(WARP_SIZE, nwarps, 1);
    const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
    const int shmem = 0;

    float scale;
    memcpy(&scale, KQV->op_params, sizeof(float));

    flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
        <<<blocks_num, block_dim, shmem, main_stream>>> (
            (const char *) Q->data,
            (const char *) K->data,
            (const char *) V->data,
            mask ? ((const char *) mask->data) : nullptr,
            parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
            scale,
            Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
            K->ne[0], K->ne[1], K->ne[2], K->ne[3],
            mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
            Q->nb[1], Q->nb[2], Q->nb[3],
            K->nb[1], K->nb[2], K->nb[3],
            KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
        );
    CUDA_CHECK(cudaGetLastError());

    if (parallel_blocks == 1) {
        return;
    }

    const dim3 block_dim_combine(D, 1, 1);
    const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
    const int shmem_combine = 0;

    flash_attn_combine_results<D, parallel_blocks>
        <<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
        (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
    CUDA_CHECK(cudaGetLastError());
}

template <int D, int cols_per_block, int nwarps, int parallel_blocks, typename KQ_acc_t> void launch_fattn_f16_impl(
        const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
        ggml_cuda_pool & pool, cudaStream_t main_stream

@ -761,8 +454,17 @@ template <int D, int cols_per_block, int nwarps, int parallel_blocks, typename K
    const dim3 blocks_num(parallel_blocks*(Q->ne[1] + cols_per_block - 1) / cols_per_block, Q->ne[2], Q->ne[3]);
    const int shmem = 0;

    float scale;
    float scale = 1.0f;
    memcpy(&scale, KQV->op_params, sizeof(float));
    float max_bias = 0.0f;

    memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
    memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));

    const uint32_t n_head = Q->ne[2];
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));

    const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>
        <<<blocks_num, block_dim, shmem, main_stream>>> (

@ -771,7 +473,7 @@ template <int D, int cols_per_block, int nwarps, int parallel_blocks, typename K
            (const char *) V->data,
            mask ? ((const char *) mask->data) : nullptr,
            (parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
            scale,
            scale, max_bias, m0, m1, n_head_log2,
            Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
            K->ne[0], K->ne[1], K->ne[2], K->ne[3],
            mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,

@ -837,97 +539,24 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;

    const int32_t precision = KQV->op_params[1];
    const int32_t precision = KQV->op_params[2];

    if (!fast_fp16_available(cc)) {
        ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
        return;
    }

    if (!fp16_mma_available(cc)) {
        GGML_ASSERT(precision == GGML_PREC_DEFAULT);
        ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst);
        GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");

        if (Q->ne[1] == 1) {
            constexpr int cols_per_block = 1;
            constexpr int parallel_blocks = 4;
            switch (Q->ne[0]) {
                case 64:
                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                case 128:
                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                default:
                    GGML_ASSERT(false);
                    break;
            }
            return;
        }

        if (Q->ne[1] == 2) {
            constexpr int cols_per_block = 2;
            constexpr int parallel_blocks = 4;
            switch (Q->ne[0]) {
                case 64:
                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                case 128:
                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                default:
                    GGML_ASSERT(false);
                    break;
            }
            return;
        }

        if (Q->ne[1] <= 4) {
            constexpr int cols_per_block = 4;
            constexpr int parallel_blocks = 4;
            switch (Q->ne[0]) {
                case 64:
                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                case 128:
                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                default:
                    GGML_ASSERT(false);
                    break;
            }
            return;
        }

        if (Q->ne[1] <= 8) {
            constexpr int cols_per_block = 8;
            constexpr int parallel_blocks = 4;
            switch (Q->ne[0]) {
                case 64:
                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                case 128:
                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                    break;
                default:
                    GGML_ASSERT(false);
                    break;
            }
            return;
        }

        constexpr int cols_per_block = 8;
        constexpr int parallel_blocks = 1;
        switch (Q->ne[0]) {
            case 64:
                launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 128:
                launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
        return;
    }

    if (precision != GGML_PREC_DEFAULT) {
        if (Q->ne[1] == 1 && (Q->ne[0] == 64 || Q->ne[0] == 128)) {
            ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
            return;
        }

        if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
            constexpr int cols_per_block = 16;
            constexpr int nwarps = 4;

@ -985,22 +614,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    }

    if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
        constexpr int cols_per_block = 1;
        ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
        constexpr int parallel_blocks = 4;
        switch (Q->ne[0]) {
            case 64:
                launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 128:
                launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 256:
                launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            default:
                GGML_ASSERT(false);
                break;
        }
        return;
    }

@ -11,7 +11,7 @@ __device__ float __forceinline__ t2f32<half>(half val) {
}

template <bool vals_smem, int ncols_template, int block_size_template, typename T>
static __global__ void soft_max_f32(const float * x, const T * mask, const T * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
static __global__ void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;

    const int tid = threadIdx.x;

@ -23,16 +23,16 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;

    float slope = 0.0f;
    float slope = 1.0f;

    // ALiBi
    if (max_bias > 0.0f) {
        const int h = rowx/nrows_y; // head index

        const float base = h < n_head_log2 ? m0 : m1;
        const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
        const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

        slope = powf(base, exp);
        slope = powf(base, exph);
    }

    extern __shared__ float data_soft_max_f32[];

@ -53,7 +53,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
        const int64_t ix = (int64_t)rowx*ncols + col;
        const int64_t iy = (int64_t)rowy*ncols + col;

        const float val = x[ix]*scale + (mask ? t2f32(mask[iy]) : 0.0f) + (pos ? slope*t2f32(pos[col]) : 0.0f);
        const float val = x[ix]*scale + (mask ? slope*t2f32(mask[iy]) : 0.0f);

        vals[col] = val;
        max_val = max(max_val, val);

@ -125,7 +125,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
}

template<typename T>
static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
    int nth = WARP_SIZE;
    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
    const dim3 block_dims(nth, 1, 1);

@ -133,8 +133,8 @@ static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, fl
    const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");

    const uint32_t n_head_kv = nrows_x/nrows_y;
    const uint32_t n_head = nrows_x/nrows_y;
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));

    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

@ -142,43 +142,42 @@ static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, fl
    if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
        switch (ncols_x) {
            case 32:
                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 64:
                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 128:
                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 256:
                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 512:
                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 1024:
                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 2048:
                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 4096:
                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            default:
                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
        }
    } else {
        const size_t shmem_low = WARP_SIZE*sizeof(float);
        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
    }
}

void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const ggml_tensor * src2 = dst->src[2];

    const float * src0_d = (const float *)src0->data;
    const void  * src1_d = src1 ? (const void *)src1->data : nullptr;

@ -190,7 +189,6 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
    GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F16 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional

    const int64_t ne00 = src0->ne[0];
    const int64_t nrows_x = ggml_nrows(src0);

@ -202,26 +200,15 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));

    // positions tensor
    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
    void * src2_d = nullptr;

    const bool use_src2 = src2 != nullptr;

    if (use_src2) {
        src2_d = (void *)src2->data;
    }

    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);

    if (use_f16) {
        const half * src1_dd = (const half *)src1_d;
        const half * src2_dd = (const half *)src2_d;

        soft_max_f32_cuda(src0_d, src1_dd, src2_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
        soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
    } else {
        const float * src1_dd = (const float *)src1_d;
        const float * src2_dd = (const float *)src2_d;

        soft_max_f32_cuda(src0_d, src1_dd, src2_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
        soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
    }
}
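
Editor's aside: after this change the fused soft_max no longer takes a separate positions tensor; the ALiBi bias is folded into the mask via the per-head slope, so each row computes softmax(x*scale + slope*mask). A plain C++ reference sketch of that semantics (illustrative only, not the CUDA kernel):

#include <math.h>
#include <stdio.h>
#include <vector>

std::vector<float> soft_max_row(const std::vector<float> & x, const std::vector<float> & mask,
                                float scale, float slope) {
    std::vector<float> out(x.size());
    float max_val = -INFINITY;
    for (size_t i = 0; i < x.size(); ++i) {
        out[i] = x[i]*scale + slope*mask[i];
        max_val = fmaxf(max_val, out[i]);
    }
    float sum = 0.0f;
    for (float & v : out) { v = expf(v - max_val); sum += v; }
    for (float & v : out) { v /= sum; }
    return out;
}

int main() {
    const std::vector<float> x = {1.0f, 2.0f, 3.0f}, mask = {0.0f, 0.0f, -INFINITY};
    for (float p : soft_max_row(x, mask, 0.5f, 1.0f)) printf("%f ", p);
    printf("\n"); // the masked position gets probability 0
    return 0;
}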

@ -48,6 +48,15 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
    dst[i] = fmaxf(x[i], 0);
}

static __global__ void sigmoid_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= k) {
        return;
    }
    dst[i] = 1.0f / (1.0f + expf(-x[i]));
}

static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

@ -108,6 +117,11 @@ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void sigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SIGMOID_BLOCK_SIZE - 1) / CUDA_SIGMOID_BLOCK_SIZE;
    sigmoid_f32<<<num_blocks, CUDA_SIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}

static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
    hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);

@ -188,6 +202,18 @@ void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}

void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    sigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
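
Editor's aside: a quick host-side check of the new sigmoid op and the usual ceil-division grid sizing its launcher uses — one thread per element, last block partially filled (illustrative only, constants are made up):

#include <math.h>
#include <stdio.h>

int main() {
    const int k = 1000, block = 256;
    const int num_blocks = (k + block - 1) / block;            // 4 blocks for 1000 elements
    printf("num_blocks = %d\n", num_blocks);

    const float x = -2.0f;
    printf("sigmoid(%f) = %f\n", x, 1.0f / (1.0f + expf(-x))); // same formula as sigmoid_f32
    return 0;
}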

void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;

@ -4,6 +4,7 @@
#define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_TANH_BLOCK_SIZE 256
#define CUDA_RELU_BLOCK_SIZE 256
#define CUDA_SIGMOID_BLOCK_SIZE 256
#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
#define CUDA_HARDSWISH_BLOCK_SIZE 256
#define CUDA_SQR_BLOCK_SIZE 256

@ -18,6 +19,8 @@ void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@ -1,35 +1,36 @@
#include "upscale.cuh"

static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
static __global__ void upscale_f32(const float * x, float * dst,
    // blockIdx.z: idx of ne02*ne03
        const int nb00, const int nb01, const int nb02, const int nb03,
    // blockIdx.y: idx of ne01*scale_factor, aka ne1
        const int ne10, const int ne11, const int ne12, const int ne13,
    // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
        const float sf0, const float sf1, const float sf2, const float sf3) {
    // ne00xne01: ne00 * ne01
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int ne0 = ne00 * scale_factor;
    if (index >= ne10 * ne11 * ne12 * ne13) {
    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
    if (nidx >= ne0) {
        return;
    }
    // operation
    int i00 = nidx / scale_factor;
    int i10 = index % ne10;
    int i01 = blockIdx.y / scale_factor;
    int i11 = (index / ne10) % ne11;
    int offset_src =
    int i12 = (index / (ne10 * ne11)) % ne12;
        i00 +
    int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
        i01 * ne00 +
        blockIdx.z * ne00xne01;
    int i00 = i10 / sf0;
    int offset_dst =
    int i01 = i11 / sf1;
        nidx +
    int i02 = i12 / sf2;
        blockIdx.y * ne0 +
    int i03 = i13 / sf3;
        blockIdx.z * ne0 * gridDim.y;
    dst[offset_dst] = x[offset_src];
    dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
}

static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
static void upscale_f32_cuda(const float * x, float * dst,
        const int scale_factor, cudaStream_t stream) {
        const int nb00, const int nb01, const int nb02, const int nb03,
    int ne0 = (ne00 * scale_factor);
        const int ne10, const int ne11, const int ne12, const int ne13,
    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
        const float sf0, const float sf1, const float sf2, const float sf3,
    dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
        cudaStream_t stream) {
    upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
    int dst_size = ne10 * ne11 * ne12 * ne13;
    int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;

    upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
}

void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

@ -39,10 +40,12 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors

    const int scale_factor = dst->op_params[0];
    const float sf0 = (float)dst->ne[0]/src0->ne[0];
    const float sf1 = (float)dst->ne[1]/src0->ne[1];
    const float sf2 = (float)dst->ne[2]/src0->ne[2];
    const float sf3 = (float)dst->ne[3]/src0->ne[3];

    upscale_f32_cuda(src0_d, dst_d, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, stream);
    upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
}
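
Editor's aside: the rewritten upscale kernel flattens the destination into one index, decomposes it into (i10, i11, i12, i13) and divides by the per-dimension scale factors to pick the nearest source element, which generalizes the old integer-scale-factor 3D path to arbitrary 4D scaling. A CPU sketch of that indexing on a contiguous 2x2 -> 4x4 example (illustrative only):

#include <stdio.h>
#include <vector>

int main() {
    const int ne0 = 2, ne1 = 2;                 // source 2x2
    const float sf0 = 2.0f, sf1 = 2.0f;         // upscale factors per dimension
    const int ne10 = ne0*sf0, ne11 = ne1*sf1;   // destination 4x4
    const std::vector<float> src = {1.0f, 2.0f, 3.0f, 4.0f};

    for (int index = 0; index < ne10*ne11; ++index) {
        const int i10 = index % ne10;
        const int i11 = (index / ne10) % ne11;
        const int i00 = i10 / sf0;              // nearest-neighbour source coordinates
        const int i01 = i11 / sf1;
        printf("%g%c", src[i01*ne0 + i00], i10 == ne10 - 1 ? '\n' : ' ');
    }
    return 0;
}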
|
||||||
|
|
|
@ -120,9 +120,16 @@ extern "C" {
|
||||||
#ifndef __F16C__
|
#ifndef __F16C__
|
||||||
#define __F16C__
|
#define __F16C__
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
||||||
|
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
||||||
#ifndef __SSE3__
|
#ifndef __SSE3__
|
||||||
#define __SSE3__
|
#define __SSE3__
|
||||||
#endif
|
#endif
|
||||||
|
#ifndef __SSSE3__
|
||||||
|
#define __SSSE3__
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// 16-bit float
|
// 16-bit float
|
||||||
|
|
|
@ -1559,12 +1559,18 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
|
||||||
case GGML_OP_SOFT_MAX:
|
case GGML_OP_SOFT_MAX:
|
||||||
{
|
{
|
||||||
float scale;
|
float scale;
|
||||||
memcpy(&scale, dst->op_params, sizeof(float));
|
float max_bias;
|
||||||
|
|
||||||
#pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support")
|
memcpy(&scale, (float *)dst->op_params + 0, sizeof(float));
|
||||||
|
memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float));
|
||||||
|
|
||||||
|
#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
|
||||||
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
|
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
|
||||||
GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
|
GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
|
||||||
GGML_ASSERT(src2 == nullptr);
|
|
||||||
|
#pragma message("TODO: add ALiBi support")
|
||||||
|
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
|
||||||
|
GGML_ASSERT(max_bias == 0.0f);
|
||||||
|
|
||||||
ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
|
ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
|
||||||
} break;
|
} break;
|
||||||
|
|
199
ggml-metal.m
199
ggml-metal.m
|
@@ -40,6 +40,7 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_CLAMP,
     GGML_METAL_KERNEL_TYPE_TANH,
     GGML_METAL_KERNEL_TYPE_RELU,
+    GGML_METAL_KERNEL_TYPE_SIGMOID,
     GGML_METAL_KERNEL_TYPE_GELU,
     GGML_METAL_KERNEL_TYPE_GELU_4,
     GGML_METAL_KERNEL_TYPE_GELU_QUICK,
@@ -169,7 +170,6 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,
     GGML_METAL_KERNEL_TYPE_ROPE_F32,
     GGML_METAL_KERNEL_TYPE_ROPE_F16,
-    GGML_METAL_KERNEL_TYPE_ALIBI_F32,
     GGML_METAL_KERNEL_TYPE_IM2COL_F16,
     GGML_METAL_KERNEL_TYPE_IM2COL_F32,
     GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
@@ -494,6 +494,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true);
+       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIGMOID, sigmoid, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
@@ -623,7 +624,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
-       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
@@ -633,14 +633,14 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
-       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, true);
-       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, true);
-       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, true);
-       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, true);
-       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, true);
-       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, true);
-       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, true);
-       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, true);
+       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, ctx->support_simdgroup_mm);
+       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, ctx->support_simdgroup_mm);
+       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, ctx->support_simdgroup_mm);
+       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, ctx->support_simdgroup_mm);
+       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, ctx->support_simdgroup_mm);
+       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm);
+       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, ctx->support_simdgroup_reduction);
+       GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);

@@ -732,6 +732,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_RELU:
+               case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
@@ -759,7 +760,6 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
        case GGML_OP_GROUP_NORM:
            return ctx->support_simdgroup_reduction;
        case GGML_OP_NORM:
-       case GGML_OP_ALIBI:
        case GGML_OP_ROPE:
        case GGML_OP_IM2COL:
            return true;
@@ -772,8 +772,9 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_ARGSORT:
        case GGML_OP_LEAKY_RELU:
-       case GGML_OP_FLASH_ATTN_EXT:
            return true;
+       case GGML_OP_FLASH_ATTN_EXT:
+           return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
            return ctx->support_simdgroup_reduction &&

@@ -1239,6 +1240,18 @@ static enum ggml_status ggml_metal_graph_compute(

                        const int64_t n = ggml_nelements(dst);

+                       [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                   } break;
+               case GGML_UNARY_OP_SIGMOID:
+                   {
+                       id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIGMOID].pipeline;
+
+                       [encoder setComputePipelineState:pipeline];
+                       [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                       [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+                       const int64_t n = ggml_nelements(dst);
+
                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_UNARY_OP_GELU:

@@ -1357,16 +1370,15 @@ static enum ggml_status ggml_metal_graph_compute(
            case GGML_OP_SOFT_MAX:
                {
                    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
-                   GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F16 || src2->type == GGML_TYPE_F32);

                    int nth = 32; // SIMD width

                    id<MTLComputePipelineState> pipeline = nil;

-                   const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
+                   const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);

                    if (ne00%4 == 0) {
-                       while (nth < ne00/4 && nth < 256) {
+                       while (nth < ne00/4 && nth*ne01*ne02*ne03 < 256) {
                            nth *= 2;
                        }
                        if (use_f16) {
@@ -1375,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline;
                        }
                    } else {
-                       while (nth < ne00 && nth < 1024) {
+                       while (nth < ne00 && nth*ne01*ne02*ne03 < 256) {
                            nth *= 2;
                        }
                        if (use_f16) {
@@ -1394,8 +1406,8 @@ static enum ggml_status ggml_metal_graph_compute(
                    const int64_t nrows_x = ggml_nrows(src0);
                    const int64_t nrows_y = src0->ne[1];

-                   const uint32_t n_head_kv = nrows_x/nrows_y;
-                   const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
+                   const uint32_t n_head = nrows_x/nrows_y;
+                   const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));

                    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
                    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

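The m0 and m1 bases computed above are the standard ALiBi construction: heads below the largest power of two not exceeding n_head take slopes m0^(h+1), the remaining heads take m1^(2(h-n_head_log2)+1). A small worked sketch in plain C++ (independent of the Metal encoder code; the helper name is illustrative):

    #include <cmath>
    #include <cstdint>

    // Per-head ALiBi slope, matching the m0/m1/n_head_log2 scheme in the diff above.
    static float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));

        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

        return h < n_head_log2 ? powf(m0, h + 1)
                               : powf(m1, 2*(h - n_head_log2) + 1);
    }

    // e.g. n_head = 8, max_bias = 8: head 0 -> 2^-1, head 1 -> 2^-2, ..., head 7 -> 2^-8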
@@ -1407,20 +1419,15 @@ static enum ggml_status ggml_metal_graph_compute(
                    } else {
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
                    }
-                   if (id_src2) {
-                       [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
-                   } else {
-                       [encoder setBuffer:id_src0 offset:offs_src0 atIndex:2];
-                   }
-                   [encoder setBuffer:id_dst offset:offs_dst atIndex:3];
-                   [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:4];
-                   [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:5];
-                   [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6];
-                   [encoder setBytes:&scale length:sizeof(scale) atIndex:7];
-                   [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:8];
-                   [encoder setBytes:&m0 length:sizeof(m0) atIndex:9];
-                   [encoder setBytes:&m1 length:sizeof(m1) atIndex:10];
-                   [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:11];
+                   [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+                   [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                   [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                   [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                   [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+                   [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:7];
+                   [encoder setBytes:&m0 length:sizeof(m0) atIndex:8];
+                   [encoder setBytes:&m1 length:sizeof(m1) atIndex:9];
+                   [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10];
                    [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];

                    [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];

@@ -2225,49 +2232,6 @@ static enum ggml_status ggml_metal_graph_compute(

                    [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                } break;
-           case GGML_OP_ALIBI:
-               {
-                   GGML_ASSERT((src0t == GGML_TYPE_F32));
-
-                   const int nth = MIN(1024, ne00);
-
-                   //const int n_past = ((int32_t *) dst->op_params)[0];
-                   const int n_head = ((int32_t *) dst->op_params)[1];
-
-                   float max_bias;
-                   memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-                   const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-                   const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-                   const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-                   id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline;
-
-                   [encoder setComputePipelineState:pipeline];
-                   [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                   [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
-                   [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
-                   [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
-                   [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
-                   [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
-                   [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
-                   [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
-                   [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
-                   [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
-                   [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
-                   [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
-                   [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
-                   [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
-                   [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
-                   [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
-                   [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
-                   [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
-                   [encoder setBytes:&m0 length:sizeof( float) atIndex:18];
-                   [encoder setBytes:&m1 length:sizeof( float) atIndex:19];
-                   [encoder setBytes:&n_heads_log2_floor length:sizeof(int) atIndex:20];
-
-                   [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-               } break;
            case GGML_OP_ROPE:
                {
                    GGML_ASSERT(ne10 == ne02);

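The removed GGML_OP_ALIBI path biased each row by the column index times a per-head slope; after this change the same bias reaches soft_max as slope*mask, with max_bias carried in op_params. A scalar sketch of the updated formulation, under the assumption that the mask already encodes the positional term the standalone op used to derive from the column index (illustrative, not the Metal code):

    // Old path: x[i00] += i00 * m_k;            // m_k = per-head ALiBi slope
    // New path: the slope is folded into the softmax input instead:
    static float soft_max_input(float x, float mask, float scale, float slope) {
        return x*scale + slope*mask;             // matches the updated kernels below
    }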
@@ -2389,7 +2353,10 @@ static enum ggml_status ggml_metal_graph_compute(
                {
                    GGML_ASSERT(src0->type == GGML_TYPE_F32);

-                   const int sf = dst->op_params[0];
+                   const float sf0 = (float)ne0/src0->ne[0];
+                   const float sf1 = (float)ne1/src0->ne[1];
+                   const float sf2 = (float)ne2/src0->ne[2];
+                   const float sf3 = (float)ne3/src0->ne[3];

                    const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;

@@ -2412,7 +2379,10 @@ static enum ggml_status ggml_metal_graph_compute(
                    [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
                    [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
                    [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
-                   [encoder setBytes:&sf length:sizeof(sf) atIndex:18];
+                   [encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18];
+                   [encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19];
+                   [encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20];
+                   [encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21];

                    const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);

@@ -2549,12 +2519,13 @@ static enum ggml_status ggml_metal_graph_compute(
            case GGML_OP_FLASH_ATTN_EXT:
                {
                    GGML_ASSERT(ne00 % 4 == 0);
+                   GGML_ASSERT(ne11 % 32 == 0);

                    GGML_ASSERT(src0->type == GGML_TYPE_F32);

-                   struct ggml_tensor * src3 = gf->nodes[i]->src[3];
+                   GGML_ASSERT(ggml_are_same_shape (src1, src2));

-                   GGML_ASSERT(ggml_are_same_shape(src1, src2));
-                   GGML_ASSERT(src3);
+                   struct ggml_tensor * src3 = gf->nodes[i]->src[3];

                    size_t offs_src3 = 0;

@@ -2564,8 +2535,13 @@ static enum ggml_status ggml_metal_graph_compute(
                    GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
                            "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");

+                   const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
+                   const uint64_t nb21 = src2 ? src2->nb[1] : 0;
+                   const uint64_t nb22 = src2 ? src2->nb[2] : 0;
+                   const uint64_t nb23 = src2 ? src2->nb[3] : 0;
+
                    const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
-                   const int64_t ne31 = src3 ? src3->ne[1] : 0;
+                   //const int64_t ne31 = src3 ? src3->ne[1] : 0;
                    const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
                    const int64_t ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33);

@@ -2577,7 +2553,16 @@ static enum ggml_status ggml_metal_graph_compute(
                    const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);

                    float scale;
-                   memcpy(&scale, dst->op_params, sizeof(float));
+                   float max_bias;

+                   memcpy(&scale,    ((int32_t *) dst->op_params) + 0, sizeof(scale));
+                   memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
+
+                   const uint32_t n_head      = src0->ne[2];
+                   const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+                   const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                   const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
                    id<MTLComputePipelineState> pipeline = nil;

@@ -2617,31 +2602,35 @@ static enum ggml_status ggml_metal_graph_compute(
                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
+                   if (id_src3) {
                        [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
+                   } else {
+                       [encoder setBuffer:id_src0 offset:offs_src0 atIndex:3];
+                   }
                    [encoder setBuffer:id_dst offset:offs_dst atIndex:4];
-                   [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:5];
-                   [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:6];
-                   [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:7];
-                   [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:8];
-                   [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:9];
-                   [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:10];
-                   [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:11];
-                   [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:12];
-                   [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:13];
-                   [encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:14];
-                   [encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:15];
-                   [encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:16];
-                   [encoder setBytes:&nb10 length:sizeof(uint64_t) atIndex:17];
-                   [encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18];
-                   [encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19];
-                   [encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20];
-                   [encoder setBytes:&ne31 length:sizeof( int64_t) atIndex:21];
-                   [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:22];
-                   [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:23];
-                   [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:24];
-                   [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:25];
-                   [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26];
-                   [encoder setBytes:&scale length:sizeof( float) atIndex:27];
+                   [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:5];
+                   [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:6];
+                   [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:7];
+                   [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:8];
+                   [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:9];
+                   [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:10];
+                   [encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:11];
+                   [encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:12];
+                   [encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:13];
+                   [encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:14];
+                   [encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:15];
+                   [encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:16];
+                   [encoder setBytes:&nb21 length:sizeof(uint64_t) atIndex:17];
+                   [encoder setBytes:&nb22 length:sizeof(uint64_t) atIndex:18];
+                   [encoder setBytes:&nb23 length:sizeof(uint64_t) atIndex:19];
+                   [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:20];
+                   [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:21];
+                   [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:22];
+                   [encoder setBytes:&scale length:sizeof( float) atIndex:23];
+                   [encoder setBytes:&max_bias length:sizeof( float) atIndex:24];
+                   [encoder setBytes:&m0 length:sizeof(m0) atIndex:25];
+                   [encoder setBytes:&m1 length:sizeof(m1) atIndex:26];
+                   [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];

                    if (!use_vec_kernel) {
                        // half8x8 kernel

189
ggml-metal.metal
@@ -229,6 +229,13 @@ kernel void kernel_relu(
    dst[tpig] = max(0.0f, src0[tpig]);
}

+kernel void kernel_sigmoid(
+        device const float * src0,
+        device       float * dst,
+        uint tpig[[thread_position_in_grid]]) {
+    dst[tpig] = 1.0f / (1.0f + exp(-src0[tpig]));
+}
+
kernel void kernel_tanh(
        device const float * src0,
        device       float * dst,

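The new kernel_sigmoid above is a plain element-wise op, launched with one thread per element. A host-side C++ reference of the same math, for comparison (sketch only, not part of the library):

    #include <cmath>
    #include <cstdint>

    // Element-wise logistic sigmoid, the same expression as kernel_sigmoid above.
    static void sigmoid_f32_ref(const float * src, float * dst, int64_t n) {
        for (int64_t i = 0; i < n; ++i) {
            dst[i] = 1.0f / (1.0f + expf(-src[i]));
        }
    }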
@@ -356,7 +363,6 @@ template<typename T>
kernel void kernel_soft_max(
        device const char * src0,
        device const char * src1,
-       device const char * src2,
        device       char * dst,
        constant  int64_t & ne00,
        constant  int64_t & ne01,
@@ -378,10 +384,9 @@ kernel void kernel_soft_max(

    device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
    device const T     * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00 : nullptr;
-   device const T     * ppos  = src2 != src0 ? (device const T *) src2 : nullptr;
    device       float * pdst  = (device float *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);

-   float slope = 0.0f;
+   float slope = 1.0f;

    // ALiBi
    if (max_bias > 0.0f) {
@@ -397,7 +402,7 @@ kernel void kernel_soft_max(
    float lmax = -INFINITY;

    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-       lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
+       lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
    }

    // find the max value in the block
@@ -422,7 +427,7 @@ kernel void kernel_soft_max(
    // parallel sum
    float lsum = 0.0f;
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-       const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
+       const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val);
        lsum += exp_psrc0;
        pdst[i00] = exp_psrc0;
    }
@@ -461,7 +466,6 @@ template<typename T>
kernel void kernel_soft_max_4(
        device const char * src0,
        device const char * src1,
-       device const char * src2,
        device       char * dst,
        constant  int64_t & ne00,
        constant  int64_t & ne01,
@@ -483,10 +487,9 @@ kernel void kernel_soft_max_4(

    device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
    device const T      * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00/4 : nullptr;
-   device const T      * ppos  = src2 != src0 ? (device const T *) src2 : nullptr;
    device       float4 * pdst4 = (device float4 *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;

-   float slope = 0.0f;
+   float slope = 1.0f;

    if (max_bias > 0.0f) {
        const int64_t h = i02;
@@ -501,7 +504,7 @@ kernel void kernel_soft_max_4(
    float4 lmax4 = -INFINITY;

    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-       lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)));
+       lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f)));
    }

    const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@@ -527,7 +530,7 @@ kernel void kernel_soft_max_4(
    // parallel sum
    float4 lsum4 = 0.0f;
    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-       const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f))) - max_val);
+       const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val);
        lsum4 += exp_psrc4;
        pdst4[i00] = exp_psrc4;
    }

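With the separate pos tensor gone, the kernels above reduce over x*scale + slope*mask in a single pass. A scalar C++ reference of the updated row softmax, with the threadgroup reductions omitted (sketch only):

    #include <cmath>
    #include <algorithm>

    // Row-wise softmax with an optional additive mask scaled by the ALiBi slope,
    // mirroring kernel_soft_max after this change.
    static void soft_max_row_ref(const float * x, const float * mask, float * dst,
                                 int n, float scale, float slope) {
        float max_val = -INFINITY;
        for (int i = 0; i < n; ++i) {
            max_val = std::max(max_val, x[i]*scale + (mask ? slope*mask[i] : 0.0f));
        }

        float sum = 0.0f;
        for (int i = 0; i < n; ++i) {
            const float e = expf((x[i]*scale + (mask ? slope*mask[i] : 0.0f)) - max_val);
            dst[i] = e;
            sum   += e;
        }

        for (int i = 0; i < n; ++i) {
            dst[i] /= sum;
        }
    }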
@@ -1595,60 +1598,6 @@ kernel void kernel_mul_mv_f16_f32_l4(
        }
    }
}

-kernel void kernel_alibi_f32(
-        device const float * src0,
-        device       float * dst,
-        constant  int64_t & ne00,
-        constant  int64_t & ne01,
-        constant  int64_t & ne02,
-        constant  int64_t & ne03,
-        constant uint64_t & nb00,
-        constant uint64_t & nb01,
-        constant uint64_t & nb02,
-        constant uint64_t & nb03,
-        constant  int64_t & ne0,
-        constant  int64_t & ne1,
-        constant  int64_t & ne2,
-        constant  int64_t & ne3,
-        constant uint64_t & nb0,
-        constant uint64_t & nb1,
-        constant uint64_t & nb2,
-        constant uint64_t & nb3,
-        constant     float & m0,
-        constant     float & m1,
-        constant       int & n_heads_log2_floor,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3 ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
-
-    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    const int64_t i3 = n / (ne2*ne1*ne0);
-    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
-    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
-    //const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
-
-    const int64_t k = i3*ne3 + i2;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = pow(m0, k + 1);
-    } else {
-        m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    device       char * dst_row = (device char *) dst + i3*nb3 + i2*nb2 + i1*nb1;
-    device const char * src_row = (device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01;
-    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
-        const float   src_v = *(device float *)(src_row + i00*nb00);
-        device float * dst_v =  (device float *)(dst_row + i00*nb0);
-        *dst_v = i00 * m_k + src_v;
-    }
-}
-
static float rope_yarn_ramp(const float low, const float high, const int i0) {
    const float y = (i0 / 2 - low) / max(0.001f, high - low);
    return 1.0f - min(1.0f, max(0.0f, y));

@@ -1903,7 +1852,10 @@ kernel void kernel_upscale_f32(
        constant uint64_t & nb1,
        constant uint64_t & nb2,
        constant uint64_t & nb3,
-       constant  int32_t & sf,
+       constant    float & sf0,
+       constant    float & sf1,
+       constant    float & sf2,
+       constant    float & sf3,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3 ntg[[threads_per_threadgroup]]) {
@@ -1912,15 +1864,17 @@ kernel void kernel_upscale_f32(
    const int64_t i2 = tgpig.y;
    const int64_t i1 = tgpig.x;

-   const int64_t i03 = i3;
-   const int64_t i02 = i2;
-   const int64_t i01 = i1/sf;
+   const int64_t i03 = i3/sf3;
+   const int64_t i02 = i2/sf2;
+   const int64_t i01 = i1/sf1;

-   device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
-   device       float * dst_ptr  = (device       float *) (dst  + i3*nb3  + i2*nb2  + i1*nb1);

    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
-       dst_ptr[i0] = src0_ptr[i0/sf];
+       const int64_t i00 = i0/sf0;
+
+       device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+       device       float * dst_ptr  = (device       float *) (dst  + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+       dst_ptr[0] = src0_ptr[0];
    }
}

@@ -2100,29 +2054,29 @@ typedef void (flash_attn_ext_f16_t)(
        device const char * v,
        device const char * mask,
        device      float * dst,
-       constant  int64_t & ne00,
        constant  int64_t & ne01,
        constant  int64_t & ne02,
        constant  int64_t & ne03,
-       constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant uint64_t & nb03,
-       constant  int64_t & ne10,
        constant  int64_t & ne11,
        constant  int64_t & ne12,
        constant  int64_t & ne13,
-       constant uint64_t & nb10,
        constant uint64_t & nb11,
        constant uint64_t & nb12,
        constant uint64_t & nb13,
-       constant  int64_t & ne31,
+       constant uint64_t & nb21,
+       constant uint64_t & nb22,
+       constant uint64_t & nb23,
        constant uint64_t & nb31,
-       constant  int64_t & ne0,
        constant  int64_t & ne1,
        constant  int64_t & ne2,
-       constant  int64_t & ne3,
        constant    float & scale,
+       constant    float & max_bias,
+       constant    float & m0,
+       constant    float & m1,
+       constant uint32_t & n_head_log2,
        threadgroup half * shared,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
@@ -2138,29 +2092,29 @@ kernel void kernel_flash_attn_ext_f16(
        device const char * v,
        device const char * mask,
        device      float * dst,
-       constant  int64_t & ne00,
        constant  int64_t & ne01,
        constant  int64_t & ne02,
        constant  int64_t & ne03,
-       constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant uint64_t & nb03,
-       constant  int64_t & ne10,
        constant  int64_t & ne11,
        constant  int64_t & ne12,
        constant  int64_t & ne13,
-       constant uint64_t & nb10,
        constant uint64_t & nb11,
        constant uint64_t & nb12,
        constant uint64_t & nb13,
-       constant  int64_t & ne31,
+       constant uint64_t & nb21,
+       constant uint64_t & nb22,
+       constant uint64_t & nb23,
        constant uint64_t & nb31,
-       constant  int64_t & ne0,
        constant  int64_t & ne1,
        constant  int64_t & ne2,
-       constant  int64_t & ne3,
        constant    float & scale,
+       constant    float & max_bias,
+       constant    float & m0,
+       constant    float & m1,
+       constant uint32_t & n_head_log2,
        threadgroup half * shared [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
@@ -2225,10 +2179,6 @@ kernel void kernel_flash_attn_ext_f16(
    const short ne22 = ne12;
    const short ne23 = ne13;

-   const uint nb21 = nb11;
-   const uint nb22 = nb12;
-   const uint nb23 = nb13;
-
    // broadcast
    const short rk2 = ne02/ne12;
    const short rk3 = ne03/ne13;
@@ -2257,6 +2207,19 @@ kernel void kernel_flash_attn_ext_f16(
    // prepare diagonal scale matrix
    simdgroup_float8x8 mscale(scale);

+   // prepare diagonal slope matrix
+   simdgroup_float8x8 mslope(1.0f);
+
+   // ALiBi
+   if (max_bias > 0.0f) {
+       const uint32_t h = iq2;
+
+       const float base = h < n_head_log2 ? m0 : m1;
+       const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+       mslope = simdgroup_float8x8(pow(base, exph));
+   }
+
    // loop over the KV cache
    // each simdgroup handles blocks of Q rows and C columns
    for (int ic0 = 0; ic0 < ne11; ic0 += C*nsg) {
@@ -2279,10 +2242,16 @@ kernel void kernel_flash_attn_ext_f16(
                    simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
                }

-               // mqk = mqk*scale + mask
+               if (mask != q) {
+                   // mqk = mqk*scale + mask*slope
                    simdgroup_half8x8 mm;
                    simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
+                   simdgroup_multiply(mm, mslope, mm);
                    simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
+               } else {
+                   // mqk = mqk*scale
+                   simdgroup_multiply(mqk, mscale, mqk);
+               }

                simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
            }

@@ -2456,29 +2425,29 @@ kernel void kernel_flash_attn_ext_vec_f16(
        device const char * v,
        device const char * mask,
        device      float * dst,
-       constant  int64_t & ne00,
        constant  int64_t & ne01,
        constant  int64_t & ne02,
        constant  int64_t & ne03,
-       constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant uint64_t & nb03,
-       constant  int64_t & ne10,
        constant  int64_t & ne11,
        constant  int64_t & ne12,
        constant  int64_t & ne13,
-       constant uint64_t & nb10,
        constant uint64_t & nb11,
        constant uint64_t & nb12,
        constant uint64_t & nb13,
-       constant  int64_t & ne31,
+       constant uint64_t & nb21,
+       constant uint64_t & nb22,
+       constant uint64_t & nb23,
        constant uint64_t & nb31,
-       constant  int64_t & ne0,
        constant  int64_t & ne1,
        constant  int64_t & ne2,
-       constant  int64_t & ne3,
        constant    float & scale,
+       constant    float & max_bias,
+       constant    float & m0,
+       constant    float & m1,
+       constant uint32_t & n_head_log2,
        threadgroup half * shared [[threadgroup(0)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
@@ -2497,6 +2466,18 @@ kernel void kernel_flash_attn_ext_vec_f16(

    const short T = D + 2*nsg*SH; // shared memory size per query in (half)

+   float slope = 1.0f;
+
+   // ALiBi
+   if (max_bias > 0.0f) {
+       const uint32_t h = iq2;
+
+       const float base = h < n_head_log2 ? m0 : m1;
+       const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+       slope = pow(base, exp);
+   }
+
    //threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data
    threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4
    threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
@@ -2537,10 +2518,6 @@ kernel void kernel_flash_attn_ext_vec_f16(
    const short ne22 = ne12;
    const short ne23 = ne13;

-   const uint nb21 = nb11;
-   const uint nb22 = nb12;
-   const uint nb23 = nb13;
-
    // broadcast
    const short rk2 = ne02/ne12;
    const short rk3 = ne03/ne13;
@@ -2603,10 +2580,9 @@ kernel void kernel_flash_attn_ext_vec_f16(
                    mqk += simd_shuffle_down(mqk, 2);
                    mqk += simd_shuffle_down(mqk, 1);

-                   // mqk = mqk*scale + mask
+                   // mqk = mqk*scale + mask*slope
                    if (tiisg == 0) {
-                       float4 mm = (float4) mp4[ic/4 + cc];
-                       mqk = mqk*scale + mm;
+                       mqk = mqk*scale + ((mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f);

                        ss4[cc] = mqk;
                    }
@@ -2840,7 +2816,8 @@ kernel void kernel_cpy_f32_f16(
    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);

-       dst_data[i00] = src[0];
+       // TODO: is there a better way to handle -INFINITY?
+       dst_data[i00] = src[0] == -INFINITY ? -MAXHALF : src[0];
    }
}

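The kernel_cpy_f32_f16 change above clamps an exact -INFINITY (as produced by masked-out positions) to the most negative finite half before the f32 to f16 store, so it cannot propagate as an f16 infinity through later arithmetic. A tiny host-side sketch of the same guard (the conversion itself is left to whatever f16 facility the caller has):

    #include <cmath>

    // Clamp applied before an f32 -> f16 store, matching the Metal change above:
    // -INFINITY becomes the largest-magnitude negative finite half instead.
    static float clamp_for_f16(float v) {
        const float max_half = 65504.0f;   // largest finite f16 magnitude
        return v == -INFINITY ? -max_half : v;
    }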
2175
ggml-quants.c
File diff suppressed because it is too large
1023
ggml-rpc.cpp
Normal file
File diff suppressed because it is too large
24
ggml-rpc.h
Normal file
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_RPC_MAX_SERVERS 16
+
+// backend API
+GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
+
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+
+GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+
+GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+
+#ifdef __cplusplus
+}
+#endif
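ggml-rpc.h is a new public header, and the declarations above are the whole client-facing surface of the RPC backend. A hedged usage sketch built only from those declarations plus ggml_backend_free from ggml-backend.h (the endpoint string is illustrative; error handling and the actual compute graph are omitted):

    #include "ggml-rpc.h"
    #include <cstdio>

    int main() {
        const char * endpoint = "127.0.0.1:50052";   // hypothetical server address

        // client side: create a backend that forwards work to the remote server
        ggml_backend_t backend = ggml_backend_rpc_init(endpoint);
        if (!backend || !ggml_backend_is_rpc(backend)) {
            return 1;
        }

        size_t free_mem = 0, total_mem = 0;
        ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);
        printf("remote device: %zu / %zu bytes free\n", free_mem, total_mem);

        // buffers for remote tensors come from the matching buffer type
        ggml_backend_buffer_type_t buft = ggml_backend_rpc_buffer_type(endpoint);
        (void) buft;

        ggml_backend_free(backend);
        return 0;
    }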
167
ggml-sycl.cpp
@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
#define SYCL_SCALE_BLOCK_SIZE 256
#define SYCL_CLAMP_BLOCK_SIZE 256
#define SYCL_ROPE_BLOCK_SIZE 256
-#define SYCL_ALIBI_BLOCK_SIZE 32
#define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
#define SYCL_QUANTIZE_BLOCK_SIZE 256
#define SYCL_DEQUANTIZE_BLOCK_SIZE 256

@@ -9316,32 +9315,6 @@ static void rope_glm_f32(
        dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
    }
}

-static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
-                      const int n_heads_log2_floor, const float m0, const float m1,
-                      const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i = row*ncols + col;
-
-    const int k = row/k_rows;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = dpct::pow(m0, k + 1);
-    } else {
-        m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    dst[i] = col * m_k + x[i];
-}
-
static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
                           const sycl::nd_item<3> &item_ct1) {
    const int row = item_ct1.get_group(1);

@@ -9443,7 +9416,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con


template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
                          const int nrows_y, const float scale, const float max_bias, const float m0,
                          const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
@@ -9457,7 +9430,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
    const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
    const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;

-   float slope = 0.0f;
+   float slope = 1.0f;

    // ALiBi
    if (max_bias > 0.0f) {
@@ -9482,7 +9455,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
        const int ix = rowx*ncols + col;
        const int iy = rowy*ncols + col;

-       const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
+       const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);

        vals[col] = val;
        max_val = sycl::max(max_val, val);

@@ -12964,20 +12937,6 @@ static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
        });
}

-static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
-                           const int nrows, const int k_rows,
-                           const int n_heads_log2_floor, const float m0,
-                           const float m1, dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
-    const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             alibi_f32(x, dst, ncols, k_rows,
-                                       n_heads_log2_floor, m0, m1, item_ct1);
-                         });
-}
-
static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
                              const int nrows, dpct::queue_ptr stream) {
    const sycl::range<3> block_dims(1, 1, WARP_SIZE);

@@ -13058,7 +13017,7 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
}

template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
+static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
                                    const int nrows_y, const float scale, const float max_bias, const float m0,
                                    const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
                                    const size_t n_local_scratch, dpct::queue_ptr stream) {
@@ -13068,7 +13027,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
        cgh.parallel_for(
            sycl::nd_range<3>(block_nums * block_dims, block_dims),
            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-               soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
+               soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
                                                                             nrows_y, scale, max_bias, m0,
                                                                             m1, n_head_log2, item_ct1,
                                                                             local_buf_acc.get_pointer());
@@ -13076,7 +13035,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
        });
}

-static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
+static void soft_max_f32_sycl(const float * x, const float * mask,
                              float * dst, const int ncols_x, const int nrows_x,
                              const int nrows_y, const float scale, const float max_bias,
                              dpct::queue_ptr stream) {

@@ -13098,60 +13057,60 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
    const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
    if (n_local_scratch*sizeof(float) < local_mem_size) {
        if (ncols_x > max_block_size) {
-           soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+           soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                               max_bias, m0, m1, n_head_log2, block_nums,
                                               block_dims, n_local_scratch, stream);
            return;
        }
        switch (ncols_x) {
            case 32:
-               soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
                                                     max_bias, m0, m1, n_head_log2, block_nums,
                                                     block_dims, n_local_scratch, stream);
                break;
            case 64:
-               soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
                                                     max_bias, m0, m1, n_head_log2, block_nums,
                                                     block_dims, n_local_scratch, stream);
                break;
            case 128:
-               soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
                                                       max_bias, m0, m1, n_head_log2, block_nums,
                                                       block_dims, n_local_scratch, stream);
                break;
            case 256:
-               soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
                                                       max_bias, m0, m1, n_head_log2, block_nums,
                                                       block_dims, n_local_scratch, stream);
                break;
            case 512:
-               soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
                                                       max_bias, m0, m1, n_head_log2, block_nums,
                                                       block_dims, n_local_scratch, stream);
                break;
            case 1024:
-               soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                         max_bias, m0, m1, n_head_log2, block_nums,
                                                         block_dims, n_local_scratch, stream);
                break;
            case 2048:
-               soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                         max_bias, m0, m1, n_head_log2, block_nums,
                                                         block_dims, n_local_scratch, stream);
                break;
            case 4096:
-               soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
                                                         max_bias, m0, m1, n_head_log2, block_nums,
                                                         block_dims, n_local_scratch, stream);
                break;
            default:
-               soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+               soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
                                                   max_bias, m0, m1, n_head_log2, block_nums,
                                                   block_dims, n_local_scratch, stream);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
|
soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
|
||||||
max_bias, m0, m1, n_head_log2, block_nums,
|
max_bias, m0, m1, n_head_log2, block_nums,
|
||||||
block_dims, WARP_SIZE, stream);
|
block_dims, WARP_SIZE, stream);
|
||||||
}
|
}
|
||||||
@@ -14028,6 +13987,10 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
 
+#pragma message("TODO: generalize upscale operator")
+#pragma message(" https://github.com/ggerganov/ggml/pull/814")
+    GGML_ASSERT(false && "TODO: generalize upscale operator");
+
     const int scale_factor = dst->op_params[0];
 
     upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
@@ -14562,36 +14525,6 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     (void) src1_dd;
 }
 
-inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    //GGML_ASSERT(ne01 + n_past == ne00);
-    GGML_ASSERT(n_head == ne02);
-
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
-
-    (void) src1;
-    (void) src1_dd;
-}
-
 static void ggml_sycl_op_pool2d(const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_dd, const float *src1_dd,
@@ -14746,12 +14679,9 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    const ggml_tensor * src2 = dst->src[2];
-
-#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
+#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
     GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-    GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
 
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows_x = ggml_nrows(src0);
@@ -14763,25 +14693,7 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
     memcpy(&scale, dst->op_params + 0, sizeof(float));
     memcpy(&max_bias, dst->op_params + 1, sizeof(float));
 
-    // positions tensor
-    float * src2_dd = nullptr;
-    sycl_pool_alloc<float> src2_f;
-
-    const bool use_src2 = src2 != nullptr;
-
-    if (use_src2) {
-        const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
-
-        if (src2_on_device) {
-            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
-            src2_dd = (float *) src2_extra->data_device[g_main_device];
-        } else {
-            src2_dd = src2_f.alloc(ggml_nelements(src2));
-            SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
-        }
-    }
-
-    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
+    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
                       nrows_x, nrows_y, scale, max_bias, main_stream);
 }
 
@@ -15656,26 +15568,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
     const int64_t r2 = ne12/ne02;
     const int64_t r3 = ne13/ne03;
 
-#if 0
-    // use syclGemmEx
-    {
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                SYCL_CHECK(
-                        syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , SYCL_R_16F, nb01/sizeof(half),
-                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float),
-                            beta,  ( char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
-                            cu_compute_type,
-                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-            }
-        }
-    }
-#else
     if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
@@ -15687,7 +15579,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
             nb11 / nb10, nb12 / nb10, beta,
             (char *)dst_t, cu_data_type, ne01, nb2 / nb0,
             ne12 * ne13, cu_compute_type)));
-        g_sycl_handles[g_main_device]->wait();
     } else {
         const int ne23 = ne12*ne13;
 
@@ -15718,7 +15609,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
                                           nb02, nb03, nb12_scaled, nb13_scaled,
                                           nbd2, nbd3, r2, r3, item_ct1);
                     });
-            }).wait();
+            });
         }
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
             *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
@@ -15729,9 +15620,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
             dpct::library_data_t::real_half, nb11 / nb10, beta,
             (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
             cu_compute_type)));
-        g_sycl_handles[g_main_device]->wait();
     }
-#endif
 
     if (no_mixed_dtypes) {
         const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
@@ -16232,10 +16121,6 @@ static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, g
     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
 }
 
-static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
-}
-
 static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d);
 }
@@ -16612,9 +16497,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
         case GGML_OP_ROPE:
             func = ggml_sycl_rope;
             break;
-        case GGML_OP_ALIBI:
-            func = ggml_sycl_alibi;
-            break;
         case GGML_OP_IM2COL:
             func = ggml_sycl_im2col;
             break;
@@ -17744,7 +17626,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
@@ -3830,9 +3830,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         return nullptr;
     case GGML_OP_SOFT_MAX:
         GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
-        GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32 || src2->type == GGML_TYPE_F16);
 
-        if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && (src2 == nullptr || src2->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
+        if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_soft_max_f32;
         }
         if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && src2->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
@@ -4286,6 +4285,9 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
+#pragma message("TODO: src2 is no longer used in soft_max - should be removed and ALiBi calculation should be updated")
+#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
+
     ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_SOFT_MAX, {
         ncols,
         src1 != nullptr ? nrows_y : (uint32_t)0,
469
ggml.c
@@ -4,7 +4,6 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
-#include "sgemm.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -37,6 +36,10 @@
 #undef GGML_USE_LLAMAFILE
 #endif
 
+#ifdef GGML_USE_LLAMAFILE
+#include "sgemm.h"
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -1303,6 +1306,8 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_VEC_ZERO           GGML_F32x4_ZERO
 #define GGML_F16_VEC_SET1           GGML_F32x4_SET1
 #define GGML_F16_VEC_FMA            GGML_F32x4_FMA
+#define GGML_F16_VEC_ADD            GGML_F32x4_ADD
+#define GGML_F16_VEC_MUL            GGML_F32x4_MUL
 #define GGML_F16_VEC_REDUCE         GGML_F32x4_REDUCE
 // Use vec_xl, not vec_ld, in case the load address is not aligned.
 #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
@@ -1949,6 +1954,7 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
 // TODO: optimize performance
 inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
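Note: the new ggml_vec_sigmoid_f32 above applies the logistic function element-wise, y[i] = 1 / (1 + exp(-x[i])). A minimal, self-contained C sketch of the same computation; the helper name and the driver below are illustrative, not part of the diff:

#include <math.h>
#include <stdio.h>

/* same formula as ggml_vec_sigmoid_f32: y[i] = 1 / (1 + exp(-x[i])) */
static void vec_sigmoid_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = 1.f / (1.f + expf(-x[i]));
    }
}

int main(void) {
    const float x[3] = { -1.0f, 0.0f, 1.0f };
    float y[3];
    vec_sigmoid_f32(3, y, x);
    printf("%.3f %.3f %.3f\n", y[0], y[1], y[2]); /* ~0.269 0.500 0.731 */
    return 0;
}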
@@ -2185,7 +2191,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "SOFT_MAX_BACK",
     "ROPE",
     "ROPE_BACK",
-    "ALIBI",
     "CLAMP",
     "CONV_TRANSPOSE_1D",
     "IM2COL",
@@ -2227,7 +2232,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -2276,7 +2281,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "soft_max_back(x)",
     "rope(x)",
     "rope_back(x)",
-    "alibi(x)",
     "clamp(x)",
     "conv_transpose_1d(x)",
     "im2col(x)",
@@ -2318,7 +2322,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -2331,6 +2335,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "TANH",
     "ELU",
     "RELU",
+    "SIGMOID",
     "GELU",
     "GELU_QUICK",
     "SILU",
@@ -2338,7 +2343,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "HARDSIGMOID",
 };
 
-static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
+static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");
 
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -2819,6 +2824,16 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
         (t0->ne[3] == t1->ne[3] );
 }
 
+bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        (t0->nb[0] == t1->nb[0] ) &&
+        (t0->nb[1] == t1->nb[1] ) &&
+        (t0->nb[2] == t1->nb[2] ) &&
+        (t0->nb[3] == t1->nb[3] );
+}
+
 // check if t1 can be represented as a repeatition of t0
 static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@@ -3163,6 +3178,12 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 
     struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
 
+#ifdef __clang__
+    // temporary until ggml_tensor::backend is removed
+    #pragma clang diagnostic push
+    #pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
     *result = (struct ggml_tensor) {
         /*.type         =*/ type,
         /*.backend      =*/ GGML_BACKEND_TYPE_CPU,
@@ -3185,6 +3206,10 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.padding      =*/ { 0 },
     };
 
+#ifdef __clang__
+    #pragma clang diagnostic pop
+#endif
+
     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
     //ggml_assert_aligned(result->data);
 
@@ -4563,6 +4588,20 @@ struct ggml_tensor * ggml_leaky_relu(
     return result;
 }
 
+// ggml_sigmoid
+
+struct ggml_tensor * ggml_sigmoid(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
+}
+
+struct ggml_tensor * ggml_sigmoid_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
+}
+
 // ggml_gelu
 
 struct ggml_tensor * ggml_gelu(
@@ -5646,7 +5685,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * mask,
-        struct ggml_tensor  * pos,
         float                 scale,
         float                 max_bias,
         bool                  inplace) {
@@ -5660,18 +5698,8 @@ static struct ggml_tensor * ggml_soft_max_impl(
         GGML_ASSERT(mask->ne[1] >= a->ne[1]);
     }
 
-    if (pos) {
-        GGML_ASSERT(ggml_is_vector(pos));
-        GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
-        GGML_ASSERT(pos->ne[0] == a->ne[0]);
-    }
-
-    if (pos && mask) {
-        GGML_ASSERT(pos->type == mask->type);
-    }
-
     if (max_bias > 0.0f) {
-        GGML_ASSERT(pos);
+        GGML_ASSERT(mask);
     }
 
     bool is_node = false;
@@ -5689,7 +5717,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = mask;
-    result->src[2] = pos;
 
     return result;
 }
@@ -5697,23 +5724,22 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
         struct ggml_context * ctx,
         struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
 }
 
 struct ggml_tensor * ggml_soft_max_inplace(
         struct ggml_context * ctx,
        struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
+    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
 }
 
 struct ggml_tensor * ggml_soft_max_ext(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
         struct ggml_tensor  * mask,
-        struct ggml_tensor  * pos,
         float                 scale,
         float                 max_bias) {
-    return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
+    return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
 }
 
 // ggml_soft_max_back
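Note: with the pos argument gone, ggml_soft_max_ext computes soft_max(a*scale + mask*slope), where the ALiBi slope is derived from max_bias rather than read from a separate positions tensor. A minimal single-row sketch of that fused computation in plain C; the function name is illustrative and assumes the slope has already been computed for the current head:

#include <math.h>

/* one row of the fused op: dst = softmax(a*scale + slope*mask) */
static void soft_max_row(int n, float * dst, const float * a, const float * mask,
                         float scale, float slope) {
    float maxv = -INFINITY;
    for (int i = 0; i < n; ++i) {
        dst[i] = a[i]*scale + (mask ? slope*mask[i] : 0.0f);
        if (dst[i] > maxv) maxv = dst[i];
    }
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) { dst[i] = expf(dst[i] - maxv); sum += dst[i]; }
    for (int i = 0; i < n; ++i) { dst[i] /= sum; }
}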
@@ -5928,37 +5954,6 @@ struct ggml_tensor * ggml_rope_back(
     return result;
 }
 
-// ggml_alibi
-
-struct ggml_tensor * ggml_alibi(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past,
-        int                   n_head,
-        float                 bias_max) {
-    GGML_ASSERT(n_past >= 0);
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    int32_t op_params[3] = { n_past, n_head };
-    memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result, op_params, sizeof(op_params));
-
-    result->op   = GGML_OP_ALIBI;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-
-    return result;
-}
-
 // ggml_clamp
 
 struct ggml_tensor * ggml_clamp(
@@ -6308,7 +6303,10 @@ struct ggml_tensor * ggml_pool_2d(
 static struct ggml_tensor * ggml_upscale_impl(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
-    int scale_factor) {
+    int ne0,
+    int ne1,
+    int ne2,
+    int ne3) {
     bool is_node = false;
 
     if (a->grad) {
@@ -6316,19 +6314,45 @@ static struct ggml_tensor * ggml_upscale_impl(
         is_node = true;
     }
 
+    GGML_ASSERT(a->ne[0] <= ne0);
+    GGML_ASSERT(a->ne[1] <= ne1);
+    GGML_ASSERT(a->ne[2] <= ne2);
+    GGML_ASSERT(a->ne[3] <= ne3);
+
     struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
-        a->ne[0] * scale_factor,
-        a->ne[1] * scale_factor,
-        a->ne[2], a->ne[3]);
+        ne0,
+        ne1,
+        ne2,
+        ne3
+        );
 
     result->op = GGML_OP_UPSCALE;
-    result->op_params[0] = scale_factor;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
 
     return result;
 }
 
+struct ggml_tensor * ggml_upscale(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int scale_factor) {
+    return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
+}
+
+struct ggml_tensor * ggml_upscale_ext(
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int ne0,
+    int ne1,
+    int ne2,
+    int ne3) {
+    return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
+}
+
+// ggml_pad
+
 struct ggml_tensor * ggml_pad(
     struct ggml_context * ctx,
     struct ggml_tensor  * a,
@@ -6353,12 +6377,7 @@ struct ggml_tensor * ggml_pad(
     return result;
 }
 
-struct ggml_tensor * ggml_upscale(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    int scale_factor) {
-    return ggml_upscale_impl(ctx, a, scale_factor);
-}
+// ggml_arange
 
 struct ggml_tensor * ggml_arange(
     struct ggml_context * ctx,
@@ -6380,6 +6399,8 @@ struct ggml_tensor * ggml_arange(
     return result;
 }
 
+// ggml_timestep_embedding
+
 struct ggml_tensor * ggml_timestep_embedding(
     struct ggml_context * ctx,
     struct ggml_tensor  * timesteps,
@@ -6486,9 +6507,11 @@ struct ggml_tensor * ggml_flash_attn_ext(
         struct ggml_tensor  * k,
         struct ggml_tensor  * v,
         struct ggml_tensor  * mask,
-        float                 scale) {
+        float                 scale,
+        float                 max_bias) {
     GGML_ASSERT(ggml_can_mul_mat(k, q));
     // TODO: check if vT can be multiplied by (k*qT)
 
     if (mask) {
         GGML_ASSERT(ggml_is_contiguous(mask));
         GGML_ASSERT(mask->ne[2] == 1);
@@ -6498,6 +6521,10 @@ struct ggml_tensor * ggml_flash_attn_ext(
         //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
     }
 
+    if (max_bias > 0.0f) {
+        GGML_ASSERT(mask);
+    }
+
     bool is_node = false;
 
     if (q->grad || k->grad || v->grad) {
@@ -6508,7 +6535,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
     int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
-    float params[] = { scale };
+    float params[] = { scale, max_bias };
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_FLASH_ATTN_EXT;
@@ -6528,7 +6555,7 @@ void ggml_flash_attn_ext_set_prec(
 
     const int32_t prec_i32 = (int32_t) prec;
 
-    ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos
+    ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
 }
 
 // ggml_flash_ff
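Note: ggml_flash_attn_ext now stores two floats in op_params (scale first, max_bias second), which is why the precision flag written by ggml_flash_attn_ext_set_prec moved from index 1 to index 2. A small sketch of packing and unpacking floats through an int32 parameter array, as the diff does with memcpy; the helper names and the fixed array size are illustrative:

#include <stdint.h>
#include <string.h>

static void pack_attn_params(int32_t op_params[4], float scale, float max_bias, int32_t prec) {
    memcpy(&op_params[0], &scale,    sizeof(float)); /* index 0: scale */
    memcpy(&op_params[1], &max_bias, sizeof(float)); /* index 1: max_bias */
    op_params[2] = prec;                             /* index 2: precision flag */
}

static void unpack_attn_params(const int32_t op_params[4], float * scale, float * max_bias) {
    memcpy(scale,    &op_params[0], sizeof(float));
    memcpy(max_bias, &op_params[1], sizeof(float));
}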
@@ -10892,6 +10919,52 @@ static void ggml_compute_forward_relu(
     }
 }
 
+// ggml_compute_forward_sigmoid
+
+static void ggml_compute_forward_sigmoid_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(params->ith == 0);
+    assert(ggml_are_same_shape(src0, dst));
+
+    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
+        return;
+    }
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert(dst->nb[0]  == sizeof(float));
+    assert(src0->nb[0] == sizeof(float));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sigmoid_f32(nc,
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_sigmoid(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_sigmoid_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_gelu
 
 static void ggml_compute_forward_gelu_f32(
@@ -13333,7 +13406,6 @@ static void ggml_compute_forward_soft_max_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src2 = dst->src[2];
 
     assert(ggml_is_contiguous(dst));
     assert(ggml_are_same_shape(src0, dst));
@@ -13359,8 +13431,8 @@ static void ggml_compute_forward_soft_max_f32(
 
     // TODO: is this supposed to be ceil instead of floor?
     //       https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
-    const uint32_t n_head_kv   = ne02;
-    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
+    const uint32_t n_head      = ne02;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
 
     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
@@ -13377,13 +13449,13 @@ static void ggml_compute_forward_soft_max_f32(
 
     float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
 
-    // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
-    ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
-    float       * pos_f32 = src2 ? (float       *) src2->data : src0->data;
-
-    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
 
     for (int i1 = ir0; i1 < ir1; i1++) {
+        // ALiBi
+        const uint32_t h = (i1/ne01)%ne02; // head
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
         float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
         float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
 
@@ -13396,27 +13468,11 @@ static void ggml_compute_forward_soft_max_f32(
         if (mp_f32) {
             if (use_f16) {
                 for (int i = 0; i < nc; ++i) {
-                    wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
+                    wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
                 }
             } else {
                 for (int i = 0; i < nc; ++i) {
-                    wp[i] += mp_f32[i];
-                }
-            }
-        }
-
-        // ALiBi bias
-        if (max_bias > 0.0f) {
-            const uint32_t h = (i1/ne01)%ne02; // head
-            const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
-
-            if (use_f16) {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]);
-                }
-            } else {
-                for (int i = 0; i < nc; ++i) {
-                    wp[i] += slope*pos_f32[i];
+                    wp[i] += slope*mp_f32[i];
                 }
             }
         }
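Note: both the soft max and the flash attention paths now derive the per-head ALiBi slope directly from max_bias and the head index, then scale the mask with it. The same formula as above, pulled out into a standalone helper for reference; the helper itself is illustrative, not part of the diff:

#include <math.h>
#include <stdint.h>

/* slope for head h out of n_head heads; 1.0f when ALiBi is disabled (max_bias <= 0) */
static float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    return h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
}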
@@ -13578,178 +13634,6 @@ static void ggml_compute_forward_soft_max_back(
     }
 }
 
-// ggml_compute_forward_alibi
-
-static void ggml_compute_forward_alibi_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
-    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
-    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int64_t n  = ggml_nrows(src0);
-    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
-
-    const size_t nb0 = src0->nb[0];
-    const size_t nb1 = src0->nb[1];
-    const size_t nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int64_t k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
-
-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
-
-        for (int64_t i = 0; i < ne0; i++) {
-            for (int64_t j = 0; j < ne1; j++) {
-                float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-                pdst[0] = i * m_k + src[0];
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi_f16(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int n  = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
-
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
-    //GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
-    GGML_ASSERT(n_head == ne2);
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    for (int k = 0; k < ne2_ne3; k++) {
-        // TODO: k*nb2 or k*nb3
-        float m_k;
-
-        if (k < n_heads_log2_floor) {
-            m_k = powf(m0, k + 1);
-        } else {
-            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-        }
-
-        for (int i = 0; i < ne0; i++) {
-            for (int j = 0; j < ne1; j++) {
-                ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float *            pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-
-                // we return F32
-                pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_alibi_f16(params, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_alibi_f32(params, dst);
-            } break;
-        case GGML_TYPE_BF16:
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:
-        case GGML_TYPE_IQ3_XXS:
-        case GGML_TYPE_IQ1_S:
-        case GGML_TYPE_IQ1_M:
-        case GGML_TYPE_IQ4_NL:
-        case GGML_TYPE_IQ4_XS:
-        case GGML_TYPE_IQ3_S:
-        case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q8_K:
-        case GGML_TYPE_I8:
-        case GGML_TYPE_I16:
-        case GGML_TYPE_I32:
-        case GGML_TYPE_I64:
-        case GGML_TYPE_F64:
-        case GGML_TYPE_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_clamp
 
 static void ggml_compute_forward_clamp_f32(
@@ -14972,25 +14856,28 @@ static void ggml_compute_forward_upscale_f32(
         return;
     }
 
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
 
     const int ith = params->ith;
     const int nth = params->nth;
 
     GGML_TENSOR_UNARY_OP_LOCALS
 
-    const int scale_factor = dst->op_params[0];
+    const float sf0 = (float)ne0/src0->ne[0];
+    const float sf1 = (float)ne1/src0->ne[1];
+    const float sf2 = (float)ne2/src0->ne[2];
+    const float sf3 = (float)ne3/src0->ne[3];
 
     // TODO: optimize
 
     for (int64_t i3 = 0; i3 < ne3; i3++) {
-        const int64_t i03 = i3;
+        const int64_t i03 = i3 / sf3;
         for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-            const int64_t i02 = i2;
+            const int64_t i02 = i2 / sf2;
             for (int64_t i1 = 0; i1 < ne1; i1++) {
-                const int64_t i01 = i1 / scale_factor;
+                const int64_t i01 = i1 / sf1;
                 for (int64_t i0 = 0; i0 < ne0; i0++) {
-                    const int64_t i00 = i0 / scale_factor;
+                    const int64_t i00 = i0 / sf0;
 
                     const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                     float       * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
@@ -15020,6 +14907,7 @@ static void ggml_compute_forward_upscale(
     }
 }
 
+
 // ggml_compute_forward_pad
 
 static void ggml_compute_forward_pad_f32(
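Note: the generalized upscale computes one scale factor per dimension (sf = ne_dst / ne_src) and maps every destination index back to a source index with a truncating division, which reproduces the old integer scale_factor behaviour as a special case. A tiny sketch of that mapping; the function name is illustrative:

#include <stdint.h>

/* nearest-neighbour source index for a destination index, as in the loop above */
static int64_t upscale_src_index(int64_t i_dst, int64_t ne_dst, int64_t ne_src) {
    const float sf = (float) ne_dst / (float) ne_src; /* assumes ne_src > 0 */
    return (int64_t) (i_dst / sf);
}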
@@ -15764,7 +15652,16 @@ static void ggml_compute_forward_flash_attn_ext_f16(
     const int ir1 = MIN(ir0 + dr, nr);
 
     float scale    = 1.0f;
+    float max_bias = 0.0f;
+
     memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+
+    const uint32_t n_head      = neq2;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
     // loop over n_batch and n_head
     for (int ir = ir0; ir < ir1; ++ir) {
@@ -15773,6 +15670,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         const int iq2 = (ir - iq3*neq2*neq1)/neq1;
         const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
 
+        const uint32_t h = iq2; // head
+        const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
+
         float S = 0.0f;
         float M = -INFINITY;
 
@@ -15796,7 +15696,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
         // loop over n_kv and n_head_kv
         // ref: https://arxiv.org/pdf/2112.05682.pdf
         for (int64_t ic = 0; ic < nek1; ++ic) {
-            const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
+            const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
             if (mv == -INFINITY) {
                 continue;
             }
@@ -15867,7 +15767,7 @@ static void ggml_compute_forward_flash_attn_ext(
         const struct ggml_tensor * v,
         const struct ggml_tensor * mask,
         struct ggml_tensor * dst) {
-    switch (dst->op_params[1]) {
+    switch (dst->op_params[2]) {
         case GGML_PREC_DEFAULT:
         case GGML_PREC_F32:
             {
@@ -16834,6 +16734,10 @@ static void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_relu(params, dst);
             } break;
+        case GGML_UNARY_OP_SIGMOID:
+            {
+                ggml_compute_forward_sigmoid(params, dst);
+            } break;
         case GGML_UNARY_OP_GELU:
             {
                 ggml_compute_forward_gelu(params, dst);
@@ -17630,10 +17534,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rope_back(params, tensor);
             } break;
-        case GGML_OP_ALIBI:
-            {
-                ggml_compute_forward_alibi(params, tensor);
-            } break;
         case GGML_OP_CLAMP:
             {
                 ggml_compute_forward_clamp(params, tensor);
@@ -18652,10 +18552,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
-        case GGML_OP_ALIBI:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
        case GGML_OP_CLAMP:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -18826,6 +18722,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
+        case GGML_UNARY_OP_SIGMOID:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
        case GGML_UNARY_OP_GELU:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -19355,6 +19255,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
                 case GGML_UNARY_OP_TANH:
                 case GGML_UNARY_OP_ELU:
                 case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_SIGMOID:
                 case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
                 case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
                     {
@@ -19428,10 +19329,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             {
                 n_tasks = n_threads;
             } break;
-        case GGML_OP_ALIBI:
-            {
-                n_tasks = 1; //TODO
-            } break;
         case GGML_OP_CLAMP:
             {
                 n_tasks = 1; //TODO
45
ggml.h
@@ -468,7 +468,6 @@ extern "C" {
         GGML_OP_SOFT_MAX_BACK,
         GGML_OP_ROPE,
         GGML_OP_ROPE_BACK,
-        GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
@@ -520,6 +519,7 @@ extern "C" {
         GGML_UNARY_OP_TANH,
         GGML_UNARY_OP_ELU,
         GGML_UNARY_OP_RELU,
+        GGML_UNARY_OP_SIGMOID,
         GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
@@ -565,7 +565,8 @@ extern "C" {
     // n-dimensional tensor
     struct ggml_tensor {
         enum ggml_type type;
-        enum ggml_backend_type backend;
+        GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
+
         struct ggml_backend_buffer * buffer;
 
@@ -766,7 +767,8 @@ extern "C" {
     GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
     GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
-    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+    GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
|
@ -1074,6 +1076,14 @@ extern "C" {
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_sigmoid(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_gelu(
|
GGML_API struct ggml_tensor * ggml_gelu(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
@@ -1428,15 +1438,13 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-   // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
+   // fused soft_max(a*scale + mask*(ALiBi slope))
    // mask is optional
-   // pos is required when max_bias > 0.0f
    // max_bias = 0.0f for no ALiBi
    GGML_API struct ggml_tensor * ggml_soft_max_ext(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * mask,
-           struct ggml_tensor  * pos,
            float                 scale,
            float                 max_bias);

@@ -1538,16 +1546,6 @@ extern "C" {
            float                 xpos_base,
            bool                  xpos_down);

-   // alibi position embedding
-   // in-place, returns view(a)
-   GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
-           struct ggml_context * ctx,
-           struct ggml_tensor  * a,
-           int                   n_past,
-           int                   n_head,
-           float                 bias_max),
-       "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
-
    // clamp
    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_clamp(

@@ -1677,12 +1675,24 @@ extern "C" {
            float                 p1);

    // nearest interpolate
+   // multiplies ne0 and ne1 by scale factor
    // used in stable-diffusion
    GGML_API struct ggml_tensor * ggml_upscale(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   scale_factor);

+   // nearest interpolate
+   // nearest interpolate to specified dimensions
+   // used in tortoise.cpp
+   GGML_API struct ggml_tensor * ggml_upscale_ext(
+           struct ggml_context * ctx,
+           struct ggml_tensor  * a,
+           int                   ne0,
+           int                   ne1,
+           int                   ne2,
+           int                   ne3);
+
    // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
    GGML_API struct ggml_tensor * ggml_pad(
            struct ggml_context * ctx,

@@ -1744,7 +1754,8 @@ extern "C" {
            struct ggml_tensor  * k,
            struct ggml_tensor  * v,
            struct ggml_tensor  * mask,
-           float                 scale);
+           float                 scale,
+           float                 max_bias);

    GGML_API void ggml_flash_attn_ext_set_prec(
            struct ggml_tensor * a,
@@ -1,5 +1,7 @@
 from .constants import *
+from .lazy import *
 from .gguf_reader import *
 from .gguf_writer import *
+from .quants import *
 from .tensor_mapping import *
 from .vocab import *
@@ -10,6 +10,7 @@ from typing import Any
 GGUF_MAGIC             = 0x46554747  # "GGUF"
 GGUF_VERSION           = 3
 GGUF_DEFAULT_ALIGNMENT = 32
+GGML_QUANT_VERSION     = 2  # GGML_QNT_VERSION from ggml.h

 #
 # metadata keys
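The hunk above only introduces the constant. A minimal sketch of how it is typically consumed when writing a GGUF file follows; GGUFWriter and add_quantization_version are existing gguf-py APIs, while the output path and architecture name are placeholders.

# sketch only: record the quantization version using the new constant
from gguf import GGUFWriter, GGML_QUANT_VERSION

writer = GGUFWriter("example.gguf", "llama")       # hypothetical output file and arch
writer.add_quantization_version(GGML_QUANT_VERSION)  # now typed as a plain int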
@@ -118,6 +119,7 @@ class MODEL_ARCH(IntEnum):
     REFACT       = auto()
     BERT         = auto()
     NOMIC_BERT   = auto()
+    JINA_BERT_V2 = auto()
     BLOOM        = auto()
     STABLELM     = auto()
     QWEN         = auto()

@@ -195,6 +197,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.REFACT:         "refact",
     MODEL_ARCH.BERT:           "bert",
     MODEL_ARCH.NOMIC_BERT:     "nomic-bert",
+    MODEL_ARCH.JINA_BERT_V2:   "jina-bert-v2",
     MODEL_ARCH.BLOOM:          "bloom",
     MODEL_ARCH.STABLELM:       "stablelm",
     MODEL_ARCH.QWEN:           "qwen",

@@ -380,6 +383,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.LAYER_OUT_NORM,
     ],
+    MODEL_ARCH.JINA_BERT_V2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+    ],
     MODEL_ARCH.MPT: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
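A rough sketch of how the new jina-bert-v2 entries are consumed; it assumes the TENSOR_NAMES table already present in constants.py (not shown in this diff), everything else uses the names added above.

from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES, MODEL_TENSORS, TENSOR_NAMES

arch = MODEL_ARCH.JINA_BERT_V2
print(MODEL_ARCH_NAMES[arch])      # "jina-bert-v2"
for tensor in MODEL_TENSORS[arch]:
    # prints the GGUF name stems, e.g. "token_embd", "blk.{bid}.attn_q", ...
    print(TENSOR_NAMES[tensor])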
@@ -820,6 +839,49 @@ class GGMLQuantizationType(IntEnum):
     BF16 = 30


+# TODO: add GGMLFileType from ggml_ftype in ggml.h
+
+
+# from llama_ftype in llama.h
+# ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE.
+class LlamaFileType(IntEnum):
+    ALL_F32              = 0
+    MOSTLY_F16           = 1   # except 1d tensors
+    MOSTLY_Q4_0          = 2   # except 1d tensors
+    MOSTLY_Q4_1          = 3   # except 1d tensors
+    MOSTLY_Q4_1_SOME_F16 = 4   # tok_embeddings.weight and output.weight are F16
+    # MOSTLY_Q4_2        = 5   # support has been removed
+    # MOSTLY_Q4_3        = 6   # support has been removed
+    MOSTLY_Q8_0          = 7   # except 1d tensors
+    MOSTLY_Q5_0          = 8   # except 1d tensors
+    MOSTLY_Q5_1          = 9   # except 1d tensors
+    MOSTLY_Q2_K          = 10  # except 1d tensors
+    MOSTLY_Q3_K_S        = 11  # except 1d tensors
+    MOSTLY_Q3_K_M        = 12  # except 1d tensors
+    MOSTLY_Q3_K_L        = 13  # except 1d tensors
+    MOSTLY_Q4_K_S        = 14  # except 1d tensors
+    MOSTLY_Q4_K_M        = 15  # except 1d tensors
+    MOSTLY_Q5_K_S        = 16  # except 1d tensors
+    MOSTLY_Q5_K_M        = 17  # except 1d tensors
+    MOSTLY_Q6_K          = 18  # except 1d tensors
+    MOSTLY_IQ2_XXS       = 19  # except 1d tensors
+    MOSTLY_IQ2_XS        = 20  # except 1d tensors
+    MOSTLY_Q2_K_S        = 21  # except 1d tensors
+    MOSTLY_IQ3_XS        = 22  # except 1d tensors
+    MOSTLY_IQ3_XXS       = 23  # except 1d tensors
+    MOSTLY_IQ1_S         = 24  # except 1d tensors
+    MOSTLY_IQ4_NL        = 25  # except 1d tensors
+    MOSTLY_IQ3_S         = 26  # except 1d tensors
+    MOSTLY_IQ3_M         = 27  # except 1d tensors
+    MOSTLY_IQ2_S         = 28  # except 1d tensors
+    MOSTLY_IQ2_M         = 29  # except 1d tensors
+    MOSTLY_IQ4_XS        = 30  # except 1d tensors
+    MOSTLY_IQ1_M         = 31  # except 1d tensors
+    MOSTLY_BF16          = 32  # except 1d tensors
+
+    GUESSED              = 1024  # not specified in the model file
+
+
 class GGUFEndian(IntEnum):
     LITTLE = 0
     BIG    = 1
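A small illustration of the new enum; per the comment above, the numeric values mirror llama_ftype in llama.h, so the integer written into general.file_type stays compatible with the C side.

from gguf.constants import LlamaFileType

ftype = LlamaFileType.MOSTLY_BF16
print(ftype.value)       # 32, same value as LLAMA_FTYPE_MOSTLY_BF16
print(LlamaFileType(7))  # LlamaFileType.MOSTLY_Q8_0, round-trip from the stored int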
@@ -7,12 +7,13 @@ import struct
 import tempfile
 from enum import Enum, auto
 from io import BufferedWriter
-from typing import IO, Any, Callable, Sequence, Mapping
+from typing import IO, Any, Sequence, Mapping
 from string import ascii_letters, digits

 import numpy as np

 from .constants import (
+    GGML_QUANT_SIZES,
     GGUF_DEFAULT_ALIGNMENT,
     GGUF_MAGIC,
     GGUF_VERSION,
@@ -28,47 +29,6 @@ from .constants import (
 logger = logging.getLogger(__name__)


-class LazyTensor:
-    data: Callable[[], np.ndarray[Any, Any]]
-    # to avoid too deep recursion
-    functions: list[Callable[[np.ndarray[Any, Any]], np.ndarray[Any, Any]]]
-    dtype: np.dtype[Any]
-    shape: tuple[int, ...]
-
-    def __init__(self, data: Callable[[], np.ndarray[Any, Any]], *, dtype: type, shape: tuple[int, ...]):
-        self.data = data
-        self.functions = []
-        self.dtype = np.dtype(dtype)
-        self.shape = shape
-
-    def astype(self, dtype: type, **kwargs) -> LazyTensor:
-        self.functions.append(lambda n: n.astype(dtype, **kwargs))
-        self.dtype = np.dtype(dtype)
-        return self
-
-    @property
-    def nbytes(self) -> int:
-        size = 1
-        for n in self.shape:
-            size *= n
-        return size * self.dtype.itemsize
-
-    def tofile(self, *args, **kwargs) -> None:
-        data = self.data()
-        for f in self.functions:
-            data = f(data)
-        assert data.shape == self.shape
-        assert data.dtype == self.dtype
-        assert data.nbytes == self.nbytes
-        self.functions = []
-        self.data = lambda: data
-        data.tofile(*args, **kwargs)
-
-    def byteswap(self, *args, **kwargs) -> LazyTensor:
-        self.functions.append(lambda n: n.byteswap(*args, **kwargs))
-        return self
-
-
 class WriterState(Enum):
     EMPTY  = auto()
     HEADER = auto()
@@ -79,7 +39,7 @@ class WriterState(Enum):
 class GGUFWriter:
     fout: BufferedWriter
     temp_file: tempfile.SpooledTemporaryFile[bytes] | None
-    tensors: list[np.ndarray[Any, Any] | LazyTensor]
+    tensors: list[np.ndarray[Any, Any]]
     _simple_value_packing = {
         GGUFValueType.UINT8:   "B",
         GGUFValueType.INT8:    "b",
@@ -236,7 +196,7 @@ class GGUFWriter:
         return ((x + n - 1) // n) * n

     def add_tensor_info(
-        self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32],
+        self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype,
         tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None,
     ) -> None:
         if self.state is not WriterState.EMPTY:
@@ -249,10 +209,6 @@ class GGUFWriter:
         encoded_name = name.encode("utf-8")
         self.ti_data += self._pack("Q", len(encoded_name))
         self.ti_data += encoded_name
-        n_dims = len(tensor_shape)
-        self.ti_data += self._pack("I", n_dims)
-        for i in range(n_dims):
-            self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
             if tensor_dtype == np.float16:
                 dtype = GGMLQuantizationType.F16
@@ -272,13 +228,22 @@ class GGUFWriter:
                 raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
         else:
             dtype = raw_dtype
+            if tensor_dtype == np.uint8:
+                block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
+                if tensor_shape[-1] % type_size != 0:
+                    raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
+                tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
+        n_dims = len(tensor_shape)
+        self.ti_data += self._pack("I", n_dims)
+        for i in range(n_dims):
+            self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         self.ti_data += self._pack("I", dtype)
         self.ti_data += self._pack("Q", self.offset_tensor)
         self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
         self.ti_data_count += 1

     def add_tensor(
-        self, name: str, tensor: np.ndarray[Any, Any] | LazyTensor, raw_shape: Sequence[int] | None = None,
+        self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
         raw_dtype: GGMLQuantizationType | None = None,
     ) -> None:
         if self.endianess == GGUFEndian.BIG:
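A hedged sketch of what the new quantized branch in add_tensor_info does with shapes: for Q8_0, GGML_QUANT_SIZES maps each block of 32 values to 34 raw bytes, and the writer derives the logical row length back from the uint8 byte shape. Only the gguf-py constants are real here, the tensor dimensions are invented.

import numpy as np
from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType

block_size, type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]  # (32, 34)

n_rows, n_cols = 4, 64                        # logical f32 shape of a tensor
raw_cols  = n_cols // block_size * type_size  # bytes per row after Q8_0 packing
raw_shape = (n_rows, raw_cols)                # shape of the uint8 data handed to the writer

# shape conversion performed by the hunk above
restored = raw_shape[:-1] + (raw_shape[-1] // type_size * block_size,)
assert restored == (n_rows, n_cols)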
@@ -303,7 +268,7 @@ class GGUFWriter:
         if pad != 0:
             fp.write(bytes([0] * pad))

-    def write_tensor_data(self, tensor: np.ndarray[Any, Any] | LazyTensor) -> None:
+    def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
         if self.state is not WriterState.TI_DATA:
             raise ValueError(f'Expected output file to contain tensor info, got {self.state}')
@@ -391,7 +356,7 @@ class GGUFWriter:
     def add_name(self, name: str) -> None:
         self.add_string(Keys.General.NAME, name)

-    def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None:
+    def add_quantization_version(self, quantization_version: int) -> None:
         self.add_uint32(
             Keys.General.QUANTIZATION_VERSION, quantization_version)
236 gguf-py/gguf/lazy.py (new file)
@@ -0,0 +1,236 @@
from __future__ import annotations
from abc import ABC, ABCMeta, abstractmethod

import logging
from typing import Any, Callable
from collections import deque

import numpy as np
from numpy._typing import _Shape
from numpy.typing import DTypeLike


logger = logging.getLogger(__name__)


class LazyMeta(ABCMeta):

    def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs):
        def __getattr__(self, __name: str) -> Any:
            meta_attr = getattr(self._meta, __name)
            if callable(meta_attr):
                return type(self)._wrap_fn(
                    (lambda s, *args, **kwargs: getattr(s, __name)(*args, **kwargs)),
                    use_self=self,
                )
            elif isinstance(meta_attr, self._tensor_type):
                # e.g. self.T with torch.Tensor should still be wrapped
                return type(self)._wrap_fn(lambda s: getattr(s, __name))(self)
            else:
                # no need to wrap non-tensor properties,
                # and they likely don't depend on the actual contents of the tensor
                return meta_attr

        namespace["__getattr__"] = __getattr__

        # need to make a builder for the wrapped wrapper to copy the name,
        # or else it fails with very cryptic error messages,
        # because somehow the same string would end up in every closures
        def mk_wrap(op_name: str, *, meta_noop: bool = False):
            # need to wrap the wrapper to get self
            def wrapped_special_op(self, *args, **kwargs):
                return type(self)._wrap_fn(
                    getattr(type(self)._tensor_type, op_name),
                    meta_noop=meta_noop,
                )(self, *args, **kwargs)
            return wrapped_special_op

        # special methods bypass __getattr__, so they need to be added manually
        # ref: https://docs.python.org/3/reference/datamodel.html#special-lookup
        # NOTE: doing this from a metaclass is very convenient
        # TODO: make this even more comprehensive
        for binary_op in (
            "lt", "le", "eq", "ne", "ge", "gt", "not"
            "abs", "add", "and", "floordiv", "invert", "lshift", "mod", "mul", "matmul",
            "neg", "or", "pos", "pow", "rshift", "sub", "truediv", "xor",
            "iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor",
            "radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor",
        ):
            attr_name = f"__{binary_op}__"
            # the result of these operators usually has the same shape and dtype as the input,
            # so evaluation on the meta tensor can be skipped.
            namespace[attr_name] = mk_wrap(attr_name, meta_noop=True)

        for special_op in (
            "getitem", "setitem", "len",
        ):
            attr_name = f"__{special_op}__"
            namespace[attr_name] = mk_wrap(attr_name, meta_noop=False)

        return super().__new__(cls, name, bases, namespace, **kwargs)


# Tree of lazy tensors
class LazyBase(ABC, metaclass=LazyMeta):
    _tensor_type: type
    _meta: Any
    _data: Any | None
    _lazy: deque[LazyBase]  # shared within a graph, to avoid deep recursion when making eager
    _args: tuple
    _func: Callable[[tuple], Any] | None

    def __init__(self, *, meta: Any, data: Any | None = None, lazy: deque[LazyBase] | None = None, args: tuple = (), func: Callable[[tuple], Any] | None = None):
        super().__init__()
        self._meta = meta
        self._data = data
        self._lazy = lazy if lazy is not None else deque()
        self._args = args
        self._func = func
        assert self._func is not None or self._data is not None
        if self._data is None:
            self._lazy.append(self)

    def __init_subclass__(cls) -> None:
        if "_tensor_type" not in cls.__dict__:
            raise TypeError(f"property '_tensor_type' must be defined for {cls!r}")
        return super().__init_subclass__()

    @staticmethod
    def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
        # TODO: dict and set
        if isinstance(o, (list, tuple)):
            L = []
            for item in o:
                L.append(LazyBase._recurse_apply(item, fn))
            if isinstance(o, tuple):
                L = tuple(L)
            return L
        elif isinstance(o, LazyBase):
            return fn(o)
        else:
            return o

    @classmethod
    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
        def wrapped_fn(*args, **kwargs):
            if kwargs is None:
                kwargs = {}
            args = ((use_self,) if use_self is not None else ()) + args

            meta_args = LazyBase._recurse_apply(args, lambda t: t._meta)

            if isinstance(meta_noop, bool) and not meta_noop:
                try:
                    res = fn(*meta_args, **kwargs)
                except NotImplementedError:
                    # running some operations on PyTorch's Meta tensors can cause this exception
                    res = None
            else:
                # some operators don't need to actually run on the meta tensors
                assert len(args) > 0
                res = args[0]
                assert isinstance(res, cls)
                res = res._meta
                # allow operations to override the dtype and shape
                if meta_noop is not True:
                    if isinstance(meta_noop, tuple):
                        dtype, shape = meta_noop
                        assert callable(shape)
                        res = cls.meta_with_dtype_and_shape(dtype, shape(res.shape))
                    else:
                        res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)

            if isinstance(res, cls._tensor_type):
                def collect_replace(t: LazyBase):
                    if collect_replace.shared_lazy is None:
                        collect_replace.shared_lazy = t._lazy
                    else:
                        collect_replace.shared_lazy.extend(t._lazy)
                        t._lazy = collect_replace.shared_lazy

                # emulating a static variable
                collect_replace.shared_lazy = None

                LazyBase._recurse_apply(args, collect_replace)

                shared_lazy = collect_replace.shared_lazy

                return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs))
            else:
                del res  # not needed
                # non-tensor return likely relies on the contents of the args
                # (e.g. the result of torch.equal)
                eager_args = cls.to_eager(args)
                return fn(*eager_args, **kwargs)
        return wrapped_fn

    @classmethod
    def to_eager(cls, t: Any) -> Any:
        def simple_to_eager(_t: LazyBase) -> Any:
            def already_eager_to_eager(_t: LazyBase) -> Any:
                assert _t._data is not None
                return _t._data

            while _t._data is None:
                lt = _t._lazy.popleft()
                if lt._data is not None:
                    # Lazy tensor did not belong in the lazy queue.
                    # Weirdly only happens with Bloom models...
                    # likely because tensors aren't unique in the queue.
                    # The final output is still the same as in eager mode,
                    # so it's safe to ignore this.
                    continue
                assert lt._func is not None
                lt._args = cls._recurse_apply(lt._args, already_eager_to_eager)
                lt._data = lt._func(lt._args)
                # sanity check
                assert lt._data.dtype == lt._meta.dtype
                assert lt._data.shape == lt._meta.shape

            return _t._data

        # recurse into lists and/or tuples, keeping their structure
        return cls._recurse_apply(t, simple_to_eager)

    @classmethod
    def eager_to_meta(cls, t: Any) -> Any:
        return cls.meta_with_dtype_and_shape(t.dtype, t.shape)

    # must be overridden, meta tensor init is backend-specific
    @classmethod
    @abstractmethod
    def meta_with_dtype_and_shape(cls, dtype: Any, shape: Any) -> Any: pass

    @classmethod
    def from_eager(cls, t: Any) -> Any:
        if type(t) is cls:
            # already eager
            return t
        elif isinstance(t, cls._tensor_type):
            return cls(meta=cls.eager_to_meta(t), data=t)
        else:
            return TypeError(f"{type(t)!r} is not compatible with {cls._tensor_type!r}")


class LazyNumpyTensor(LazyBase):
    _tensor_type = np.ndarray

    @classmethod
    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]:
        # The initial idea was to use np.nan as the fill value,
        # but non-float types like np.int16 can't use that.
        # So zero it is.
        cheat = np.zeros(1, dtype)
        return np.lib.stride_tricks.as_strided(cheat, shape, (0 for _ in shape))

    def astype(self, dtype, *args, **kwargs):
        meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
        full_args = (self, dtype,) + args
        # very important to pass the shared _lazy deque, or else there's an infinite loop somewhere.
        return type(self)(meta=meta, args=full_args, lazy=self._lazy, func=(lambda a: a[0].astype(*a[1:], **kwargs)))

    def tofile(self, *args, **kwargs):
        eager = LazyNumpyTensor.to_eager(self)
        return eager.tofile(*args, **kwargs)

    # TODO: __array_function__

109 gguf-py/gguf/quants.py (new file)
@@ -0,0 +1,109 @@
from __future__ import annotations
from typing import Callable

from numpy.typing import DTypeLike

from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
from .lazy import LazyNumpyTensor

import numpy as np


# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
    n = n.astype(np.float32, copy=False).view(np.int32)
    # force nan to quiet
    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
    # flush subnormals to zero
    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
    # round to nearest even
    n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
    return n.astype(np.int16)


# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
    rows = arr.reshape((-1, arr.shape[-1]))
    osize = 1
    for dim in oshape:
        osize *= dim
    out = np.empty(shape=osize, dtype=otype)
    # compute over groups of 16 rows (arbitrary, but seems good for performance)
    n_groups = rows.shape[0] // 16
    np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
    return out.reshape(oshape)


def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
    return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.int16, oshape=n.shape)


__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.int16)


def quantize_bf16(n: np.ndarray):
    if type(n) is LazyNumpyTensor:
        return __quantize_bf16_lazy(n)
    else:
        return __quantize_bf16_array(n)


__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]


def can_quantize_to_q8_0(n: np.ndarray) -> bool:
    return n.shape[-1] % __q8_block_size == 0


# round away from zero
# ref: https://stackoverflow.com/a/59143326/22827863
def np_roundf(n: np.ndarray) -> np.ndarray:
    a = abs(n)
    floored = np.floor(a)
    b = floored + np.floor(2 * (a - floored))
    return np.sign(n) * b


def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
    return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)


# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
    shape = n.shape
    assert shape[-1] % __q8_block_size == 0

    n_blocks = n.size // __q8_block_size

    blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)

    d = abs(blocks).max(axis=1, keepdims=True) / 127
    with np.errstate(divide="ignore"):
        id = np.where(d == 0, 0, 1 / d)
    qs = np_roundf(blocks * id)

    # (n_blocks, 2)
    d = d.astype(np.float16).view(np.uint8)
    # (n_blocks, block_size)
    qs = qs.astype(np.int8).view(np.uint8)

    assert d.shape[1] + qs.shape[1] == __q8_type_size

    return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))


def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
    return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))


__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
    __quantize_q8_0_array,
    meta_noop=(np.uint8, __quantize_q8_0_shape_change),
)


def quantize_q8_0(data: np.ndarray):
    if type(data) is LazyNumpyTensor:
        return __quantize_q8_0_lazy(data)
    else:
        return __quantize_q8_0_array(data)
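A hedged usage sketch of how the two new modules above fit together: quants.py provides the numpy quantizers, and when they are handed a LazyNumpyTensor nothing is computed until the data is actually written. The array shape, contents and output filename below are invented for illustration.

import numpy as np
from gguf.lazy import LazyNumpyTensor
from gguf.quants import can_quantize_to_q8_0, quantize_bf16, quantize_q8_0

weights = np.random.rand(32, 64).astype(np.float32)   # made-up tensor

# eager path: plain ndarrays go straight through the numpy implementations
assert can_quantize_to_q8_0(weights)   # last dim must be a multiple of 32
q8   = quantize_q8_0(weights)          # uint8, shape (32, 64 // 32 * 34) = (32, 68)
bf16 = quantize_bf16(weights)          # int16 carrying the bf16 bit pattern

# lazy path: the quantization is recorded, not executed
lazy    = LazyNumpyTensor.from_eager(weights)
lazy_q8 = quantize_q8_0(lazy)          # still a LazyNumpyTensor
with open("q8.bin", "wb") as f:        # tofile() forces evaluation
    lazy_q8.tofile(f)

The lazy path is what lets the conversion scripts build a whole graph of dtype casts and quantizations over a model without holding every converted tensor in memory at once; evaluation happens tensor by tensor when the writer serializes them.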
@@ -137,6 +137,7 @@ class TensorNameMap:
             "layers.{bid}.attention.wk",                    # llama-pth
             "encoder.layer.{bid}.attention.self.key",       # bert
             "transformer.h.{bid}.attn.k_proj",              # gpt-j
+            "transformer.h.{bid}.attn.k",                   # refact
             "model.layers.layers.{bid}.self_attn.k_proj",   # plamo
             "model.layers.{bid}.attention.wk",              # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok

@@ -148,6 +149,7 @@ class TensorNameMap:
             "layers.{bid}.attention.wv",                    # llama-pth
             "encoder.layer.{bid}.attention.self.value",     # bert
             "transformer.h.{bid}.attn.v_proj",              # gpt-j
+            "transformer.h.{bid}.attn.v",                   # refact
             "model.layers.layers.{bid}.self_attn.v_proj",   # plamo
             "model.layers.{bid}.attention.wv",              # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok

@@ -229,6 +231,7 @@ class TensorNameMap:
             "layers.{bid}.feed_forward.w3",                          # llama-pth
             "encoder.layer.{bid}.intermediate.dense",                # bert
             "transformer.h.{bid}.mlp.fc_in",                         # gpt-j
+            "transformer.h.{bid}.mlp.linear_3",                      # refact
             "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
             "model.layers.{bid}.mlp.dense_h_to_4h",                  # persimmon
             "transformer.h.{bid}.mlp.w1",                            # qwen

@@ -240,6 +243,7 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.w3",           # internlm2
             "encoder.layers.{bid}.mlp.fc11",                # nomic-bert
             "model.layers.{bid}.mlp.c_fc",                  # starcoder2
+            "encoder.layer.{bid}.mlp.gated_layers_v",       # jina-bert-v2
         ),

         MODEL_TENSOR.FFN_UP_EXP: (

@@ -266,6 +270,8 @@ class TensorNameMap:
             "model.layers.layers.{bid}.mlp.gate_proj",      # plamo
             "model.layers.{bid}.feed_forward.w1",           # internlm2
             "encoder.layers.{bid}.mlp.fc12",                # nomic-bert
+            "encoder.layer.{bid}.mlp.gated_layers_w",       # jina-bert-v2
+            "transformer.h.{bid}.mlp.linear_1",             # refact
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (

@@ -299,6 +305,7 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.w2",           # internlm2
             "encoder.layers.{bid}.mlp.fc2",                 # nomic-bert
             "model.layers.{bid}.mlp.c_proj",                # starcoder2
+            "encoder.layer.{bid}.mlp.wo",                   # jina-bert-v2
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (

@@ -317,6 +324,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.q_layernorm",     # persimmon
             "model.layers.{bid}.self_attn.q_norm",          # cohere
             "transformer.blocks.{bid}.attn.q_ln",           # sea-lion
+            "encoder.layer.{bid}.attention.self.layer_norm_q" # jina-bert-v2
         ),

         MODEL_TENSOR.ATTN_K_NORM: (

@@ -324,6 +332,7 @@ class TensorNameMap:
             "model.layers.{bid}.self_attn.k_layernorm",     # persimmon
             "model.layers.{bid}.self_attn.k_norm",          # cohere
             "transformer.blocks.{bid}.attn.k_ln",           # sea-lion
+            "encoder.layer.{bid}.attention.self.layer_norm_k" # jina-bert-v2
         ),

         MODEL_TENSOR.ROPE_FREQS: (

@@ -334,6 +343,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.output.LayerNorm",         # bert
             "encoder.layers.{bid}.norm2",                   # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3",   # Grok
+            "encoder.layer.{bid}.mlp.layernorm",            # jina-bert-v2
         ),

         MODEL_TENSOR.SSM_IN: (
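A hedged sketch of how these new mapping entries are used during conversion; get_tensor_name_map and TensorNameMap.get_name already exist in gguf-py, while the source tensor name and block count are illustrative.

from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

tensor_map = get_tensor_name_map(MODEL_ARCH.JINA_BERT_V2, 4)  # 4 blocks, made up
hf_name = "encoder.layer.0.mlp.wo.weight"                     # matches the new FFN_DOWN entry
print(tensor_map.get_name(hf_name, try_suffixes=(".weight", ".bias")))
# expected: "blk.0.ffn_down" (the caller re-attaches the ".weight" suffix)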
3 llama.h
@@ -242,6 +242,9 @@ extern "C" {
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;

+        // comma separated list of RPC servers to use for offloading
+        const char * rpc_servers;
+
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
         // If it returns false, model loading is immediately aborted.
@@ -104,3 +104,5 @@ __ggml_vocab_test__
 🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
 __ggml_vocab_test__
+Việt
+__ggml_vocab_test__

@@ -41,3 +41,4 @@
 8765 8765 1644
 8765 8765 8765
 198 4815 15073 66597 8004 1602 2355 79772 11187 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 11410 99 247 9468 99 247 220 18 220 1644 220 8765 220 8765 18 220 8765 1644 220 8765 8765 220 8765 8765 18 220 8765 8765 1644 220 18 13 18 220 18 497 18 220 18 1131 18 220 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 76460 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 56560 54337 19175 102118 13373 64571 34694 3114 112203 80112 3436 106451 14196 14196 74694 3089 3089 29249 17523 3001 27708 7801 358 3077 1027 364 83 820 568 596 1070 11 364 793 499 2771 30 364 44 539 2771 358 3358 1304 433 11 364 35 499 1093 1063 15600 30 1226 6 43712 264 64966 43
+101798
@@ -9,5 +9,4 @@
 -r ./requirements/requirements-convert-hf-to-gguf.txt
 -r ./requirements/requirements-convert-hf-to-gguf-update.txt
 -r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
--r ./requirements/requirements-convert-lora-to-ggml.txt
 -r ./requirements/requirements-convert-persimmon-to-gguf.txt

@@ -1,2 +0,0 @@
--r ./requirements-convert.txt
-torch~=2.1.1
@@ -325,8 +325,12 @@ table = []
 for row in rows_show:
     n_prompt = int(row[-4])
     n_gen    = int(row[-3])
-    assert n_prompt == 0 or n_gen == 0
-    test_name = f"tg{n_gen}" if n_prompt == 0 else f"pp{n_prompt}"
+    if n_prompt != 0 and n_gen == 0:
+        test_name = f"pp{n_prompt}"
+    elif n_prompt == 0 and n_gen != 0:
+        test_name = f"tg{n_gen}"
+    else:
+        test_name = f"pp{n_prompt}+tg{n_gen}"
     #           Regular columns    test name    avg t/s values    Speedup
     #           VVVVVVVVVVVVV      VVVVVVVVV    VVVVVVVVVVVVVV    VVVVVVV
     table.append(list(row[:-4]) + [test_name] + list(row[-2:]) + [float(row[-1]) / float(row[-2])])
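A tiny illustration of the new naming rule above, with invented row values; mixed prompt-plus-generation benchmark rows no longer trip an assertion and instead get a combined label.

def test_name(n_prompt: int, n_gen: int) -> str:
    if n_prompt != 0 and n_gen == 0:
        return f"pp{n_prompt}"
    elif n_prompt == 0 and n_gen != 0:
        return f"tg{n_gen}"
    else:
        return f"pp{n_prompt}+tg{n_gen}"

assert test_name(512, 0)   == "pp512"
assert test_name(0, 128)   == "tg128"
assert test_name(512, 128) == "pp512+tg128"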
117 scripts/debug-test.sh (new executable file)
@@ -0,0 +1,117 @@
#!/bin/bash
test_suite=${1:-}
test_number=${2:-}

PROG=${0##*/}
build_dir="build-ci-debug"

if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
    echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
    echo "Debug specific ctest program."
    echo
    echo "Options:"
    echo "  -h, --help       Display this help and exit"
    echo
    echo "Arguments:"
    echo "  <test_regex>     (Mandatory) Supply one regex to the script to filter tests"
    echo "  (test_number)    (Optional) Test number to run a specific test"
    echo
    echo "Example:"
    echo "  $PROG test-tokenizer"
    echo "  $PROG test-tokenizer 3"
    echo
    exit 0
fi

# Function to select and debug a test
function select_test() {
    test_suite=${1:-test}
    test_number=${2:-}

    # Sanity Check If Tests Is Detected
    printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
    tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
    if [ ${#tests[@]} -eq 0 ]
    then
        echo "No tests avaliable... check your compliation process..."
        echo "Exiting."
        exit 1
    fi

    if [ -z $test_number ]
    then
        # List out avaliable tests
        printf "Which test would you like to debug?\n"
        id=0
        for s in "${tests[@]}"
        do
            echo "Test# ${id}"
            echo "  $s"
            ((id++))
        done

        # Prompt user which test they wanted to run
        printf "\nRun test#? "
        read test_number
    else
        printf "\nUser Already Requested #${test_number}"
    fi

    # Start GDB with the requested test binary and arguments
    printf "Debugging(GDB) test: ${tests[test_number]}\n"
    # Change IFS (Internal Field Separator)
    sIFS=$IFS
    IFS=$'\n'

    # Get test args
    gdb_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' ))
    IFS=$sIFS
    printf "Debug arguments: ${gdb_args[test_number]}\n\n"

    # Expand paths if needed
    args=()
    for x in $(echo ${gdb_args[test_number]} | sed -e 's/"\/\<//' -e 's/\>"//')
    do
        args+=($(echo $x | sed -e 's/.*\/..\//..\//'))
    done

    # Execute debugger
    echo "gdb args: ${args[@]}"
    gdb --args ${args[@]}
}

# Step 0: Check the args
if [ -z "$test_suite" ]
then
    echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
    echo "Supply one regex to the script to filter tests,"
    echo "and optionally a test number to run a specific test."
    echo "Use --help flag for full instructions"
    exit 1
fi

# Step 1: Reset and Setup folder context
## Sanity check that we are actually in a git repo
repo_root=$(git rev-parse --show-toplevel)
if [ ! -d "$repo_root" ]; then
    echo "Error: Not in a Git repository."
    exit 1
fi

## Reset folder to root context of git repo
pushd "$repo_root" || exit 1

## Create and enter build directory
rm -rf "$build_dir" && mkdir "$build_dir" || exit 1

# Step 2: Setup Build Environment and Compile Test Binaries
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON || exit 1
pushd "$build_dir" && make -j || exit 1

# Step 3: Debug the Test
select_test "$test_suite" "$test_number"

# Step 4: Return to the directory from which the user ran the command.
popd || exit 1
popd || exit 1
popd || exit 1
@@ -112,6 +112,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml-opencl.h     -> ggml-opencl.h
     # src/ggml-quants.c     -> ggml-quants.c
     # src/ggml-quants.h     -> ggml-quants.h
+    # src/ggml-rpc.cpp      -> ggml-rpc.cpp
+    # src/ggml-rpc.h        -> ggml-rpc.h
     # src/ggml-sycl.cpp     -> ggml-sycl.cpp
     # src/ggml-sycl.h       -> ggml-sycl.h
     # src/ggml-vulkan.cpp   -> ggml-vulkan.cpp

@@ -149,6 +151,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
         -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
         -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
+        -e 's/src\/ggml-rpc\.cpp/ggml-rpc.cpp/g' \
+        -e 's/src\/ggml-rpc\.h/ggml-rpc.h/g' \
         -e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
         -e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
         -e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \

@@ -1 +1 @@
-98875cdb7e9ceeb726d1c196d2fecb3cbb59b93a
+126d34985705a5a2222723c145cb4e125ac689f3

@@ -20,6 +20,8 @@ cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
 cp -rpv ../ggml/src/ggml-opencl.h   ./ggml-opencl.h
 cp -rpv ../ggml/src/ggml-quants.c   ./ggml-quants.c
 cp -rpv ../ggml/src/ggml-quants.h   ./ggml-quants.h
+cp -rpv ../ggml/src/ggml-rpc.cpp    ./ggml-rpc.cpp
+cp -rpv ../ggml/src/ggml-rpc.h      ./ggml-rpc.h
 cp -rpv ../ggml/src/ggml-sycl.cpp   ./ggml-sycl.cpp
 cp -rpv ../ggml/src/ggml-sycl.h     ./ggml-sycl.h
 cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml-vulkan.cpp
@@ -92,7 +92,7 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
 install(TARGETS test-tokenizer-1-bpe RUNTIME)

 # TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
@ -2,6 +2,7 @@
|
||||||
#include <ggml-alloc.h>
|
#include <ggml-alloc.h>
|
||||||
#include <ggml-backend.h>
|
#include <ggml-backend.h>
|
||||||
#include <ggml-backend-impl.h>
|
#include <ggml-backend-impl.h>
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <cfloat>
|
#include <cfloat>
|
||||||
|
@ -1111,11 +1112,7 @@ struct test_soft_max : public test_case {
|
||||||
if (this->mask) {
|
if (this->mask) {
|
||||||
mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
|
mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
|
||||||
}
|
}
|
||||||
ggml_tensor * pos = nullptr;
|
ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
|
||||||
if (max_bias > 0.0f) {
|
|
||||||
pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]);
|
|
||||||
}
|
|
||||||
ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, pos, scale, max_bias);
|
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -1332,23 +1329,47 @@ struct test_upscale : public test_case {
|
||||||
const ggml_type type;
|
const ggml_type type;
|
||||||
const std::array<int64_t, 4> ne;
|
const std::array<int64_t, 4> ne;
|
||||||
const int32_t scale_factor;
|
const int32_t scale_factor;
|
||||||
|
const bool transpose;
|
||||||
|
|
||||||
std::string vars() override {
|
std::string vars() override {
|
||||||
return VARS_TO_STR3(type, ne, scale_factor);
|
return VARS_TO_STR4(type, ne, scale_factor, transpose);
|
||||||
}
|
}
|
||||||
|
|
||||||
test_upscale(ggml_type type = GGML_TYPE_F32,
|
test_upscale(ggml_type type = GGML_TYPE_F32,
|
||||||
std::array<int64_t, 4> ne = {512, 512, 3, 1},
|
std::array<int64_t, 4> ne = {512, 512, 3, 1},
|
||||||
int32_t scale_factor = 2)
|
int32_t scale_factor = 2, bool transpose = false)
|
||||||
: type(type), ne(ne), scale_factor(scale_factor) {}
|
: type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}
|
||||||
|
|
||||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||||
|
if (transpose) a = ggml_transpose(ctx, a);
|
||||||
ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
|
ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// GGML_OP_UPSCALE (ext)
|
||||||
|
struct test_upscale_ext : public test_case {
|
||||||
|
const ggml_type type;
|
||||||
|
const std::array<int64_t, 4> ne;
|
||||||
|
const std::array<int64_t, 4> ne_tgt;
|
||||||
|
|
||||||
|
std::string vars() override {
|
||||||
|
return VARS_TO_STR3(type, ne, ne_tgt);
|
||||||
|
}
|
||||||
|
|
||||||
|
test_upscale_ext(ggml_type type = GGML_TYPE_F32,
|
||||||
|
std::array<int64_t, 4> ne = {2, 5, 7, 11},
|
||||||
|
std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
|
||||||
|
: type(type), ne(ne), ne_tgt(ne_tgt) {}
|
||||||
|
|
||||||
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||||
|
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||||
|
ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// GGML_OP_GROUP_NORM
|
// GGML_OP_GROUP_NORM
|
||||||
struct test_group_norm : public test_case {
|
struct test_group_norm : public test_case {
|
||||||
const ggml_type type;
|
const ggml_type type;
|
||||||
|
@ -1490,23 +1511,27 @@ struct test_flash_attn_ext : public test_case {
|
||||||
const int64_t kv; // kv size
|
const int64_t kv; // kv size
|
||||||
const int64_t nb; // batch size
|
const int64_t nb; // batch size
|
||||||
|
|
||||||
|
const bool mask; // use mask
|
||||||
|
|
||||||
|
const float max_bias; // ALiBi
|
||||||
|
|
||||||
std::string vars() override {
|
std::string vars() override {
|
||||||
return VARS_TO_STR4(hs, nh, kv, nb);
|
return VARS_TO_STR6(hs, nh, kv, nb, mask, max_bias);
|
||||||
}
|
}
|
||||||
|
|
||||||
double max_nmse_err() override {
|
double max_nmse_err() override {
|
||||||
return 5e-4;
|
return 5e-4;
|
||||||
}
|
}
|
||||||
|
|
||||||
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
|
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f)
|
||||||
: hs(hs), nh(nh), kv(kv), nb(nb) {}
|
: hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias) {}
|
||||||
|
|
||||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||||
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
|
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
|
||||||
ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
||||||
ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
||||||
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
|
ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
|
||||||
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs));
|
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias);
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -1611,7 +1636,7 @@ public:

        struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);

-        kq = ggml_soft_max_ext(ctx, kq, kq_mask, nullptr, kq_scale, 0.0f);
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f);

        // split cached v into n_head heads
        struct ggml_tensor * v =

@ -2128,6 +2153,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
#endif
    for (bool mask : {false, true}) {
        for (float max_bias : {0.0f, 8.0f}) {
+            if (!mask && max_bias > 0.0f) continue;
            for (float scale : {1.0f, 0.1f}) {
                for (int64_t ne0 : {16, 1024}) {
                    for (int64_t ne1 : {16, 1024}) {

@ -2141,7 +2167,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op

    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 8.0f));
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));

    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {

@ -2168,6 +2193,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op

    test_cases.emplace_back(new test_sum_rows());
    test_cases.emplace_back(new test_upscale());
+    test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
+    test_cases.emplace_back(new test_upscale_ext());
    test_cases.emplace_back(new test_group_norm());
    test_cases.emplace_back(new test_acc());
    test_cases.emplace_back(new test_pad());

@ -2175,15 +2202,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_timestep_embedding());
    test_cases.emplace_back(new test_leaky_relu());

-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    for (int hs : { 64, 128, }) { // other head sizes not implemented
-#else
    for (int hs : { 64, 80, 128, 256, }) {
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        for (bool mask : { true, false } ) {
+            for (float max_bias : { 0.0f, 8.0f }) {
+                if (!mask && max_bias > 0.0f) continue;
                for (int nh : { 32, }) {
                    for (int kv : { 512, 1024, }) {
                        for (int nb : { 1, 2, 4, 8, }) {
-                            test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb));
+                            test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias));
+                        }
+                    }
                }
            }
        }
    }
@ -28,6 +28,19 @@ static llama_grammar* build_grammar(const std::string & grammar_str) {
    return grammar;
}

+static bool test_build_grammar_fails(const std::string & grammar_str) {
+    fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
+    bool grammar_fails = false;
+    try {
+        build_grammar(grammar_str);
+        fprintf(stderr, "  ❌ Expected build failure, but succeeded\n");
+    } catch (const std::exception & err) {
+        grammar_fails = true;
+        fprintf(stdout, "  ✅︎\n");
+    }
+    return grammar_fails;
+}
+
static bool match_string(const std::string & input, llama_grammar* grammar) {
    auto decoded = decode_utf8(input, {});

@ -320,6 +333,38 @@ number ::= [0-9]+)""";
    fprintf(stderr, "  ✅︎ Passed\n");
}

+static void test_failure_left_recursion() {
+    fprintf(stderr, "⚫ Testing left recursion detection:\n");
+
+    // Test simple left recursion detection
+    const std::string simple_str = R"""(root ::= "a" | root "a")""";
+    assert(test_build_grammar_fails(simple_str));
+
+    // Test more complicated left recursion detection
+    const std::string medium_str = R"""(
+        root ::= asdf
+        asdf ::= "a" | asdf "a"
+        )""";
+    assert(test_build_grammar_fails(medium_str));
+
+    // Test even more complicated left recursion detection
+    const std::string hard_str = R"""(
+        root ::= asdf
+        asdf ::= "a" | foo "b"
+        foo ::= "c" | asdf "d" | "e")""";
+    assert(test_build_grammar_fails(hard_str));
+
+    // Test yet even more complicated left recursion detection
+    const std::string hardest_str = R"""(
+        root ::= asdf
+        asdf ::= "a" | foo "b"
+        foo ::= "c" | empty asdf "d" | "e"
+        empty ::= "blah" | )""";
+    assert(test_build_grammar_fails(hardest_str));
+
+    fprintf(stderr, "  ✅︎ Passed\n");
+}
+
int main() {
    fprintf(stdout, "Running grammar integration tests...\n");
    test_simple_grammar();

@ -327,6 +372,7 @@ int main() {
    test_quantifiers();
    test_failure_missing_root();
    test_failure_missing_reference();
+    test_failure_left_recursion();
    fprintf(stdout, "All tests passed.\n");
    return 0;
}
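Left-recursive rules such as `root ::= "a" | root "a"` can send the grammar matcher into unbounded recursion without consuming any input, which is why building them now throws. The same language can always be written without left recursion; a small fragment for comparison (illustrative only, reusing this test file's `build_grammar` and `match_string` helpers, so not a standalone program):

// "one or more a" via a repetition quantifier - no recursion at all
const std::string repetition_str = R"""(root ::= "a"+)""";
// the equivalent right-recursive form, which the builder also accepts
const std::string right_rec_str  = R"""(root ::= "a" root | "a")""";

llama_grammar * rep = build_grammar(repetition_str);
llama_grammar * rec = build_grammar(right_rec_str);
assert(match_string("aaa", rep));
assert(match_string("aaa", rec));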
@ -13,15 +13,27 @@
#include <vector>

int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+    if (argc < 2 || argc > 3) {
+        fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
        return 1;
    }

    const std::string fname = argv[1];
+    bool ignore_merges = false;
+    if (argc == 3) {
+        if (std::strcmp(argv[2], "--ignore-merges") != 0) {
+            fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
+            return 1;
+        }
+        ignore_merges = true;
+    }

    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

+    if (ignore_merges) {
+        fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__);
+    }
+
    llama_model * model;
    llama_context * ctx;

@ -65,7 +77,19 @@ int main(int argc, char **argv) {
        std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
        try {
            auto cps = unicode_cpts_from_utf8(str);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+            if (ignore_merges && tokens.size() > 1) {
+                fprintf(stderr,
+                    "%s : error: token %d detokenizes to '%s'(%zu) but "
+                    "tokenization of this to multiple tokens: [",
+                    __func__, i, str.c_str(), str.length());
+                fprintf(stderr, "%d", tokens[0]);
+                for (size_t i = 1; i < tokens.size(); i++) {
+                    fprintf(stderr, ", %d", tokens[i]);
+                }
+                fprintf(stderr, "]\n");
+                return 2;
+            }
            std::string check = llama_detokenize_bpe(ctx, tokens);
            if (check != str) {
                fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
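With `--ignore-merges` the test additionally requires that every vocabulary entry re-tokenizes to exactly one id (the extra `true` passed to `llama_tokenize` appears to opt in to special-token parsing in the common helper). A condensed restatement of that per-token round trip as a helper (illustrative sketch only; it assumes the same common.h helpers and an already-loaded `ctx`, and the function name is invented):

// Not part of the commit: a compressed view of the loop above.
static int check_vocab_roundtrip(llama_context * ctx, int n_vocab, bool ignore_merges) {
    for (int i = 0; i < n_vocab; i++) {
        const std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
        const std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
        if (ignore_merges && tokens.size() > 1) {
            return 2; // a vocab entry re-tokenized into several pieces
        }
        if (llama_detokenize_bpe(ctx, tokens) != str) {
            return 2; // detokenize(tokenize(str)) != str
        }
    }
    return 0;
}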