Merge branch 'gg/flash-attn' of https://github.com/ggerganov/llama.cpp into flash-attn-cuda
This commit is contained in:
commit 3df0b8d47c
85 changed files with 8185 additions and 2081 deletions
1 .ecrc

@@ -1,4 +1,5 @@
{
    "Exclude": ["^\\.gitmodules$"],
    "Disable": {
        "IndentSize": true
    }
46 .github/workflows/build.yml (vendored)
@@ -337,6 +337,7 @@ jobs:
      OPENCL_VERSION: 2023.04.17
      CLBLAST_VERSION: 1.6.0
      SDE_VERSION: 9.33.0-2024-01-07
      VULKAN_VERSION: 1.3.261.1

    strategy:
      matrix:
@@ -353,6 +354,8 @@ jobs:
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
          - build: 'openblas'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'

    steps:
      - name: Clone
@@ -361,6 +364,12 @@ jobs:
        with:
          fetch-depth: 0

      - name: Clone Kompute submodule
        id: clone_kompute
        if: ${{ matrix.build == 'kompute' }}
        run: |
          git submodule update --init kompute

      - name: Download OpenCL SDK
        id: get_opencl
        if: ${{ matrix.build == 'clblast' }}
@@ -395,6 +404,15 @@ jobs:
          $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll

      - name: Install Vulkan SDK
        id: get_vulkan
        if: ${{ matrix.build == 'kompute' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

      - name: Build
        id: cmake_build
        run: |
@@ -432,7 +450,8 @@ jobs:

      - name: Test
        id: cmake_test
        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
        # not all machines have native AVX-512
        if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900
@@ -546,6 +565,31 @@ jobs:
          path: |
            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip

  windows-latest-cmake-sycl:
    runs-on: windows-latest
    defaults:
      run:
        shell: bash

    env:
      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel


    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - name: Install
        run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL

      - name: Build
        id: cmake_build
        run: examples/sycl/win-build-sycl.bat

  ios-xcode-build:
    runs-on: macos-latest
6 .github/workflows/editorconfig.yml (vendored)
@@ -1,6 +1,12 @@
name: EditorConfig Checker

on:
  workflow_dispatch: # allows manual triggering
    inputs:
      create_release:
        description: 'Create new release'
        required: true
        type: boolean
  push:
    branches:
      - master
1 .gitignore (vendored)
@@ -89,3 +89,4 @@ examples/jeopardy/results.txt

poetry.lock
poetry.toml
nppBackup
3 .gitmodules (vendored, new file)
@@ -0,0 +1,3 @@
[submodule "kompute"]
    path = kompute
    url = https://github.com/nomic-ai/kompute.git
172 CMakeLists.txt
@@ -103,6 +103,7 @@ option(LLAMA_VULKAN "llama: use Vulkan"
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
@ -422,7 +423,13 @@ if (LLAMA_VULKAN)
|
|||
if (Vulkan_FOUND)
|
||||
message(STATUS "Vulkan found")
|
||||
|
||||
set(GGML_HEADERS_VULKAN ggml-vulkan.h)
|
||||
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
|
||||
|
||||
add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ggml-vulkan PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
|
||||
|
||||
add_compile_definitions(GGML_USE_VULKAN)
|
||||
|
@ -478,7 +485,6 @@ if (LLAMA_HIPBLAS)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
if (LLAMA_SYCL)
|
||||
if ( NOT DEFINED ENV{ONEAPI_ROOT})
|
||||
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
|
||||
|
@ -501,7 +507,165 @@ if (LLAMA_SYCL)
|
|||
set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
|
||||
set(GGML_SOURCES_SYCL ggml-sycl.cpp)
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
if (WIN32)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
|
||||
else()
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_KOMPUTE)
|
||||
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
|
||||
find_package(Vulkan COMPONENTS glslc REQUIRED)
|
||||
find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
|
||||
if (NOT glslc_executable)
|
||||
message(FATAL_ERROR "glslc not found")
|
||||
endif()
|
||||
|
||||
function(compile_shader)
|
||||
set(options)
|
||||
set(oneValueArgs)
|
||||
set(multiValueArgs SOURCES)
|
||||
cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
foreach(source ${compile_shader_SOURCES})
|
||||
get_filename_component(filename ${source} NAME)
|
||||
set(spv_file ${filename}.spv)
|
||||
add_custom_command(
|
||||
OUTPUT ${spv_file}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
|
||||
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
COMMENT "Compiling ${source} to ${spv_file}"
|
||||
)
|
||||
|
||||
get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
|
||||
set(FILE_NAME "shader${RAW_FILE_NAME}")
|
||||
string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
|
||||
string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
|
||||
string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
|
||||
set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
|
||||
message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
|
||||
if(CMAKE_GENERATOR MATCHES "Visual Studio")
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
|
||||
)
|
||||
else()
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
|
||||
)
|
||||
endif()
|
||||
endforeach()
|
||||
endfunction()
|
||||
|
||||
if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
|
||||
message(STATUS "Kompute found")
|
||||
set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
|
||||
add_subdirectory(kompute)
|
||||
|
||||
# Compile our shaders
|
||||
compile_shader(SOURCES
|
||||
kompute-shaders/op_scale.comp
|
||||
kompute-shaders/op_scale_8.comp
|
||||
kompute-shaders/op_add.comp
|
||||
kompute-shaders/op_addrow.comp
|
||||
kompute-shaders/op_mul.comp
|
||||
kompute-shaders/op_silu.comp
|
||||
kompute-shaders/op_relu.comp
|
||||
kompute-shaders/op_gelu.comp
|
||||
kompute-shaders/op_softmax.comp
|
||||
kompute-shaders/op_norm.comp
|
||||
kompute-shaders/op_rmsnorm.comp
|
||||
kompute-shaders/op_diagmask.comp
|
||||
kompute-shaders/op_mul_mat_mat_f32.comp
|
||||
kompute-shaders/op_mul_mat_f16.comp
|
||||
kompute-shaders/op_mul_mat_q8_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_1.comp
|
||||
kompute-shaders/op_mul_mat_q6_k.comp
|
||||
kompute-shaders/op_getrows_f16.comp
|
||||
kompute-shaders/op_getrows_q4_0.comp
|
||||
kompute-shaders/op_getrows_q4_1.comp
|
||||
kompute-shaders/op_getrows_q6_k.comp
|
||||
kompute-shaders/op_rope_f16.comp
|
||||
kompute-shaders/op_rope_f32.comp
|
||||
kompute-shaders/op_cpy_f16_f16.comp
|
||||
kompute-shaders/op_cpy_f16_f32.comp
|
||||
kompute-shaders/op_cpy_f32_f16.comp
|
||||
kompute-shaders/op_cpy_f32_f32.comp
|
||||
)
|
||||
|
||||
# Create a custom target for our generated shaders
|
||||
add_custom_target(generated_shaders DEPENDS
|
||||
shaderop_scale.h
|
||||
shaderop_scale_8.h
|
||||
shaderop_add.h
|
||||
shaderop_addrow.h
|
||||
shaderop_mul.h
|
||||
shaderop_silu.h
|
||||
shaderop_relu.h
|
||||
shaderop_gelu.h
|
||||
shaderop_softmax.h
|
||||
shaderop_norm.h
|
||||
shaderop_rmsnorm.h
|
||||
shaderop_diagmask.h
|
||||
shaderop_mul_mat_mat_f32.h
|
||||
shaderop_mul_mat_f16.h
|
||||
shaderop_mul_mat_q8_0.h
|
||||
shaderop_mul_mat_q4_0.h
|
||||
shaderop_mul_mat_q4_1.h
|
||||
shaderop_mul_mat_q6_k.h
|
||||
shaderop_getrows_f16.h
|
||||
shaderop_getrows_q4_0.h
|
||||
shaderop_getrows_q4_1.h
|
||||
shaderop_getrows_q6_k.h
|
||||
shaderop_rope_f16.h
|
||||
shaderop_rope_f32.h
|
||||
shaderop_cpy_f16_f16.h
|
||||
shaderop_cpy_f16_f32.h
|
||||
shaderop_cpy_f32_f16.h
|
||||
shaderop_cpy_f32_f32.h
|
||||
)
|
||||
|
||||
# Create a custom command that depends on the generated_shaders
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
DEPENDS generated_shaders
|
||||
COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
|
||||
)
|
||||
|
||||
# Add the stamp to the main sources to ensure dependency tracking
|
||||
set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
|
||||
set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
|
||||
add_compile_definitions(GGML_USE_KOMPUTE)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
|
||||
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
|
||||
else()
|
||||
message(WARNING "Kompute not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
function(get_flags CCID CCVER)
|
||||
|
@ -848,10 +1012,12 @@ add_library(ggml OBJECT
|
|||
ggml-quants.h
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
|
||||
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
|
||||
)
|
||||
|
||||
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
||||
|
@ -928,7 +1094,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
|||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
|
||||
|
||||
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" "${GGML_HEADERS_VULKAN}"
|
||||
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
|
|
|
@ -8,10 +8,14 @@
|
|||
|
||||
[Linux](#linux)
|
||||
|
||||
[Windows](#windows)
|
||||
|
||||
[Environment Variable](#environment-variable)
|
||||
|
||||
[Known Issue](#known-issue)
|
||||
|
||||
[Q&A](#q&a)
|
||||
|
||||
[Todo](#todo)
|
||||
|
||||
## Background
|
||||
|
@ -33,7 +37,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|
|||
|OS|Status|Verified|
|
||||
|-|-|-|
|
||||
|Linux|Support|Ubuntu 22.04|
|
||||
|Windows|Ongoing| |
|
||||
|Windows|Support|Windows 11|
|
||||
|
||||
|
||||
## Intel GPU
|
||||
|
@ -42,7 +46,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|
|||
|-|-|-|
|
||||
|Intel Data Center Max Series| Support| Max 1550|
|
||||
|Intel Data Center Flex Series| Support| Flex 170|
|
||||
|Intel Arc Series| Support| Arc 770|
|
||||
|Intel Arc Series| Support| Arc 770, 730M|
|
||||
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|
||||
|
||||
|
@ -131,6 +135,7 @@ cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
|||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
cd ..
|
||||
```
|
||||
|
||||
or
|
||||
|
@ -195,7 +200,7 @@ GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building
|
|||
or run by script:
|
||||
|
||||
```
|
||||
./examples/sycl/run_llama2.sh
|
||||
./examples/sycl/run-llama2.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
@ -205,11 +210,175 @@ Note:
|
|||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
```
|
||||
|
||||
## Windows
|
||||
|
||||
### Setup Environment
|
||||
|
||||
1. Install Intel GPU driver.
|
||||
|
||||
Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
|
||||
|
||||
2. Install Intel® oneAPI Base toolkit.
|
||||
|
||||
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||
|
||||
Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.
|
||||
|
||||
b. Enable oneAPI running environment:
|
||||
|
||||
- In Search, input 'oneAPI'.
|
||||
|
||||
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
|
||||
|
||||
- In Run:
|
||||
|
||||
In CMD:
|
||||
```
|
||||
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
|
||||
```
|
||||
|
||||
c. Check GPU
|
||||
|
||||
In oneAPI command line:
|
||||
|
||||
```
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
|
||||
|
||||
```
|
||||
|
||||
3. Install cmake & make
|
||||
|
||||
a. Download & install cmake for windows: https://cmake.org/download/
|
||||
|
||||
b. Download & install make for windows provided by mingw-w64: https://www.mingw-w64.org/downloads/
|
||||
|
||||
|
||||
### Build locally:
|
||||
|
||||
In oneAPI command line window:
|
||||
|
||||
```
|
||||
mkdir -p build
|
||||
cd build
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
:: for FP16
|
||||
:: faster for long-prompt inference
|
||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
|
||||
:: for FP32
|
||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
|
||||
:: build example/main only
|
||||
:: make main
|
||||
|
||||
:: build all binary
|
||||
make -j
|
||||
cd ..
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
.\examples\sycl\win-build-sycl.bat
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
|
||||
|
||||
### Run
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
- In Search, input 'oneAPI'.
|
||||
|
||||
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
|
||||
|
||||
- In Run:
|
||||
|
||||
In CMD:
|
||||
```
|
||||
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
|
||||
```
|
||||
|
||||
3. List device ID
|
||||
|
||||
Run without parameter:
|
||||
|
||||
```
|
||||
build\bin\ls-sycl-device.exe
|
||||
|
||||
or
|
||||
|
||||
build\bin\main.exe
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute llama.cpp
|
||||
|
||||
Set device ID = 0 by **set GGML_SYCL_DEVICE=0**
|
||||
|
||||
```
|
||||
set GGML_SYCL_DEVICE=0
|
||||
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```
|
||||
.\examples\sycl\win-run-llama2.bat
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
```
|
||||
|
||||
## Environment Variable
|
||||
|
||||
|
@ -220,7 +389,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
|||
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|
||||
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
|
||||
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|
||||
|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
|
||||
|CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path|
|
||||
|
||||
#### Running
|
||||
|
||||
|
@ -232,19 +401,24 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
|||
|
||||
## Known Issue
|
||||
|
||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
|
||||
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
|
||||
|
||||
|
||||
- Hang during startup
|
||||
|
||||
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
|
||||
|
||||
Solution: add **--no-mmap**.
|
||||
|
||||
## Q&A
|
||||
|
||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
|
||||
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
|
||||
|
||||
- In Windows, no result, not error.
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
|
||||
## Todo
|
||||
|
||||
- Support to build in Windows.
|
16 README.md
@ -10,7 +10,9 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
|||
|
||||
### Hot topics
|
||||
|
||||
- ⚠️ Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
|
||||
- Remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD: https://github.com/ggerganov/llama.cpp/pull/5240
|
||||
- Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
|
||||
- [SYCL backend](README-sycl.md) is ready (1/28/2024), support Linux/Windows in Intel GPUs (iGPU, Arc/Flex/Max series)
|
||||
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
|
||||
- Collecting Apple Silicon performance stats:
|
||||
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
|
||||
|
@ -605,7 +607,7 @@ Building the program with BLAS support may lead to some performance improvements
|
|||
|
||||
llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
||||
|
||||
For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
|
||||
For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
|
||||
|
||||
|
||||
### Prepare Data & Run
|
||||
|
@ -614,9 +616,9 @@ Building the program with BLAS support may lead to some performance improvements
|
|||
# obtain the original LLaMA model weights and place them in ./models
|
||||
ls ./models
|
||||
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
|
||||
# [Optional] for models using BPE tokenizers
|
||||
ls ./models
|
||||
65B 30B 13B 7B vocab.json
|
||||
# [Optional] for models using BPE tokenizers
|
||||
ls ./models
|
||||
65B 30B 13B 7B vocab.json
|
||||
|
||||
# install Python dependencies
|
||||
python3 -m pip install -r requirements.txt
|
||||
|
@ -624,8 +626,8 @@ python3 -m pip install -r requirements.txt
|
|||
# convert the 7B model to ggml FP16 format
|
||||
python3 convert.py models/7B/
|
||||
|
||||
# [Optional] for models using BPE tokenizers
|
||||
python convert.py models/7B/ --vocabtype bpe
|
||||
# [Optional] for models using BPE tokenizers
|
||||
python convert.py models/7B/ --vocabtype bpe
|
||||
|
||||
# quantize the model to 4-bits (using q4_0 method)
|
||||
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
|
||||
|
|
|
@ -583,20 +583,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
break;
|
||||
}
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
}
|
||||
} else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
||||
#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
if (!llama_supports_gpu_offload()) {
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
}
|
||||
} else if (arg == "--main-gpu" || arg == "-mg") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -637,11 +637,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
const std::regex regex{R"([,/]+)"};
|
||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
if (split_arg.size() >= LLAMA_MAX_DEVICES) {
|
||||
if (split_arg.size() >= llama_max_devices()) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
|
||||
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
||||
if (i < split_arg.size()) {
|
||||
params.tensor_split[i] = std::stof(split_arg[i]);
|
||||
} else {
|
||||
|
@ -989,16 +989,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
|
||||
printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
|
||||
if (llama_mlock_supported()) {
|
||||
if (llama_supports_mlock()) {
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
if (llama_mmap_supported()) {
|
||||
if (llama_supports_mmap()) {
|
||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
||||
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
|
||||
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
if (llama_supports_gpu_offload()) {
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
printf(" -ngld N, --n-gpu-layers-draft N\n");
|
||||
|
@ -1012,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
||||
#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
}
|
||||
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
|
||||
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
|
||||
printf(" -gan N, --grp-attn-n N\n");
|
||||
|
@ -1520,7 +1520,9 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
|||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
||||
|
@ -1649,7 +1651,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
|||
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
||||
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
|
||||
|
||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
|
||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
|
||||
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
|
||||
|
||||
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
||||
|
|
|
@ -63,7 +63,7 @@ struct gpt_params {
|
|||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||
int32_t grp_attn_n = 1; // group-attention factor
|
||||
int32_t grp_attn_w = 512; // group-attention width
|
||||
|
|
|
@ -1363,12 +1363,12 @@ bool consume_common_train_arg(
|
|||
*invalid_param = true;
|
||||
return true;
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
if (llama_supports_gpu_offload()) {
|
||||
params->n_gpu_layers = std::stoi(argv[i]);
|
||||
#else
|
||||
} else {
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
}
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
params->print_usage = true;
|
||||
return true;
|
||||
|
|
|
@ -334,7 +334,10 @@ class Params:
|
|||
class BpeVocab:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
|
||||
try:
|
||||
self.vocab = self.bpe_tokenizer["model"]["vocab"]
|
||||
except KeyError:
|
||||
self.vocab = self.bpe_tokenizer
|
||||
added_tokens: dict[str, int]
|
||||
if fname_added_tokens is not None:
|
||||
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
||||
|
|
|
@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f);
|
||||
const std::vector<float> t_split(llama_max_devices(), 0.0f);
|
||||
|
||||
model_params.n_gpu_layers = n_gpu_layers;
|
||||
model_params.tensor_split = t_split.data();
|
||||
|
|
|
@ -160,7 +160,7 @@ struct cmd_params {
|
|||
std::vector<int> main_gpu;
|
||||
std::vector<bool> no_kv_offload;
|
||||
std::vector<bool> mul_mat_q;
|
||||
std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
|
||||
std::vector<std::vector<float>> tensor_split;
|
||||
int reps;
|
||||
bool verbose;
|
||||
output_formats output_format;
|
||||
|
@ -179,7 +179,7 @@ static const cmd_params cmd_params_defaults = {
|
|||
/* main_gpu */ {0},
|
||||
/* no_kv_offload */ {false},
|
||||
/* mul_mat_q */ {true},
|
||||
/* tensor_split */ {{}},
|
||||
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
||||
/* reps */ 5,
|
||||
/* verbose */ false,
|
||||
/* output_format */ MARKDOWN
|
||||
|
@ -380,10 +380,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||
const std::regex regex{R"([;/]+)"};
|
||||
std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
||||
GGML_ASSERT(split_arg.size() <= llama_max_devices());
|
||||
|
||||
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
||||
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
|
||||
std::vector<float> tensor_split(llama_max_devices());
|
||||
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
||||
if (i < split_arg.size()) {
|
||||
tensor_split[i] = std::stof(split_arg[i]);
|
||||
} else {
|
||||
|
@ -459,7 +459,7 @@ struct cmd_params_instance {
|
|||
int main_gpu;
|
||||
bool no_kv_offload;
|
||||
bool mul_mat_q;
|
||||
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
||||
std::vector<float> tensor_split;
|
||||
|
||||
llama_model_params to_llama_mparams() const {
|
||||
llama_model_params mparams = llama_model_default_params();
|
||||
|
@ -563,6 +563,7 @@ struct test {
|
|||
static const bool cuda;
|
||||
static const bool opencl;
|
||||
static const bool vulkan;
|
||||
static const bool kompute;
|
||||
static const bool metal;
|
||||
static const bool gpu_blas;
|
||||
static const bool blas;
|
||||
|
@ -581,7 +582,7 @@ struct test {
|
|||
int main_gpu;
|
||||
bool no_kv_offload;
|
||||
bool mul_mat_q;
|
||||
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
||||
std::vector<float> tensor_split;
|
||||
int n_prompt;
|
||||
int n_gen;
|
||||
std::string test_time;
|
||||
|
@ -647,6 +648,9 @@ struct test {
|
|||
if (vulkan) {
|
||||
return "Vulkan";
|
||||
}
|
||||
if (kompute) {
|
||||
return "Kompute";
|
||||
}
|
||||
if (metal) {
|
||||
return "Metal";
|
||||
}
|
||||
|
@ -662,7 +666,7 @@ struct test {
|
|||
static const std::vector<std::string> & get_fields() {
|
||||
static const std::vector<std::string> fields = {
|
||||
"build_commit", "build_number",
|
||||
"cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas",
|
||||
"cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas",
|
||||
"cpu_info", "gpu_info",
|
||||
"model_filename", "model_type", "model_size", "model_n_params",
|
||||
"n_batch", "n_threads", "type_k", "type_v",
|
||||
|
@ -686,8 +690,9 @@ struct test {
|
|||
field == "avg_ns" || field == "stddev_ns") {
|
||||
return INT;
|
||||
}
|
||||
if (field == "cuda" || field == "opencl" || field == "vulkan"|| field == "metal" || field == "gpu_blas" || field == "blas" ||
|
||||
field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
|
||||
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||
field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" ||
|
||||
field == "mul_mat_q") {
|
||||
return BOOL;
|
||||
}
|
||||
if (field == "avg_ts" || field == "stddev_ts") {
|
||||
|
@ -699,7 +704,7 @@ struct test {
|
|||
std::vector<std::string> get_values() const {
|
||||
std::string tensor_split_str;
|
||||
int max_nonzero = 0;
|
||||
for (int i = 0; i < LLAMA_MAX_DEVICES; i++) {
|
||||
for (size_t i = 0; i < llama_max_devices(); i++) {
|
||||
if (tensor_split[i] > 0) {
|
||||
max_nonzero = i;
|
||||
}
|
||||
|
@ -714,7 +719,8 @@ struct test {
|
|||
}
|
||||
std::vector<std::string> values = {
|
||||
build_commit, std::to_string(build_number),
|
||||
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
||||
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
|
||||
std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
||||
cpu_info, gpu_info,
|
||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||
|
@ -743,6 +749,7 @@ const int test::build_number = LLAMA_BUILD_NUMBER;
|
|||
const bool test::cuda = !!ggml_cpu_has_cublas();
|
||||
const bool test::opencl = !!ggml_cpu_has_clblast();
|
||||
const bool test::vulkan = !!ggml_cpu_has_vulkan();
|
||||
const bool test::kompute = !!ggml_cpu_has_kompute();
|
||||
const bool test::metal = !!ggml_cpu_has_metal();
|
||||
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
|
||||
const bool test::blas = !!ggml_cpu_has_blas();
|
||||
|
|
|
@ -111,17 +111,71 @@ llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 m
|
|||
llama_print_timings: total time = 34570.79 ms
|
||||
```
|
||||
|
||||
## Orin compile and run
|
||||
### compile
|
||||
```sh
|
||||
make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
|
||||
```
|
||||
|
||||
### run on Orin
|
||||
### case 1
|
||||
**input**
|
||||
```sh
|
||||
./llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
--image /data/local/tmp/demo.jpeg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:" \
|
||||
--n-gpu-layers 999
|
||||
```
|
||||
**output**
|
||||
```sh
|
||||
|
||||
encode_image_with_clip: image encoded in 296.62 ms by CLIP ( 2.06 ms per image patch)
|
||||
|
||||
Susan Wise Bauer
|
||||
|
||||
llama_print_timings: load time = 1067.64 ms
|
||||
llama_print_timings: sample time = 1.53 ms / 6 runs ( 0.25 ms per token, 3934.43 tokens per second)
|
||||
llama_print_timings: prompt eval time = 306.84 ms / 246 tokens ( 1.25 ms per token, 801.72 tokens per second)
|
||||
llama_print_timings: eval time = 91.50 ms / 6 runs ( 15.25 ms per token, 65.58 tokens per second)
|
||||
llama_print_timings: total time = 1352.63 ms / 252 tokens
|
||||
```
|
||||
|
||||
### case 2
|
||||
**input**
|
||||
```sh
|
||||
./llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:" \
|
||||
--n-gpu-layers 999
|
||||
|
||||
```
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image encoded in 302.15 ms by CLIP ( 2.10 ms per image patch)
|
||||
|
||||
The image features a cat lying in the grass.
|
||||
|
||||
llama_print_timings: load time = 1057.07 ms
|
||||
llama_print_timings: sample time = 3.27 ms / 11 runs ( 0.30 ms per token, 3360.83 tokens per second)
|
||||
llama_print_timings: prompt eval time = 213.60 ms / 232 tokens ( 0.92 ms per token, 1086.14 tokens per second)
|
||||
llama_print_timings: eval time = 166.65 ms / 11 runs ( 15.15 ms per token, 66.01 tokens per second)
|
||||
llama_print_timings: total time = 1365.47 ms / 243 tokens
|
||||
```
|
||||
|
||||
## Minor shortcomings
|
||||
The `n_patch` of the output in `ldp` is 1/4 of the input. In order to implement it quickly, we uniformly modified the `clip_n_patches` function to return a quarter of the patches; as a result, the reported time is 4 times larger than the real cost when counting time consumption.
|
||||
|
||||
## TODO
|
||||
|
||||
- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
|
||||
- [x] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
|
||||
- [ ] Optimize LDP projector performance
|
||||
|
||||
- Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
|
||||
- Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
|
||||
- [ ] run MobileVLM on `Jetson Orin`
|
||||
- [x] run MobileVLM on `Jetson Orin`
|
||||
- [ ] Support more model variants, such as `MobileVLM-3B`.
|
||||
|
||||
|
||||
|
|
|
@ -39,6 +39,17 @@ static std::ostringstream * g_output_ss;
|
|||
static std::vector<llama_token> * g_output_tokens;
|
||||
static bool is_interacting = false;
|
||||
|
||||
static bool file_exists(const std::string &path) {
|
||||
std::ifstream f(path.c_str());
|
||||
return f.good();
|
||||
}
|
||||
|
||||
static bool file_is_empty(const std::string &path) {
|
||||
std::ifstream f;
|
||||
f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
||||
f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
|
||||
return f.tellg() == 0;
|
||||
}
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
|
@ -215,12 +226,12 @@ int main(int argc, char ** argv) {
|
|||
|
||||
if (!path_session.empty()) {
|
||||
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
||||
|
||||
// fopen to check for existing session
|
||||
FILE * fp = std::fopen(path_session.c_str(), "rb");
|
||||
if (fp != NULL) {
|
||||
std::fclose(fp);
|
||||
|
||||
if (!file_exists(path_session)) {
|
||||
LOG_TEE("%s: session file does not exist, will create.\n", __func__);
|
||||
} else if (file_is_empty(path_session)) {
|
||||
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
|
||||
} else {
|
||||
// The file exists and is not empty
|
||||
session_tokens.resize(n_ctx);
|
||||
size_t n_token_count_out = 0;
|
||||
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
||||
|
@ -229,10 +240,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
session_tokens.resize(n_token_count_out);
|
||||
llama_set_rng_seed(ctx, params.seed);
|
||||
|
||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
|
||||
} else {
|
||||
LOG_TEE("%s: session file does not exist, will create\n", __func__);
|
||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -378,6 +378,8 @@ int main(int argc, char ** argv) {
|
|||
printf("testing %s ...\n", ggml_type_name(type));
|
||||
}
|
||||
|
||||
ggml_quantize_init(type);
|
||||
|
||||
error_stats global_stats {};
|
||||
|
||||
for (const auto& kv_tensor : tensors) {
|
||||
|
|
|
@ -25,6 +25,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
|
||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
|
||||
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
{ "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
|
||||
|
@ -36,7 +37,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
|
||||
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
|
|
|
@ -32,6 +32,7 @@ Command line options:
|
|||
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
|
||||
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
|
||||
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
|
||||
|
||||
## Build
|
||||
|
||||
server is build alongside everything else from the root of the project
|
||||
|
@ -52,21 +53,23 @@ server is build alongside everything else from the root of the project
|
|||
|
||||
To get started right away, run the following command, making sure to use the correct path for the model you have:
|
||||
|
||||
### Unix-based systems (Linux, macOS, etc.):
|
||||
### Unix-based systems (Linux, macOS, etc.)
|
||||
|
||||
```bash
|
||||
./server -m models/7B/ggml-model.gguf -c 2048
|
||||
```
|
||||
|
||||
### Windows:
|
||||
### Windows
|
||||
|
||||
```powershell
|
||||
server.exe -m models\7B\ggml-model.gguf -c 2048
|
||||
```
|
||||
|
||||
The above command will start a server that by default listens on `127.0.0.1:8080`.
|
||||
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
|
||||
|
||||
### Docker:
|
||||
### Docker
|
||||
|
||||
```bash
|
||||
docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
|
||||
|
||||
|
@ -120,6 +123,7 @@ node index.js
|
|||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
- **GET** `/health`: Returns the current state of the server:
|
||||
- `{"status": "loading model"}` if the model is still being loaded.
|
||||
- `{"status": "error"}` if the model failed to load.
|
||||
|
@ -189,14 +193,13 @@ node index.js
|
|||
|
||||
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
||||
|
||||
### Result JSON:
|
||||
|
||||
* Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
|
||||
### Result JSON
|
||||
|
||||
- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
|
||||
|
||||
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
|
||||
|
||||
```
|
||||
```json
|
||||
{
|
||||
"content": "<the token selected by the model>",
|
||||
"probs": [
|
||||
|
@ -212,6 +215,7 @@ node index.js
|
|||
]
|
||||
},
|
||||
```
|
||||
|
||||
Notice that each `probs` is an array of length `n_probs`.
|
||||
|
||||
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
|
||||
|
@ -290,6 +294,7 @@ Notice that each `probs` is an array of length `n_probs`.
|
|||
|
||||
print(completion.choices[0].message)
|
||||
```
|
||||
|
||||
... or raw HTTP requests:
|
||||
|
||||
```shell
|
||||
|
@ -311,6 +316,40 @@ Notice that each `probs` is an array of length `n_probs`.
|
|||
}'
|
||||
```
|
||||
|
||||
- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API.
|
||||
|
||||
*Options:*
|
||||
|
||||
See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
|
||||
|
||||
*Examples:*
|
||||
|
||||
- input as string
|
||||
|
||||
```shell
|
||||
curl http://localhost:8080/v1/embeddings \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer no-key" \
|
||||
-d '{
|
||||
"input": "hello",
|
||||
"model":"GPT-4",
|
||||
"encoding_format": "float"
|
||||
}'
|
||||
```
|
||||
|
||||
- `input` as string array
|
||||
|
||||
```shell
|
||||
curl http://localhost:8080/v1/embeddings \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer no-key" \
|
||||
-d '{
|
||||
"input": ["hello", "world"],
|
||||
"model":"GPT-4",
|
||||
"encoding_format": "float"
|
||||
}'
|
||||
```
|
||||
|
||||
## More examples
|
||||
|
||||
### Change system prompt on runtime
|
||||
|
@ -362,6 +401,7 @@ python api_like_OAI.py
|
|||
```
|
||||
|
||||
After running the API server, you can use it in Python by setting the API base URL.
|
||||
|
||||
```python
|
||||
openai.api_base = "http://<Your api-server IP>:port"
|
||||
```
|
||||
|
|
|
@ -48,6 +48,7 @@ chat_completion() {
|
|||
top_p: 0.9,
|
||||
n_keep: $n_keep,
|
||||
n_predict: 256,
|
||||
cache_prompt: true,
|
||||
stop: ["\n### Human:"],
|
||||
stream: true
|
||||
}')"
|
||||
|
|
|
@ -206,3 +206,18 @@ inline static std::vector<json> format_partial_response_oaicompat(const task_res
|
|||
|
||||
return std::vector<json>({ret});
|
||||
}
|
||||
|
||||
inline static json format_embeddings_response_oaicompat(const json &request, const json &embeddings)
|
||||
{
|
||||
json res =
|
||||
json{
|
||||
{"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
|
||||
{"object", "list"},
|
||||
{"usage",
|
||||
json{{"prompt_tokens", 0},
|
||||
{"total_tokens", 0}}},
|
||||
{"data", embeddings}
|
||||
};
|
||||
return res;
|
||||
}
|
||||
|
||||
|
|
|
@ -185,7 +185,7 @@ struct llama_client_slot
|
|||
llama_sampling_context *ctx_sampling = nullptr;
|
||||
|
||||
int32_t ga_i = 0; // group-attention state
|
||||
int32_t ga_n = 1;// group-attention factor
|
||||
int32_t ga_n = 1; // group-attention factor
|
||||
int32_t ga_w = 512; // group-attention width
|
||||
|
||||
int32_t n_past_se = 0; // self-extend
|
||||
|
@ -220,6 +220,7 @@ struct llama_client_slot
|
|||
infill = false;
|
||||
ga_i = 0;
|
||||
n_past_se = 0;
|
||||
|
||||
generated_token_probs.clear();
|
||||
|
||||
for (slot_image & img : images)
|
||||
|
@ -1227,7 +1228,7 @@ struct llama_server_context
|
|||
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
|
||||
for (int i = 0; i < (int) append_tokens.size(); ++i)
|
||||
{
|
||||
llama_batch_add(batch, append_tokens[i], slot.n_past, { slot.id }, true);
|
||||
llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
|
||||
slot.n_past += 1;
|
||||
}
|
||||
}
|
||||
|
@ -1295,6 +1296,8 @@ struct llama_server_context
|
|||
for (llama_client_slot &slot : slots)
|
||||
{
|
||||
slot.cache_tokens.clear();
|
||||
slot.n_past = 0;
|
||||
slot.n_past_se = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1364,26 +1367,26 @@ struct llama_server_context
|
|||
kv_cache_clear();
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
}
|
||||
|
||||
task_server task;
|
||||
task.type = TASK_TYPE_NEXT_RESPONSE;
|
||||
task.target_id = -1;
|
||||
queue_tasks.post(task);
|
||||
}
|
||||
|
||||
for (llama_client_slot &slot : slots)
|
||||
{
|
||||
if (slot.ga_n == 1)
|
||||
{
|
||||
if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx)
|
||||
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
|
||||
{
|
||||
// Shift context
|
||||
const int n_left = slot.n_past - slot.params.n_keep - 1;
|
||||
const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
|
||||
const int n_discard = n_left / 2;
|
||||
|
||||
LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
|
||||
llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
||||
|
||||
for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
|
||||
{
|
||||
|
@ -1429,8 +1432,10 @@ struct llama_server_context
|
|||
slot.i_batch = batch.n_tokens;
|
||||
|
||||
const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
||||
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
|
||||
|
||||
// TODO: we always have to take into account the "system_tokens"
|
||||
// this is not great and needs to be improved somehow
|
||||
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
|
||||
slot.n_past += 1;
|
||||
}
|
||||
|
||||
|
@ -1591,10 +1596,13 @@ struct llama_server_context
|
|||
|
||||
// process the prefix of first image
|
||||
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
|
||||
|
||||
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
||||
int ga_i = slot.ga_i;
|
||||
|
||||
int32_t ga_i = slot.ga_i;
|
||||
int32_t ga_n = slot.ga_n;
|
||||
int32_t ga_w = slot.ga_w;
|
||||
|
||||
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
|
||||
{
|
||||
if (slot.ga_n != 1)
|
||||
|
@ -1606,7 +1614,7 @@ struct llama_server_context
|
|||
}
|
||||
}
|
||||
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
|
||||
slot_npast += 1;
|
||||
slot_npast++;
|
||||
}
|
||||
|
||||
if (has_images && !ingest_images(slot, n_batch))
|
||||
|
@ -1666,6 +1674,7 @@ struct llama_server_context
|
|||
slot.n_past_se += n_tokens;
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch batch_view =
|
||||
{
|
||||
n_tokens,
|
||||
|
@ -1780,16 +1789,16 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
if (llama_mlock_supported())
|
||||
if (llama_supports_mlock())
|
||||
{
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
if (llama_mmap_supported())
|
||||
if (llama_supports_mmap())
|
||||
{
|
||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
if (llama_supports_gpu_offload()) {
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
|
||||
|
@ -1801,7 +1810,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row)\n");
|
||||
#endif
|
||||
}
|
||||
printf(" -m FNAME, --model FNAME\n");
|
||||
printf(" model path (default: %s)\n", params.model.c_str());
|
||||
printf(" -a ALIAS, --alias ALIAS\n");
|
||||
|
@ -1818,15 +1827,15 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
printf(" -spf FNAME, --system-prompt-file FNAME\n");
|
||||
printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
||||
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||
printf(" --log-disable disables logging to a file.\n");
|
||||
printf("\n");
|
||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
|
||||
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
|
||||
printf(" -gan N, --grp-attn-n N Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
||||
printf(" -gaw N, --grp-attn-w N Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
||||
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
|
||||
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
@ -2057,13 +2066,13 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
if (llama_supports_gpu_offload()) {
|
||||
params.n_gpu_layers = std::stoi(argv[i]);
|
||||
#else
|
||||
} else {
|
||||
LOG_WARNING("Not compiled with GPU offload support, --n-gpu-layers option will be ignored. "
|
||||
"See main README.md for information on enabling GPU BLAS support",
|
||||
{{"n_gpu_layers", params.n_gpu_layers}});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else if (arg == "--split-mode" || arg == "-sm")
|
||||
{
|
||||
|
@ -2106,9 +2115,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
const std::regex regex{R"([,/]+)"};
|
||||
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
|
||||
std::vector<std::string> split_arg{it, {}};
|
||||
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
|
||||
GGML_ASSERT(split_arg.size() <= llama_max_devices());
|
||||
|
||||
for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device)
|
||||
for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device)
|
||||
{
|
||||
if (i_device < split_arg.size())
|
||||
{
|
||||
|
@ -2929,6 +2938,66 @@ int main(int argc, char **argv)
|
|||
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
|
||||
});
|
||||
|
||||
svr.Post("/v1/embeddings", [&llama](const httplib::Request &req, httplib::Response &res)
|
||||
{
|
||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||
const json body = json::parse(req.body);
|
||||
|
||||
json prompt;
|
||||
if (body.count("input") != 0)
|
||||
{
|
||||
prompt = body["input"];
|
||||
// batch
|
||||
if(prompt.is_array()) {
|
||||
json data = json::array();
|
||||
int i = 0;
|
||||
for (const json &elem : prompt) {
|
||||
const int task_id = llama.queue_tasks.get_new_id();
|
||||
llama.queue_results.add_waiting_task_id(task_id);
|
||||
llama.request_completion(task_id, { {"prompt", elem}, { "n_predict", 0} }, false, true, -1);
|
||||
|
||||
// get the result
|
||||
task_result result = llama.queue_results.recv(task_id);
|
||||
llama.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
json embedding = json{
|
||||
{"embedding", json_value(result.result_json, "embedding", json::array())},
|
||||
{"index", i++},
|
||||
{"object", "embedding"}
|
||||
};
|
||||
data.push_back(embedding);
|
||||
}
|
||||
json result = format_embeddings_response_oaicompat(body, data);
|
||||
return res.set_content(result.dump(), "application/json; charset=utf-8");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
prompt = "";
|
||||
}
|
||||
|
||||
// create and queue the task
|
||||
const int task_id = llama.queue_tasks.get_new_id();
|
||||
llama.queue_results.add_waiting_task_id(task_id);
|
||||
llama.request_completion(task_id, { {"prompt", prompt}, { "n_predict", 0}}, false, true, -1);
|
||||
|
||||
// get the result
|
||||
task_result result = llama.queue_results.recv(task_id);
|
||||
llama.queue_results.remove_waiting_task_id(task_id);
|
||||
|
||||
json data = json::array({json{
|
||||
{"embedding", json_value(result.result_json, "embedding", json::array())},
|
||||
{"index", 0},
|
||||
{"object", "embedding"}
|
||||
}}
|
||||
);
|
||||
|
||||
json root = format_embeddings_response_oaicompat(body, data);
|
||||
|
||||
// send the result
|
||||
return res.set_content(root.dump(), "application/json; charset=utf-8");
|
||||
});
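For reference, a small sketch of the per-input objects that the batch branch above pushes into `data`; it assumes nlohmann::json (aliased as `json` in the server code) and uses placeholder embedding values, while the final wrapping is left to format_embeddings_response_oaicompat and is not reproduced here.

// Illustrative sketch (not part of the diff); assumes nlohmann::json.
#include <nlohmann/json.hpp>
#include <iostream>
using json = nlohmann::json;

int main() {
    json data = json::array();
    int i = 0;
    for (const auto & vec : { json::array({0.01, -0.02}), json::array({0.10, 0.20}) }) {
        data.push_back(json{
            {"embedding", vec},          // one vector per input element
            {"index", i++},
            {"object", "embedding"}
        });
    }
    std::cout << data.dump(2) << std::endl; // the helper wraps this array into the final response
    return 0;
}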
|
||||
|
||||
// GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
|
||||
// "Bus error: 10" - this is on macOS, it does not crash on Linux
|
||||
//std::thread t2([&]()
|
||||
|
@@ -1,7 +1,9 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//


#include "ggml-sycl.h"
23 examples/sycl/win-build-sycl.bat Normal file
@@ -0,0 +1,23 @@

:: MIT license
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT

mkdir -p build
cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

:: for FP16
:: faster for long-prompt inference
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

:: for FP32
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release


:: build example/main only
:: make main

:: build all binary
make -j
cd ..
13 examples/sycl/win-run-llama2.bat Normal file
@@ -0,0 +1,13 @@
:: MIT license
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT

INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force


set GGML_SYCL_DEVICE=0
rem set GGML_SYCL_DEBUG=1
.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0

@@ -791,7 +791,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free(*buffers[i]);
}
free(buffers);
free(*buffers);
return false;
}
@@ -373,6 +373,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
extern GGML_CALL int ggml_backend_vk_reg_devices(void);
ggml_backend_vk_reg_devices();
#endif

#ifdef GGML_USE_KOMPUTE
extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
ggml_backend_kompute_reg_devices();
#endif
}

GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
560 ggml-cuda.cu
@@ -197,6 +197,10 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
#endif // __has_builtin(__builtin_elementwise_sub_sat)
}

static __device__ __forceinline__ int __vsub4(const int a, const int b) {
return __vsubss4(a, b);
}

static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
c = __builtin_amdgcn_sdot4(a, b, c, false);
@@ -511,6 +515,14 @@ typedef struct {
} block_iq2_xs;
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");

#define QR3_XXS 8
#define QI3_XXS (QK_K / (4*QR3_XXS))
typedef struct {
half d;
uint8_t qs[3*(QK_K/8)];
} block_iq3_xxs;
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");

#define WARP_SIZE 32
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
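A quick arithmetic check on the new block type: assuming QK_K == 256 (the only branch implemented by the kernels further down), one block_iq3_xxs packs 256 weights into a half-precision scale plus 3*(QK_K/8) bytes of data.

// Illustrative check (not part of the diff); QK_K == 256 is assumed.
#include <cstdio>

int main() {
    const int QK_K  = 256;
    const int bytes = 2 /* half d */ + 3*(QK_K/8); // matches the static_assert above
    printf("block_iq3_xxs: %d bytes -> %.4f bits per weight\n", bytes, 8.0*bytes/QK_K);
    return 0;
}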
@@ -518,6 +530,8 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
#define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_TANH_BLOCK_SIZE 256
#define CUDA_RELU_BLOCK_SIZE 256
#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
#define CUDA_HARDSWISH_BLOCK_SIZE 256
#define CUDA_SQR_BLOCK_SIZE 256
#define CUDA_CPY_BLOCK_SIZE 32
#define CUDA_SCALE_BLOCK_SIZE 256

@@ -534,6 +548,7 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
#define CUDA_PAD_BLOCK_SIZE 256
#define CUDA_ACC_BLOCK_SIZE 256
#define CUDA_IM2COL_BLOCK_SIZE 256
#define CUDA_POOL2D_BLOCK_SIZE 256

#define CUDA_Q8_0_NE_ALIGN 2048
@@ -837,6 +852,24 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
dst[i] = fmaxf(x[i], 0);
}

static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;

if (i >= k) {
return;
}
dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
}

static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;

if (i >= k) {
return;
}
dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
}

static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
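As a sanity check on the two kernels added above, here is a plain C++ reference that evaluates the same formulas on the CPU (made-up sample inputs).

// CPU reference for the activations above (not part of the diff).
#include <algorithm>
#include <cstdio>

static float hardsigmoid(float x) { return std::min(1.0f, std::max(0.0f, (x + 3.0f) / 6.0f)); }
static float hardswish  (float x) { return x * hardsigmoid(x); }

int main() {
    for (float x : {-4.0f, -1.0f, 0.0f, 1.0f, 4.0f}) {
        printf("x=%5.1f  hardsigmoid=%.3f  hardswish=%.3f\n", x, hardsigmoid(x), hardswish(x));
    }
    return 0;
}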
@ -1640,6 +1673,41 @@ static const __device__ uint64_t iq2xs_grid[512] = {
|
|||
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
static const __device__ uint32_t iq3xxs_grid[256] = {
|
||||
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
||||
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
||||
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
||||
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
|
||||
0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
|
||||
0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
||||
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
|
||||
0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
|
||||
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
||||
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
|
||||
0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
|
||||
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
||||
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
|
||||
0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
|
||||
0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
||||
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
|
||||
0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
|
||||
0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
||||
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
|
||||
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
|
||||
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
||||
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
|
||||
0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
|
||||
0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
||||
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
|
||||
0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
|
||||
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
||||
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
|
||||
0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
|
||||
0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
||||
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
|
||||
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
||||
};
|
||||
|
||||
static const __device__ uint8_t ksigns_iq2xs[128] = {
|
||||
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
|
||||
144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
|
||||
|
@ -1651,6 +1719,43 @@ static const __device__ uint8_t ksigns_iq2xs[128] = {
|
|||
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
|
||||
};
|
||||
|
||||
//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
static const __device__ uint64_t ksigns64[128] = {
|
||||
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
|
||||
0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
|
||||
0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
|
||||
0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
|
||||
0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
|
||||
0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
|
||||
0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
|
||||
0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
|
||||
0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
|
||||
0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
|
||||
0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
|
||||
0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
|
||||
0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
|
||||
0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
|
||||
0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
|
||||
0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
|
||||
0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
|
||||
0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
|
||||
0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
|
||||
0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
|
||||
0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
|
||||
0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
|
||||
0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
|
||||
0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
|
||||
0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
|
||||
0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
|
||||
0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
|
||||
0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
|
||||
0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
|
||||
0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
|
||||
0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
|
||||
0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
|
||||
};
|
||||
//#endif
|
||||
|
||||
static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
|
||||
|
||||
inline bool ggml_cuda_supports_mmq(enum ggml_type type) {
|
||||
|
@ -1717,6 +1822,34 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
|||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
const int i = blockIdx.x;
|
||||
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const uint8_t * q3 = x[i].qs + 8*ib;
|
||||
const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]);
|
||||
const uint32_t aux32 = gas[0] | (gas[1] << 16);
|
||||
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f;
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
#else
|
||||
assert(false);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||
|
||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||
|
@ -4340,6 +4473,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
|||
|
||||
static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
#if QK_K == 256
|
||||
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;
|
||||
|
||||
|
@ -4350,20 +4484,22 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
|||
const uint8_t ls2 = bq2->scales[ib32] >> 4;
|
||||
int sumi1 = 0;
|
||||
for (int l = 0; l < 2; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
|
||||
const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
|
||||
const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
|
||||
sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1);
|
||||
sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1);
|
||||
q8 += 8;
|
||||
}
|
||||
int sumi2 = 0;
|
||||
for (int l = 2; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
||||
}
|
||||
const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511));
|
||||
const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
|
||||
const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]);
|
||||
const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]);
|
||||
sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2);
|
||||
sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2);
|
||||
q8 += 8;
|
||||
}
|
||||
const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
|
||||
|
@ -4372,6 +4508,45 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
|||
assert(false);
|
||||
return 0.f;
|
||||
#endif
|
||||
#else
|
||||
assert(false);
|
||||
return 0.f;
|
||||
#endif
|
||||
}
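The rewritten dot product above relies on a byte-wise sign trick: with a mask byte of 0x00 or 0xff, (g ^ s) - s either keeps or negates the byte, which is what __vsub4 applies to four packed bytes before __dp4a accumulates the products. A small host-side C++ illustration with made-up grid and q8 values:

// Illustration of the sign-flip trick (not part of the diff); example values only.
#include <cstdint>
#include <cstdio>

int main() {
    const uint8_t grid[4] = { 3, 8, 19, 25 };            // unsigned grid magnitudes
    const uint8_t sign[4] = { 0x00, 0xff, 0x00, 0xff };  // 0xff selects "negate this byte"
    const int8_t  q8[4]   = { 5, -2, 7, 1 };             // quantized activations

    int sum = 0;
    for (int j = 0; j < 4; ++j) {
        // per byte: (g ^ s) - s  ==  g when s == 0x00, -g when s == 0xff
        const int8_t signed_grid = (int8_t)((uint8_t)(grid[j] ^ sign[j]) - sign[j]);
        sum += signed_grid * q8[j];                      // what __dp4a accumulates per 4 bytes
    }
    printf("dot = %d\n", sum); // 3*5 + (-8)*(-2) + 19*7 + (-25)*1 = 139
    return 0;
}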
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
#if QK_K == 256
|
||||
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;
|
||||
|
||||
const int ib32 = iqs;
|
||||
const uint8_t * q3 = bq2->qs + 8*ib32;
|
||||
const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32;
|
||||
const int8_t * q8 = bq8_1[ib32].qs;
|
||||
uint32_t aux32 = gas[0] | (gas[1] << 16);
|
||||
int sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0];
|
||||
const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1];
|
||||
const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127));
|
||||
const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]);
|
||||
const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]);
|
||||
sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
|
||||
sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
|
||||
q8 += 8;
|
||||
aux32 >>= 7;
|
||||
}
|
||||
const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f;
|
||||
return d * sumi;
|
||||
#else
|
||||
assert(false);
|
||||
return 0.f;
|
||||
#endif
|
||||
#else
|
||||
assert(false);
|
||||
return 0.f;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
|
||||
|
@ -5384,27 +5559,37 @@ static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
|
|||
*dsti = *xi;
|
||||
}
|
||||
|
||||
static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
|
||||
const half * xi = (const half *) cxi;
|
||||
float * dsti = (float *) cdsti;
|
||||
|
||||
*dsti = *xi;
|
||||
}
|
||||
|
||||
template <cpy_kernel_t cpy_1>
|
||||
static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
// determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
||||
// determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
|
||||
// then combine those indices with the corresponding byte offsets to get the total offsets
|
||||
const int i02 = i / (ne00*ne01);
|
||||
const int i01 = (i - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
|
||||
const int i03 = i/(ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int i12 = i / (ne10*ne11);
|
||||
const int i11 = (i - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
|
||||
const int i13 = i/(ne10 * ne11 * ne12);
|
||||
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13;
|
||||
|
||||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
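The copy kernels above now decompose the flattened index into four coordinates and rebuild byte offsets from per-dimension strides. A standalone C++ check of that arithmetic with toy sizes, assuming a contiguous f32 layout:

// Standalone check of the index decomposition (not part of the diff); toy sizes.
#include <cstdio>

int main() {
    const int ne00 = 4, ne01 = 3, ne02 = 2, ne03 = 2;    // element counts per dimension
    const int nb00 = 4, nb01 = 16, nb02 = 48, nb03 = 96; // byte strides (contiguous f32)

    for (int i = 0; i < ne00*ne01*ne02*ne03; ++i) {
        const int i03 = i / (ne00*ne01*ne02);
        const int i02 = (i - i03*ne00*ne01*ne02) / (ne00*ne01);
        const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
        const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
        const int offset = i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03;
        if (offset != i * 4) { printf("mismatch at i=%d\n", i); return 1; }
    }
    printf("all offsets match the contiguous layout\n");
    return 0;
}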
|
||||
|
@ -5498,23 +5683,26 @@ static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
|
|||
|
||||
template <cpy_kernel_t cpy_blck, int qk>
|
||||
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
||||
const int nb12, const int nb13) {
|
||||
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
|
||||
|
||||
if (i >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int i02 = i / (ne00*ne01);
|
||||
const int i01 = (i - i02*ne01*ne00) / ne00;
|
||||
const int i00 = (i - i02*ne01*ne00 - i01*ne00);
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
|
||||
const int i03 = i/(ne00 * ne01 * ne02);
|
||||
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
||||
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
||||
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
||||
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
||||
|
||||
const int i12 = i / (ne10*ne11);
|
||||
const int i11 = (i - i12*ne10*ne11) / ne10;
|
||||
const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
|
||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
|
||||
const int i13 = i/(ne10 * ne11 * ne12);
|
||||
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
||||
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
||||
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
||||
const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
||||
|
||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
@@ -5683,7 +5871,7 @@ static __global__ void alibi_f32(const float * x, float * dst, const int ncols,
}

static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
const int row = blockIdx.y;
const int row = blockIdx.x;
const int col = threadIdx.x;

float sum = 0.0f;
@ -5756,7 +5944,7 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
|
|||
}
|
||||
|
||||
template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
|
||||
static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
|
||||
static __global__ void soft_max_f16(const float * x, const half * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
||||
const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
|
||||
const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
|
||||
|
@ -5791,12 +5979,12 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
|
|||
if (need_check && col_data + 0 >= ncols_data) {
|
||||
val.x = -INFINITY;
|
||||
} else {
|
||||
val.x = x[ix + 0]*scale + (y ? y[iy + 0] : 0.0f);
|
||||
val.x = x[ix + 0]*scale + (y ? __half2float(y[iy + 0]) : 0.0f);
|
||||
}
|
||||
if (need_check && col_data + WARP_SIZE >= ncols_data) {
|
||||
val.y = -INFINITY;
|
||||
} else {
|
||||
val.y = x[ix + WARP_SIZE]*scale + (y ? y[iy + WARP_SIZE] : 0.0f);
|
||||
val.y = x[ix + WARP_SIZE]*scale + (y ? __half2float(y[iy + WARP_SIZE]) : 0.0f);
|
||||
}
|
||||
if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) {
|
||||
vals[col_smem] = val;
|
||||
|
@ -5886,7 +6074,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
|
|||
}
|
||||
|
||||
template <bool vals_smem, int ncols_template, int block_size_template>
|
||||
static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
|
||||
static __global__ void soft_max_f32(const float * x, const half * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
|
||||
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
|
@ -5916,7 +6104,7 @@ static __global__ void soft_max_f32(const float * x, const float * y, float * ds
|
|||
const int ix = rowx*ncols + col;
|
||||
const int iy = rowy*ncols + col;
|
||||
|
||||
const float val = x[ix]*scale + (y ? y[iy] : 0.0f);
|
||||
const float val = x[ix]*scale + (y ? __half2float(y[iy]) : 0.0f);
|
||||
vals[col] = val;
|
||||
max_val = max(max_val, val);
|
||||
}
|
||||
|
@ -6005,9 +6193,10 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
|
|||
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
|
||||
}
|
||||
|
||||
static __global__ void im2col_f32_f16(
|
||||
const float * x, half * dst,
|
||||
int offset_delta, int IW, int IH, int OW, int KW, int KH, int pelements, int CHW,
|
||||
template <typename T>
|
||||
static __global__ void im2col_kernel(
|
||||
const float * x, T * dst, int batch_offset,
|
||||
int offset_delta, int IC, int IW, int IH, int OH, int OW, int KW, int KH, int pelements, int CHW,
|
||||
int s0, int s1, int p0, int p1, int d0, int d1) {
|
||||
const int i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (i >= pelements) {
|
||||
|
@ -6020,21 +6209,73 @@ static __global__ void im2col_f32_f16(
|
|||
const int ky = (i - kd) / OW;
|
||||
const int ix = i % OW;
|
||||
|
||||
const int oh = blockIdx.y;
|
||||
const int batch = blockIdx.z / IC;
|
||||
const int ic = blockIdx.z % IC;
|
||||
|
||||
const int64_t iiw = ix * s0 + kx * d0 - p0;
|
||||
const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
|
||||
const int64_t iih = oh * s1 + ky * d1 - p1;
|
||||
|
||||
const int64_t offset_dst =
|
||||
(blockIdx.y * OW + ix) * CHW +
|
||||
(blockIdx.z * (KW * KH) + ky * KW + kx);
|
||||
((batch * OH + oh) * OW + ix) * CHW +
|
||||
(ic * (KW * KH) + ky * KW + kx);
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = __float2half(0.0f);
|
||||
dst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
const int64_t offset_src = blockIdx.z * offset_delta;
|
||||
dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
|
||||
const int64_t offset_src = ic * offset_delta + batch * batch_offset;
|
||||
dst[offset_dst] = x[offset_src + iih * IW + iiw];
|
||||
}
|
||||
}
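A minimal CPU reference for the im2col indexing above, restricted to a single batch and a single channel with toy sizes; padded positions are written as zero, matching the kernel's boundary check.

// CPU reference for im2col indexing (not part of the diff); toy sizes, one channel.
#include <cstdio>
#include <vector>

int main() {
    const int IW = 4, IH = 4, KW = 2, KH = 2;
    const int s0 = 1, s1 = 1, p0 = 0, p1 = 0, d0 = 1, d1 = 1;
    const int OW = (IW + 2*p0 - d0*(KW - 1) - 1) / s0 + 1;
    const int OH = (IH + 2*p1 - d1*(KH - 1) - 1) / s1 + 1;

    std::vector<float> x(IW*IH);
    for (int i = 0; i < IW*IH; ++i) x[i] = (float)i;

    std::vector<float> dst(OH*OW*KW*KH, 0.0f);
    for (int oh = 0; oh < OH; ++oh)
    for (int ow = 0; ow < OW; ++ow)
    for (int ky = 0; ky < KH; ++ky)
    for (int kx = 0; kx < KW; ++kx) {
        const int iiw = ow*s0 + kx*d0 - p0;
        const int iih = oh*s1 + ky*d1 - p1;
        const int out = (oh*OW + ow)*(KW*KH) + ky*KW + kx;   // CHW == KW*KH for one channel
        dst[out] = (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) ? 0.0f : x[iih*IW + iiw];
    }
    printf("first patch: %.0f %.0f %.0f %.0f\n", dst[0], dst[1], dst[2], dst[3]); // 0 1 4 5
    return 0;
}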
|
||||
|
||||
template <typename Ti, typename To>
|
||||
static __global__ void pool2d_nchw_kernel(
|
||||
const int ih, const int iw, const int oh, const int ow,
|
||||
const int kh, const int kw, const int sh, const int sw,
|
||||
const int ph, const int pw, const int parallel_elements,
|
||||
const Ti* src, To* dst, const enum ggml_op_pool op) {
|
||||
int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (idx >= parallel_elements) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int I_HW = ih * iw;
|
||||
const int O_HW = oh * ow;
|
||||
const int nc = idx / O_HW;
|
||||
const int cur_oh = idx % O_HW / ow;
|
||||
const int cur_ow = idx % O_HW % ow;
|
||||
const Ti* i_ptr = src + nc * I_HW;
|
||||
To* o_ptr = dst + nc * O_HW;
|
||||
const int start_h = cur_oh * sh - ph;
|
||||
const int bh = max(0, start_h);
|
||||
const int eh = min(ih, start_h + kh);
|
||||
const int start_w = cur_ow * sw - pw;
|
||||
const int bw = max(0, start_w);
|
||||
const int ew = min(iw, start_w + kw);
|
||||
const To scale = 1. / (kh * kw);
|
||||
To res = 0;
|
||||
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: res = 0; break;
|
||||
case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
|
||||
}
|
||||
|
||||
for (int i = bh; i < eh; i += 1) {
|
||||
for (int j = bw; j < ew; j += 1) {
|
||||
#if __CUDA_ARCH__ >= 350
|
||||
Ti cur = __ldg(i_ptr + i * iw + j);
|
||||
#else
|
||||
Ti cur = i_ptr[i * iw + j];
|
||||
#endif
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: res += cur * scale; break;
|
||||
case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
o_ptr[cur_oh * ow + cur_ow] = res;
|
||||
}
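A CPU sketch of the pooling loop above for one channel with toy sizes; it follows the kernel's average-pooling branch, including the fixed 1/(kh*kw) scale.

// CPU sketch of NCHW average pooling (not part of the diff); toy sizes, one channel.
#include <algorithm>
#include <cstdio>

int main() {
    const int ih = 4, iw = 4, kh = 2, kw = 2, sh = 2, sw = 2, ph = 0, pw = 0;
    const int oh = (ih + 2*ph - kh) / sh + 1;
    const int ow = (iw + 2*pw - kw) / sw + 1;

    float src[16];
    for (int i = 0; i < 16; ++i) src[i] = (float)i;

    for (int co = 0; co < oh; ++co) {
        for (int cw = 0; cw < ow; ++cw) {
            const int bh = std::max(0, co*sh - ph), eh = std::min(ih, co*sh - ph + kh);
            const int bw = std::max(0, cw*sw - pw), ew = std::min(iw, cw*sw - pw + kw);
            float res = 0.0f;
            for (int i = bh; i < eh; ++i)
                for (int j = bw; j < ew; ++j)
                    res += src[i*iw + j] / (kh*kw);   // GGML_OP_POOL_AVG branch of the kernel
            printf("%6.2f", res);
        }
        printf("\n");
    }
    return 0;
}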
|
||||
|
||||
#define CUDA_FLASH_ATTENTION_BLOCK_SIZE 256
|
||||
|
||||
template<int block_size, int k_seq_len, int k_head_dim>
|
||||
|
@ -6771,6 +7012,16 @@ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
|
|||
relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
|
||||
hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
|
||||
hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
|
||||
leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
|
||||
|
@ -6931,6 +7182,12 @@ static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k,
|
|||
dequantize_block_iq2_xs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||
|
@ -6968,6 +7225,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
|
|||
return dequantize_row_iq2_xxs_cuda;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
return dequantize_row_iq2_xs_cuda;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return dequantize_row_iq3_xxs_cuda;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_cuda<float>;
|
||||
default:
|
||||
|
@ -7001,6 +7260,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
|
|||
return dequantize_row_iq2_xxs_cuda;
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
return dequantize_row_iq2_xs_cuda;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return dequantize_row_iq3_xxs_cuda;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_cuda<half>;
|
||||
default:
|
||||
|
@ -7213,6 +7474,15 @@ static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float
|
|||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void mul_mat_vec_iq3_xxs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % QK_K == 0);
|
||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||
const dim3 block_nums(block_num_y, 1, 1);
|
||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||
mul_mat_vec_q<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
|
||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||
}
|
||||
|
||||
static void ggml_mul_mat_q4_0_q8_1_cuda(
|
||||
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
||||
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
||||
|
@ -7685,69 +7955,82 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
|
|||
(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
|
||||
}
|
||||
|
||||
|
||||
static void ggml_cpy_f16_f32_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f32_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_f16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q8_0_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK8_0 == 0);
|
||||
const int num_blocks = ne / QK8_0;
|
||||
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_0_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_0 == 0);
|
||||
const int num_blocks = ne / QK4_0;
|
||||
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q4_1_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(ne % QK4_1 == 0);
|
||||
const int num_blocks = ne / QK4_1;
|
||||
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
static void ggml_cpy_f16_f16_cuda(
|
||||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
|
||||
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
|
||||
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
|
||||
|
@ -7826,7 +8109,7 @@ static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const
|
|||
|
||||
static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
const dim3 block_nums(1, nrows, 1);
|
||||
const dim3 block_nums(nrows, 1, 1);
|
||||
k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
|
||||
}
|
||||
|
||||
|
@ -7852,7 +8135,7 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
|
|||
diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
|
||||
}
|
||||
|
||||
static void soft_max_f16_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
||||
static void soft_max_f16_cuda(const float * x, const half * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
||||
int nth = WARP_SIZE;
|
||||
while (nth < ncols_x/2 && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
||||
const dim3 block_dims(nth, 1, 1);
|
||||
|
@ -7895,7 +8178,7 @@ static void soft_max_f16_cuda(const float * x, const float * y, float * dst, con
|
|||
}
|
||||
}
|
||||
|
||||
static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
||||
static void soft_max_f32_cuda(const float * x, const half * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
||||
int nth = WARP_SIZE;
|
||||
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
||||
const dim3 block_dims(nth, 1, 1);
|
||||
|
@ -7938,14 +8221,15 @@ static void soft_max_f32_cuda(const float * x, const float * y, float * dst, con
|
|||
}
|
||||
}
|
||||
|
||||
static void im2col_f32_f16_cuda(const float* x, half* dst,
|
||||
template <typename T>
|
||||
static void im2col_cuda(const float* x, T* dst,
|
||||
int IW, int IH, int OW, int OH, int KW, int KH, int IC,
|
||||
int offset_delta,
|
||||
int batch, int batch_offset, int offset_delta,
|
||||
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
|
||||
const int parallel_elements = OW * KW * KH;
|
||||
const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
|
||||
dim3 block_nums(num_blocks, OH, IC);
|
||||
im2col_f32_f16<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
|
||||
dim3 block_nums(num_blocks, OH, batch * IC);
|
||||
im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
|
||||
}
|
||||
|
||||
static void flash_attn_f32_cuda(const float* q, const float* k,const float* v, float* dst, float kq_scale, const int d_head, const int seq_len, const int num_heads, cudaStream_t stream) {
|
||||
|
@ -8537,6 +8821,34 @@ static void ggml_cuda_op_relu(
|
|||
(void) src1_dd;
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_hardsigmoid(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
||||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_hardswish(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
||||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_leaky_relu(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
||||
|
@ -8770,6 +9082,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
|||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
|
@ -8792,6 +9105,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
|
|||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
return max_compute_capability >= CC_VOLTA ? 128 : 64;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return 64;
|
||||
|
@ -8863,6 +9177,9 @@ static void ggml_cuda_op_mul_mat_vec_q(
|
|||
case GGML_TYPE_IQ2_XS:
|
||||
mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
|
@ -9163,13 +9480,46 @@ static void ggml_cuda_op_alibi(
|
|||
(void) src1_dd;
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_pool2d(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int32_t * opts = (const int32_t *)dst->op_params;
|
||||
enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
|
||||
const int k0 = opts[1];
|
||||
const int k1 = opts[2];
|
||||
const int s0 = opts[3];
|
||||
const int s1 = opts[4];
|
||||
const int p0 = opts[5];
|
||||
const int p1 = opts[6];
|
||||
|
||||
const int64_t IH = src0->ne[1];
|
||||
const int64_t IW = src0->ne[0];
|
||||
|
||||
const int64_t N = dst->ne[3];
|
||||
const int64_t OC = dst->ne[2];
|
||||
const int64_t OH = dst->ne[1];
|
||||
const int64_t OW = dst->ne[0];
|
||||
|
||||
const int parallel_elements = N * OC * OH * OW;
|
||||
const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
|
||||
dim3 block_nums(num_blocks);
|
||||
pool2d_nchw_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, main_stream>>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op);
|
||||
|
||||
(void) src1;
|
||||
(void) src1_dd;
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_im2col(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
|
||||
|
@ -9191,8 +9541,14 @@ static void ggml_cuda_op_im2col(
|
|||
const int64_t OW = dst->ne[1];
|
||||
|
||||
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
const int64_t batch = src1->ne[3];
const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32

im2col_f32_f16_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
if(dst->type == GGML_TYPE_F16) {
im2col_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
} else {
im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
}

(void) src0;
(void) src0_dd;

@@ -9261,7 +9617,7 @@ static void ggml_cuda_op_soft_max(
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);

GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16); // src1 contains mask and it is optional

const int64_t ne00 = src0->ne[0];
const int64_t nrows_x = ggml_nrows(src0);

@@ -9281,9 +9637,9 @@ static void ggml_cuda_op_soft_max(
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX

if (use_f16_soft_max) {
soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
soft_max_f16_cuda(src0_dd, src1 ? (const half *) src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
} else {
soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
soft_max_f32_cuda(src0_dd, src1 ? (const half *) src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
}

(void) dst;

@@ -9788,6 +10144,13 @@ static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, g
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
}

static void ggml_cuda_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardsigmoid);
}

static void ggml_cuda_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardswish);
}
static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
}

@@ -10658,19 +11021,25 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg

const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
GGML_ASSERT(src0->ne[3] == 1);
const int64_t ne02 = src0->ne[2];

//GGML_ASSERT(src0->ne[3] == 1);

const int64_t nb00 = src0->nb[0];
const int64_t nb01 = src0->nb[1];
const int64_t nb02 = src0->nb[2];
const int64_t nb03 = src0->nb[3];

const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
GGML_ASSERT(src1->ne[3] == 1);
const int64_t ne12 = src1->ne[2];

//GGML_ASSERT(src1->ne[3] == 1);

const int64_t nb10 = src1->nb[0];
const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2];
const int64_t nb13 = src1->nb[3];

ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

@@ -10682,17 +11051,19 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
char * src1_ddc = (char *) src1_extra->data_device[g_main_device];

if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));

@@ -10725,6 +11096,10 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
}

static void ggml_cuda_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pool2d);
}

static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
}

@@ -10826,6 +11201,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
case GGML_UNARY_OP_RELU:
func = ggml_cuda_relu;
break;
case GGML_UNARY_OP_HARDSIGMOID:
func = ggml_cuda_hardsigmoid;
break;
case GGML_UNARY_OP_HARDSWISH:
func = ggml_cuda_hardswish;
break;
default:
return false;
}

@@ -10900,6 +11281,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
case GGML_OP_IM2COL:
func = ggml_cuda_im2col;
break;
case GGML_OP_POOL_2D:
func = ggml_cuda_pool2d;
break;
case GGML_OP_SUM_ROWS:
func = ggml_cuda_sum_rows;
break;

@@ -11638,6 +12022,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_UNARY_OP_GELU:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_RELU:
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_TANH:
return true;

@@ -11661,7 +12047,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
return false;
}
ggml_type a_type = a->type;
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS) {
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS) {
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
return false;
}

@@ -11705,6 +12091,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
return true;
}
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
return true;
}
return false;
} break;
case GGML_OP_DUP:

@@ -11733,6 +12122,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_ROPE:
case GGML_OP_ALIBI:
case GGML_OP_IM2COL:
case GGML_OP_POOL_2D:
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
1990  ggml-kompute.cpp  Normal file  (diff suppressed because it is too large)

46  ggml-kompute.h  Normal file
@@ -0,0 +1,46 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_vk_device {
int index;
int type; // same as VkPhysicalDeviceType
size_t heapSize;
const char * name;
const char * vendor;
int subgroupSize;
uint64_t bufferAlignment;
uint64_t maxAlloc;
};

struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
bool ggml_vk_has_vulkan(void);
bool ggml_vk_has_device(void);
struct ggml_vk_device ggml_vk_current_device(void);

//
// backend API
//

// forward declaration
typedef struct ggml_backend * ggml_backend_t;

GGML_API ggml_backend_t ggml_backend_kompute_init(int device);

GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);

GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);

#ifdef __cplusplus
}
#endif
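The header above only declares the new Kompute API; below is a minimal usage sketch in C, not part of this commit. It assumes the existing ggml-backend entry points ggml_backend_graph_compute and ggml_backend_free from ggml-backend.h, and that the device list returned by ggml_vk_available_devices is heap-allocated.

// Sketch only: enumerate Vulkan devices through the new Kompute API and create a backend.
#include <stdio.h>
#include <stdlib.h>
#include "ggml-backend.h"
#include "ggml-kompute.h"

int main(void) {
    if (!ggml_vk_has_vulkan()) {
        fprintf(stderr, "no Vulkan loader/driver available\n");
        return 1;
    }

    size_t count = 0;
    struct ggml_vk_device * devices = ggml_vk_available_devices(0 /* memoryRequired */, &count);
    for (size_t i = 0; i < count; ++i) {
        printf("device %d: %s (%s), heap %zu MiB, subgroup %d\n",
               devices[i].index, devices[i].name, devices[i].vendor,
               devices[i].heapSize / (1024u*1024u), devices[i].subgroupSize);
    }
    if (count == 0) {
        free(devices);
        return 1;
    }

    ggml_backend_t backend = ggml_backend_kompute_init(devices[0].index);
    if (backend == NULL || !ggml_backend_is_kompute(backend)) {
        fprintf(stderr, "failed to initialize the Kompute backend\n");
        free(devices);
        return 1;
    }

    // ... build a ggml graph and evaluate it with ggml_backend_graph_compute(backend, graph) ...

    ggml_backend_free(backend);
    free(devices); // assumes the list is malloc-allocated by ggml_vk_available_devices
    return 0;
}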
@@ -57,6 +57,9 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(voi
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);

// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);

#ifdef __cplusplus
}
#endif
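For the capture hook declared just above, a hedged sketch of how it might be driven from application code, assuming the existing ggml_backend_metal_init, ggml_backend_graph_compute and ggml_backend_free APIs and an already-built ggml_cgraph * gf:

// Sketch only: capture the command buffers of the next graph evaluation.
ggml_backend_t backend = ggml_backend_metal_init();
if (backend != NULL) {
    ggml_backend_metal_capture_next_compute(backend); // arm the capture for the next compute only
    ggml_backend_graph_compute(backend, gf);          // command buffers of this call are captured
    ggml_backend_graph_compute(backend, gf);          // later calls run without capture
    ggml_backend_free(backend);
}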
104  ggml-metal.m
@ -60,6 +60,7 @@ enum ggml_metal_kernel_type {
|
|||
GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
|
||||
GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
|
||||
GGML_METAL_KERNEL_TYPE_RMS_NORM,
|
||||
GGML_METAL_KERNEL_TYPE_GROUP_NORM,
|
||||
|
@ -81,6 +82,7 @@ enum ggml_metal_kernel_type {
|
|||
GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
|
||||
//GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,
|
||||
|
@ -98,6 +100,7 @@ enum ggml_metal_kernel_type {
|
|||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,
|
||||
|
@ -112,6 +115,7 @@ enum ggml_metal_kernel_type {
|
|||
GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
|
||||
|
@ -126,10 +130,12 @@ enum ggml_metal_kernel_type {
|
|||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ROPE_F16,
|
||||
GGML_METAL_KERNEL_TYPE_ALIBI_F32,
|
||||
GGML_METAL_KERNEL_TYPE_IM2COL_F16,
|
||||
GGML_METAL_KERNEL_TYPE_IM2COL_F32,
|
||||
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
|
||||
GGML_METAL_KERNEL_TYPE_PAD_F32,
|
||||
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
||||
|
@ -169,6 +175,8 @@ struct ggml_metal_context {
|
|||
|
||||
bool support_simdgroup_reduction;
|
||||
bool support_simdgroup_mm;
|
||||
|
||||
bool should_capture_next_compute;
|
||||
};
|
||||
|
||||
// MSL code
|
||||
|
@ -355,6 +363,8 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n", __func__, ctx->support_simdgroup_mm ? "true" : "false");
|
||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
|
||||
ctx->should_capture_next_compute = false;
|
||||
|
||||
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
||||
if (@available(macOS 10.12, iOS 16.0, *)) {
|
||||
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
|
||||
|
@ -431,6 +441,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, get_rows_q6_K, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction);
|
||||
|
@ -452,6 +463,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction);
|
||||
|
@ -469,6 +481,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm);
|
||||
|
@ -483,6 +496,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm);
|
||||
|
@ -497,10 +511,12 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
||||
|
@ -631,6 +647,10 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
|||
case GGML_OP_ALIBI:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_IM2COL:
|
||||
return true;
|
||||
case GGML_OP_POOL_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
return false;
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_ARGSORT:
|
||||
|
@ -693,6 +713,20 @@ static bool ggml_metal_graph_compute(
|
|||
const int n_cb = ctx->n_cb;
|
||||
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
|
||||
|
||||
const bool should_capture = ctx->should_capture_next_compute;
|
||||
if (should_capture) {
|
||||
ctx->should_capture_next_compute = false;
|
||||
|
||||
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
|
||||
descriptor.captureObject = ctx->queue;
|
||||
|
||||
NSError * error = nil;
|
||||
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
|
||||
GGML_ASSERT(!"capture failed");
|
||||
}
|
||||
}
|
||||
|
||||
id<MTLCommandBuffer> command_buffer_builder[n_cb];
|
||||
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
||||
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
|
||||
|
@ -701,6 +735,7 @@ static bool ggml_metal_graph_compute(
|
|||
// enqueue the command buffers in order to specify their execution order
|
||||
[command_buffer enqueue];
|
||||
}
|
||||
|
||||
const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
|
||||
|
||||
dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) {
|
||||
|
@ -747,9 +782,9 @@ static bool ggml_metal_graph_compute(
|
|||
GGML_ASSERT(!"unsupported op");
|
||||
}
|
||||
|
||||
#ifndef GGML_METAL_NDEBUG
|
||||
if (should_capture) {
|
||||
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
|
||||
#endif
|
||||
}
|
||||
|
||||
const int64_t ne00 = src0 ? src0->ne[0] : 0;
|
||||
const int64_t ne01 = src0 ? src0->ne[1] : 0;
|
||||
|
@ -1152,6 +1187,8 @@ static bool ggml_metal_graph_compute(
|
|||
} break;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
{
|
||||
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16);
|
||||
|
||||
int nth = 32; // SIMD width
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
@ -1276,6 +1313,7 @@ static bool ggml_metal_graph_compute(
|
|||
case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
||||
}
|
||||
|
||||
|
@ -1404,6 +1442,12 @@ static bool ggml_metal_graph_compute(
|
|||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
||||
|
@ -1446,6 +1490,11 @@ static bool ggml_metal_graph_compute(
|
|||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_IQ3_XXS) {
|
||||
const int mem_size = 256*4+128;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
|
@ -1540,6 +1589,7 @@ static bool ggml_metal_graph_compute(
|
|||
case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
|
||||
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
|
||||
}
|
||||
|
||||
|
@ -1671,6 +1721,12 @@ static bool ggml_metal_graph_compute(
|
|||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline;
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
nth0 = 4;
|
||||
nth1 = 16;
|
||||
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline;
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
|
||||
|
@ -1729,6 +1785,11 @@ static bool ggml_metal_graph_compute(
|
|||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_IQ3_XXS) {
|
||||
const int mem_size = 256*4+128;
|
||||
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src2t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
|
@ -1769,6 +1830,7 @@ static bool ggml_metal_graph_compute(
|
|||
case GGML_TYPE_Q6_K: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K ].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
|
||||
case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
|
||||
case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
|
||||
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
|
||||
default: GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
|
@ -1977,7 +2039,7 @@ static bool ggml_metal_graph_compute(
|
|||
{
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
|
||||
|
@ -1985,6 +2047,7 @@ static bool ggml_metal_graph_compute(
|
|||
const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
|
||||
const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
|
||||
const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
|
||||
|
||||
const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
|
||||
|
||||
const int32_t N = src1->ne[is_2D ? 3 : 2];
|
||||
|
@ -2005,8 +2068,8 @@ static bool ggml_metal_graph_compute(
|
|||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break;
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
|
||||
default: GGML_ASSERT(false);
|
||||
};
|
||||
|
@ -2152,6 +2215,10 @@ static bool ggml_metal_graph_compute(
|
|||
|
||||
id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
|
||||
|
||||
GGML_ASSERT(!src3 || src3->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
|
||||
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
|
||||
|
||||
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
|
||||
const int64_t ne31 = src3 ? src3->ne[1] : 0;
|
||||
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
|
||||
|
@ -2299,9 +2366,9 @@ static bool ggml_metal_graph_compute(
|
|||
}
|
||||
}
|
||||
|
||||
#ifndef GGML_METAL_NDEBUG
|
||||
if (should_capture) {
|
||||
[encoder popDebugGroup];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
[encoder endEncoding];
|
||||
|
@ -2323,6 +2390,10 @@ static bool ggml_metal_graph_compute(
|
|||
}
|
||||
}
|
||||
|
||||
if (should_capture) {
|
||||
[[MTLCaptureManager sharedCaptureManager] stopCapture];
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -2498,6 +2569,16 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backen
|
|||
UNUSED(buft);
|
||||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
id<MTLDevice> device = ggml_backend_metal_get_device();
|
||||
size_t max_size = device.maxBufferLength;
|
||||
ggml_backend_metal_free_device();
|
||||
|
||||
return max_size;
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
||||
|
||||
|
@ -2516,7 +2597,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
|||
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
|
||||
/* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
|
||||
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
|
||||
|
@ -2690,6 +2771,13 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
|||
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
|
||||
}
|
||||
|
||||
void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
|
||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||
|
||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||
ctx->should_capture_next_compute = true;
|
||||
}
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
|
||||
349  ggml-metal.metal
@ -349,9 +349,9 @@ kernel void kernel_sum_rows(
|
|||
}
|
||||
|
||||
kernel void kernel_soft_max(
|
||||
device const float * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
|
@ -366,9 +366,9 @@ kernel void kernel_soft_max(
|
|||
const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
|
||||
const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
device const float * pmask = src1 != src0 ? src1 + i01*ne00 : nullptr;
|
||||
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
device const half * pmask = src1 != src0 ? (device const half *) src1 + i01*ne00 : nullptr;
|
||||
device float * pdst = (device float *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
|
||||
// parallel max
|
||||
float lmax = -INFINITY;
|
||||
|
@ -435,9 +435,9 @@ kernel void kernel_soft_max(
|
|||
}
|
||||
|
||||
kernel void kernel_soft_max_4(
|
||||
device const float * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
|
@ -452,15 +452,15 @@ kernel void kernel_soft_max_4(
|
|||
const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
|
||||
const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 + i01*ne00) : nullptr;
|
||||
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
|
||||
device const half4 * pmask = src1 != src0 ? (device const half4 *) src1 + i01*ne00/4 : nullptr;
|
||||
device float4 * pdst4 = (device float4 *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
|
||||
|
||||
// parallel max
|
||||
float4 lmax4 = -INFINITY;
|
||||
|
||||
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
||||
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
|
||||
lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4) (pmask ? pmask[i00] : 0.0f));
|
||||
}
|
||||
|
||||
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
|
||||
|
@ -486,7 +486,7 @@ kernel void kernel_soft_max_4(
|
|||
// parallel sum
|
||||
float4 lsum4 = 0.0f;
|
||||
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
||||
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
|
||||
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4) (pmask ? pmask[i00] : 0.0f)) - max_val);
|
||||
lsum4 += exp_psrc4;
|
||||
pdst4[i00] = exp_psrc4;
|
||||
}
|
||||
|
@ -1775,9 +1775,29 @@ kernel void kernel_rope(
|
|||
template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope<float>;
|
||||
template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope<half>;
|
||||
|
||||
kernel void kernel_im2col_f16(
|
||||
typedef void (im2col_t)(
|
||||
device const float * x,
|
||||
device half * dst,
|
||||
device char * dst,
|
||||
constant int32_t & ofs0,
|
||||
constant int32_t & ofs1,
|
||||
constant int32_t & IW,
|
||||
constant int32_t & IH,
|
||||
constant int32_t & CHW,
|
||||
constant int32_t & s0,
|
||||
constant int32_t & s1,
|
||||
constant int32_t & p0,
|
||||
constant int32_t & p1,
|
||||
constant int32_t & d0,
|
||||
constant int32_t & d1,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]);
|
||||
|
||||
template <typename T>
|
||||
kernel void kernel_im2col(
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
constant int32_t & ofs0,
|
||||
constant int32_t & ofs1,
|
||||
constant int32_t & IW,
|
||||
|
@ -1800,14 +1820,19 @@ kernel void kernel_im2col_f16(
|
|||
(tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW +
|
||||
(tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]);
|
||||
|
||||
device T * pdst = (device T *) (dst);
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = 0.0f;
|
||||
pdst[offset_dst] = 0.0f;
|
||||
} else {
|
||||
const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1;
|
||||
dst[offset_dst] = x[offset_src + iih * IW + iiw];
|
||||
pdst[offset_dst] = x[offset_src + iih * IW + iiw];
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
|
||||
template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
|
||||
|
||||
kernel void kernel_upscale_f32(
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
|
@ -2084,8 +2109,8 @@ kernel void kernel_flash_attn_ext_f16(
|
|||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
{
|
||||
half S[Q] = { 0.0h };
|
||||
half M[Q] = { -INFINITY };
|
||||
half S[Q] = { [0 ... Q-1] = 0.0h };
|
||||
half M[Q] = { [0 ... Q-1] = -INFINITY };
|
||||
|
||||
// assume K and V are same shape
|
||||
const int64_t ne22 = ne12;
|
||||
|
@ -2119,13 +2144,11 @@ kernel void kernel_flash_attn_ext_f16(
|
|||
}
|
||||
}
|
||||
|
||||
const int64_t ir = iq3*ne02*ne01 + iq2*ne01 + iq1;
|
||||
|
||||
// pointer to the mask
|
||||
device const float * mp = (device const float *) (mask + (ir%ne31)*nb31);
|
||||
device const half * mp = (device const half *) (mask + iq1*nb31);
|
||||
|
||||
// prepare diagonal scale matrix
|
||||
simdgroup_float8x8 mscale(scale);
|
||||
simdgroup_half8x8 mscale(scale);
|
||||
|
||||
// loop over the KV cache
|
||||
// each simdgroup handles blocks of Q rows and C columns
|
||||
|
@ -2151,8 +2174,8 @@ kernel void kernel_flash_attn_ext_f16(
|
|||
|
||||
// mqk = mqk*scale + mask
|
||||
for (int64_t j = 0; j < Q8; ++j) {
|
||||
simdgroup_float8x8 mm;
|
||||
simdgroup_load(mm, mp + 8*j*(nb31/sizeof(float)) + ic + 8*cc, nb31/sizeof(float), 0, false);
|
||||
simdgroup_half8x8 mm;
|
||||
simdgroup_load(mm, mp + 8*j*(nb31/sizeof(half)) + ic + 8*cc, nb31/sizeof(half), 0, false);
|
||||
simdgroup_multiply_accumulate(mqk[j], mqk[j], mscale, mm);
|
||||
|
||||
simdgroup_store(mqk[j], ss + 8*j*T + 8*cc, T, 0, false);
|
||||
|
@ -2856,6 +2879,12 @@ typedef struct {
|
|||
} block_iq2_xs;
|
||||
// 74 bytes / block for QK_K = 256, so 2.3125 bpw
|
||||
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[3*QK_K/8];
|
||||
} block_iq3_xxs;
|
||||
// 98 bytes / block for QK_K = 256, so 3.0625 bpw
|
||||
|
||||
//====================================== dot products =========================
|
||||
|
||||
void kernel_mul_mv_q2_K_f32_impl(
|
||||
|
@ -4078,6 +4107,42 @@ constexpr constant static uint64_t iq2xs_grid[512] = {
|
|||
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
constexpr constant static uint32_t iq3xxs_grid[256] = {
|
||||
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
||||
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
||||
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
||||
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
|
||||
0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
|
||||
0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
||||
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
|
||||
0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
|
||||
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
||||
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
|
||||
0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
|
||||
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
||||
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
|
||||
0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
|
||||
0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
||||
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
|
||||
0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
|
||||
0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
||||
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
|
||||
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
|
||||
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
||||
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
|
||||
0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
|
||||
0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
||||
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
|
||||
0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
|
||||
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
||||
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
|
||||
0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
|
||||
0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
||||
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
|
||||
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
||||
};
|
||||
|
||||
|
||||
constexpr constant static uint8_t ksigns_iq2xs[128] = {
|
||||
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
|
||||
144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
|
||||
|
@ -4367,6 +4432,143 @@ kernel void kernel_mul_mv_iq2_xs_f32(
|
|||
kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
void kernel_mul_mv_iq3_xxs_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
|
||||
const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
||||
const int ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
device const block_iq3_xxs * x = (device const block_iq3_xxs *) src0 + ib_row + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float yl[32];
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
|
||||
const int nb32 = nb * (QK_K / 32);
|
||||
|
||||
threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values;
|
||||
threadgroup uint8_t * shared_signs = (threadgroup uint8_t *)(values + 256);
|
||||
{
|
||||
int nval = 4;
|
||||
int pos = (32*sgitg + tiisg)*nval;
|
||||
for (int i = 0; i < nval; ++i) values[pos + i] = iq3xxs_grid[pos + i];
|
||||
nval = 2;
|
||||
pos = (32*sgitg + tiisg)*nval;
|
||||
for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i];
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
const int ix = tiisg;
|
||||
|
||||
device const float * y4 = y + 32 * ix;
|
||||
|
||||
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
||||
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
yl[i] = y4[i];
|
||||
}
|
||||
|
||||
const int ibl = ib32 / (QK_K / 32);
|
||||
const int ib = ib32 % (QK_K / 32);
|
||||
|
||||
device const block_iq3_xxs * xr = x + ibl;
|
||||
device const uint8_t * q3 = xr->qs + 8 * ib;
|
||||
device const uint16_t * gas = (device const uint16_t *)(xr->qs + QK_K/4) + 2 * ib;
|
||||
device const half * dh = &xr->d;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
|
||||
const float db = dh[0];
|
||||
const uint32_t aux32 = gas[0] | (gas[1] << 16);
|
||||
const float d = db * (0.5f + (aux32 >> 28));
|
||||
|
||||
float2 sum = {0};
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + q3[2*l+0]);
|
||||
const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + q3[2*l+1]);
|
||||
const uint8_t signs = shared_signs[(aux32 >> 7*l) & 127];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sum[0] += yl[8*l + j + 0] * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
sum[1] += yl[8*l + j + 4] * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
sumf[row] += d * (sum[0] + sum[1]);
|
||||
|
||||
dh += nb*sizeof(block_iq3_xxs)/2;
|
||||
q3 += nb*sizeof(block_iq3_xxs);
|
||||
gas += nb*sizeof(block_iq3_xxs)/2;
|
||||
}
|
||||
|
||||
y4 += 32 * 32;
|
||||
}
|
||||
#else
|
||||
// TODO
|
||||
#endif
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_iq3_xxs_f32")]]
|
||||
kernel void kernel_mul_mv_iq3_xxs_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
|
||||
//============================= templates and their specializations =============================
|
||||
|
||||
// NOTE: this is not dequantizing - we are simply fitting the template
|
||||
|
@ -4684,6 +4886,33 @@ void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4
|
|||
}
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x4 & reg) {
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const float d = xb->d;
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
// il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16
|
||||
device const uint8_t * q3 = xb->qs + 8*ib32;
|
||||
device const uint16_t * gas = (device const uint16_t *)(xb->qs + QK_K/4) + 2*ib32;
|
||||
const uint32_t aux32 = gas[0] | (gas[1] << 16);
|
||||
const float dl = d * (0.5f + (aux32 >> 28)) * 0.5f;
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+0]);
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+1]);
|
||||
uint8_t signs = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[0][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
|
||||
reg[1][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
|
||||
}
|
||||
grid1 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+2]);
|
||||
grid2 = (constant uint8_t *)(iq3xxs_grid + q3[4*il+3]);
|
||||
signs = ksigns_iq2xs[(aux32 >> (14*il+7)) & 127];
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
reg[2][i] = dl * grid1[i] * (signs & kmask_iq2xs[i+0] ? -1.f : 1.f);
|
||||
reg[3][i] = dl * grid2[i] * (signs & kmask_iq2xs[i+4] ? -1.f : 1.f);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
||||
kernel void kernel_get_rows(
|
||||
device const void * src0,
|
||||
|
@ -5225,6 +5454,7 @@ template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows
|
|||
template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
|
||||
//
|
||||
// matrix-matrix multiplication
|
||||
|
@ -5263,6 +5493,7 @@ template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm<b
|
|||
template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
|
||||
//
|
||||
// indirect matrix-matrix multiplication
|
||||
|
@ -5313,6 +5544,7 @@ template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mu
|
|||
template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_q6_K, QK_NL, dequantize_q6_K>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs, QK_NL, dequantize_iq2_xs>;
|
||||
template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
|
||||
|
||||
//
|
||||
// matrix-vector multiplication
|
||||
|
@ -6215,3 +6447,68 @@ kernel void kernel_mul_mv_id_iq2_xs_f32(
|
|||
tiisg,
|
||||
sgitg);
|
||||
}
|
||||
|
||||
[[host_name("kernel_mul_mv_id_iq3_xxs_f32")]]
|
||||
kernel void kernel_mul_mv_id_iq3_xxs_f32(
|
||||
device const char * ids,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
constant uint64_t & nbi1,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint64_t & nb1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
constant int & idx,
|
||||
device const char * src00,
|
||||
device const char * src01,
|
||||
device const char * src02,
|
||||
device const char * src03,
|
||||
device const char * src04,
|
||||
device const char * src05,
|
||||
device const char * src06,
|
||||
device const char * src07,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiitg[[thread_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
|
||||
|
||||
const int64_t bid = tgpig.z/(ne12*ne13);
|
||||
|
||||
tgpig.z = tgpig.z%(ne12*ne13);
|
||||
|
||||
const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
|
||||
|
||||
kernel_mul_mv_iq3_xxs_f32_impl(
|
||||
src0[id],
|
||||
(device const float *) (src1 + bid*nb11),
|
||||
dst + bid*ne0,
|
||||
ne00,
|
||||
ne01,
|
||||
ne02,
|
||||
ne10,
|
||||
ne12,
|
||||
ne0,
|
||||
ne1,
|
||||
r2,
|
||||
r3,
|
||||
shared_values,
|
||||
tgpig,
|
||||
tiisg,
|
||||
sgitg);
|
||||
}
@ -2125,6 +2125,15 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
|
|||
GGML_UNUSED(buffer_type);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
|
||||
static size_t max_size = -1;
|
||||
if (max_size == (size_t)-1) {
|
||||
ggml_cl_init();
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_size, NULL);
|
||||
}
|
||||
return max_size;
|
||||
}
|
||||
|
||||
static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
|
||||
//return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
|
||||
return ggml_backend_is_cpu(backend);
|
||||
|
@ -2136,7 +2145,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
|
|||
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
|
||||
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // TODO: return from device info
|
||||
/* .get_max_size = */ ggml_backend_opencl_buffer_type_get_max_size,
|
||||
/* .get_alloc_size = */ NULL,
|
||||
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
|
||||
/* .is_host = */ NULL,
|
||||
721  ggml-quants.c
@ -3441,6 +3441,41 @@ static const uint64_t iq2xs_grid[512] = {
|
|||
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
||||
};
|
||||
|
||||
static const uint32_t iq3xxs_grid[256] = {
|
||||
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
|
||||
0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
|
||||
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
|
||||
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
|
||||
0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
|
||||
0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
|
||||
0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
|
||||
0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
|
||||
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
|
||||
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
|
||||
0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
|
||||
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
|
||||
0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
|
||||
0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
|
||||
0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
|
||||
0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
|
||||
0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
|
||||
0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
|
||||
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
|
||||
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
|
||||
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
|
||||
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
|
||||
0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
|
||||
0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
|
||||
0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
|
||||
0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
|
||||
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
|
||||
0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
|
||||
0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
|
||||
0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
|
||||
0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
|
||||
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
|
||||
};
|
||||
|
||||
static const uint8_t ksigns_iq2xs[128] = {
|
||||
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
|
||||
144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
|
||||
|
@ -3507,6 +3542,38 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
|||
}
|
||||
}
|
||||
|
||||
// ====================== 3.0625 bpw (de)-quantization
|
||||
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
const int nb = k / QK_K;
|
||||
|
||||
uint32_t aux32;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
const uint8_t * qs = x[i].qs;
|
||||
const uint8_t * scales_and_signs = qs + QK_K/4;
|
||||
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
memcpy(&aux32, scales_and_signs + 4*ib32, sizeof(uint32_t));
|
||||
const float db = d * (0.5f + (aux32 >> 28)) * 0.5f;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + qs[2*l+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + qs[2*l+1]);
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = db * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = db * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
y += 8;
|
||||
}
|
||||
qs += 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//===================================== Q8_K ==============================================
|
||||
|
||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
||||
|
@ -8458,17 +8525,36 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
|||
|
||||
const __m128i m4 = _mm_set1_epi8(0xf);
|
||||
const __m128i m1 = _mm_set1_epi8(1);
|
||||
const __m128i m511 = _mm_set1_epi16(511);
|
||||
const __m128i m127 = _mm_set1_epi16(127);
|
||||
const __m256i m511 = _mm256_set1_epi16(511);
|
||||
const __m256i mone = _mm256_set1_epi8(1);
|
||||
|
||||
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
||||
static const uint8_t k_bit_helper[32] = {
|
||||
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
||||
0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
|
||||
};
|
||||
static const char block_sign_shuffle_mask_1[32] = {
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||
0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
|
||||
};
|
||||
static const char block_sign_shuffle_mask_2[32] = {
|
||||
0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
|
||||
0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
|
||||
};
|
||||
static const uint8_t bit_selector_mask_bytes[32] = {
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
||||
};
|
||||
|
||||
const __m256i bit_helper = _mm256_loadu_si256((const __m256i*)k_bit_helper);
|
||||
const __m256i bit_selector_mask = _mm256_loadu_si256((const __m256i*)bit_selector_mask_bytes);
|
||||
const __m256i block_sign_shuffle_1 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_1);
|
||||
const __m256i block_sign_shuffle_2 = _mm256_loadu_si256((const __m256i*)block_sign_shuffle_mask_2);
|
||||
|
||||
uint64_t aux64;
|
||||
|
||||
// somewhat hacky, but gives a significant boost in performance
|
||||
__m128i aux_gindex, aux_sindex;
|
||||
__m256i aux_gindex;
|
||||
const uint16_t * gindex = (const uint16_t *)&aux_gindex;
|
||||
const uint16_t * sindex = (const uint16_t *)&aux_sindex;
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
@ -8483,26 +8569,68 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
|||
|
||||
__m256i sumi1 = _mm256_setzero_si256();
|
||||
__m256i sumi2 = _mm256_setzero_si256();
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
|
||||
|
||||
const __m256i q2_data = _mm256_loadu_si256((const __m256i*)q2); q2 += 16;
|
||||
aux_gindex = _mm256_and_si256(q2_data, m511);
|
||||
|
||||
const __m256i partial_sign_bits = _mm256_srli_epi16(q2_data, 9);
|
||||
const __m256i partial_sign_bits_upper = _mm256_srli_epi16(q2_data, 13);
|
||||
const __m256i partial_sign_bits_for_counting = _mm256_xor_si256(partial_sign_bits, partial_sign_bits_upper);
|
||||
|
||||
const __m256i odd_bits = _mm256_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
|
||||
const __m256i full_sign_bits = _mm256_or_si256(partial_sign_bits, odd_bits);
|
||||
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2); q2 += 8;
|
||||
aux_gindex = _mm_and_si128(q2_data, m511);
|
||||
aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127);
|
||||
const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
|
||||
const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
|
||||
const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]);
|
||||
const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]);
|
||||
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
|
||||
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
|
||||
const __m256i q8_3 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q8_4 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
|
||||
const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
|
||||
iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
|
||||
const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
|
||||
iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
|
||||
const __m256i q2_3 = _mm256_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
|
||||
iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
|
||||
const __m256i q2_4 = _mm256_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
|
||||
iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
|
||||
|
||||
const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
|
||||
const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
|
||||
const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l);
|
||||
const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h);
|
||||
|
||||
__m256i signs;
|
||||
signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
|
||||
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
||||
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, _mm256_or_si256(signs, mone));
|
||||
|
||||
signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_2);
|
||||
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
||||
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, _mm256_or_si256(signs, mone));
|
||||
|
||||
signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_1);
|
||||
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
||||
const __m256i q8s_3 = _mm256_sign_epi8(q8_3, _mm256_or_si256(signs, mone));
|
||||
|
||||
signs = _mm256_shuffle_epi8(full_signs_2, block_sign_shuffle_2);
|
||||
signs = _mm256_cmpeq_epi8(_mm256_and_si256(signs, bit_selector_mask), bit_selector_mask);
|
||||
const __m256i q8s_4 = _mm256_sign_epi8(q8_4, _mm256_or_si256(signs, mone));
|
||||
|
||||
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
||||
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
||||
const __m256i dot3 = _mm256_maddubs_epi16(q2_3, q8s_3);
|
||||
const __m256i dot4 = _mm256_maddubs_epi16(q2_4, q8s_4);
|
||||
|
||||
const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
|
||||
const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
|
||||
const __m256i sc3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2)));
|
||||
const __m256i sc4 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3)));
|
||||
|
||||
sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
|
||||
sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
|
||||
sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot3, sc3));
|
||||
sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot4, sc4));
|
||||
}
|
||||
|
||||
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
||||
|
@ -8551,6 +8679,136 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
|||
#endif
|
||||
}
|
||||
|
||||
// TODO
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
||||
assert(n % QK_K == 0);
|
||||
|
||||
const block_iq3_xxs * restrict x = vx;
|
||||
const block_q8_K * restrict y = vy;
|
||||
|
||||
const int nb = n / QK_K;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
||||
|
||||
uint32_t aux32[2];
|
||||
|
||||
ggml_int8x16x4_t q3s;
|
||||
ggml_int8x16x4_t q8b;
|
||||
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * restrict q3 = x[i].qs;
|
||||
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
float sumf1 = 0, sumf2 = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
||||
memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
|
||||
const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
|
||||
const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
|
||||
const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
|
||||
const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
|
||||
q3 += 16;
|
||||
q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
|
||||
q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
|
||||
q3s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
|
||||
q3s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
|
||||
q3s.val[0] = vmulq_s8(q3s.val[0], vreinterpretq_s8_u32(aux32x4_0));
|
||||
q3s.val[1] = vmulq_s8(q3s.val[1], vreinterpretq_s8_u32(aux32x4_1));
|
||||
q3s.val[2] = vmulq_s8(q3s.val[2], vreinterpretq_s8_u32(aux32x4_2));
|
||||
q3s.val[3] = vmulq_s8(q3s.val[3], vreinterpretq_s8_u32(aux32x4_3));
|
||||
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]);
|
||||
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]);
|
||||
sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[0] >> 28));
|
||||
sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[1] >> 28));
|
||||
}
|
||||
sumf += d*(sumf1 + sumf2);
|
||||
}
|
||||
*s = 0.5f * sumf;
|
||||
|
||||
#elif defined(__AVX2__)
|
||||
|
||||
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
||||
|
||||
uint32_t aux32[2];
|
||||
|
||||
__m256 accumf = _mm256_setzero_ps();
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * restrict q3 = x[i].qs;
|
||||
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
__m256i sumi1 = _mm256_setzero_si256();
|
||||
__m256i sumi2 = _mm256_setzero_si256();
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
||||
const __m256i q2_1 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
|
||||
iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
||||
q3 += 8;
|
||||
const __m256i q2_2 = _mm256_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
|
||||
iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
||||
q3 += 8;
|
||||
memcpy(aux32, gas, 8); gas += 8;
|
||||
const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
|
||||
signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
|
||||
const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
|
||||
signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
|
||||
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
|
||||
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
|
||||
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
||||
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
||||
const uint16_t ls1 = aux32[0] >> 28;
|
||||
const uint16_t ls2 = aux32[1] >> 28;
|
||||
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
|
||||
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
|
||||
sumi1 = _mm256_add_epi32(sumi1, p1);
|
||||
sumi2 = _mm256_add_epi32(sumi2, p2);
|
||||
}
|
||||
|
||||
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
||||
|
||||
}
|
||||
|
||||
*s = 0.25f * hsum_float_8(accumf);
|
||||
|
||||
#else
|
||||
|
||||
uint32_t aux32;
|
||||
|
||||
float sumf = 0.f;
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
||||
const uint8_t * restrict q3 = x[i].qs;
|
||||
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
int32_t bsum = 0;
|
||||
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
||||
memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
|
||||
const uint32_t ls = 2*(aux32 >> 28) + 1;
|
||||
int32_t sumi = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
|
||||
sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
q3 += 8;
|
||||
bsum += sumi * ls;
|
||||
}
|
||||
sumf += d * bsum;
|
||||
}
|
||||
    *s = 0.25f * sumf;
#endif
}
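
A side note on the trailing 0.25f factors in ggml_vec_dot_iq3_xxs_q8_K (my explanation, not text from the diff): the dot-product paths use the integer scale 2*(aux32 >> 28) + 1 and defer a 0.25f factor to the end, which gives the same effective block scale (0.5f + s) * 0.5f that dequantize_row_iq3_xxs applies directly. A tiny check of that identity, assuming a 4-bit scale s:

#include <cassert>

int main() {
    for (int s = 0; s < 16; ++s) {
        // vec_dot path:      0.25f * (2*s + 1)
        // dequantize path:   (0.5f + s) * 0.5f
        assert(0.25f * (2*s + 1) == (0.5f + s) * 0.5f);
    }
    return 0;
}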
|
||||
|
||||
// ================================ IQ2 quantization =============================================
|
||||
|
||||
typedef struct {
|
||||
|
@ -9189,3 +9447,436 @@ size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, i
|
|||
return nrow * nblock * sizeof(block_iq2_xs);
|
||||
}
|
||||
|
||||
//
|
||||
// ============================================= 3-bit using D4 lattice
|
||||
//
|
||||
|
||||
typedef struct {
|
||||
uint32_t * grid;
|
||||
int * map;
|
||||
uint16_t * neighbours;
|
||||
} iq3_entry_t;
|
||||
|
||||
static iq3_entry_t iq3_data[1] = {
|
||||
{NULL, NULL, NULL},
|
||||
};
|
||||
|
||||
static inline int iq3_data_index(int grid_size) {
|
||||
(void)grid_size;
|
||||
GGML_ASSERT(grid_size == 256);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int iq3_compare_func(const void * left, const void * right) {
|
||||
const int * l = (const int *)left;
|
||||
const int * r = (const int *)right;
|
||||
return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
|
||||
}
|
||||
|
||||
void iq3xs_init_impl(int grid_size) {
|
||||
const int gindex = iq3_data_index(grid_size);
|
||||
if (iq3_data[gindex].grid) {
|
||||
return;
|
||||
}
|
||||
static const uint16_t kgrid_256[256] = {
|
||||
0, 2, 4, 9, 11, 15, 16, 18, 25, 34, 59, 61, 65, 67, 72, 74,
|
||||
81, 85, 88, 90, 97, 108, 120, 128, 130, 132, 137, 144, 146, 153, 155, 159,
|
||||
169, 175, 189, 193, 199, 200, 202, 213, 248, 267, 287, 292, 303, 315, 317, 321,
|
||||
327, 346, 362, 413, 436, 456, 460, 462, 483, 497, 513, 515, 520, 522, 529, 531,
|
||||
536, 538, 540, 551, 552, 576, 578, 585, 592, 594, 641, 643, 648, 650, 657, 664,
|
||||
698, 704, 706, 720, 729, 742, 758, 769, 773, 808, 848, 852, 870, 889, 901, 978,
|
||||
992, 1024, 1026, 1033, 1035, 1040, 1042, 1046, 1049, 1058, 1089, 1091, 1093, 1096, 1098, 1105,
|
||||
1112, 1139, 1143, 1144, 1152, 1154, 1161, 1167, 1168, 1170, 1183, 1184, 1197, 1217, 1224, 1228,
|
||||
1272, 1276, 1309, 1323, 1347, 1367, 1377, 1404, 1473, 1475, 1486, 1509, 1537, 1544, 1546, 1553,
|
||||
1555, 1576, 1589, 1594, 1600, 1602, 1616, 1625, 1636, 1638, 1665, 1667, 1672, 1685, 1706, 1722,
|
||||
1737, 1755, 1816, 1831, 1850, 1856, 1862, 1874, 1901, 1932, 1950, 1971, 2011, 2032, 2052, 2063,
|
||||
2077, 2079, 2091, 2095, 2172, 2192, 2207, 2208, 2224, 2230, 2247, 2277, 2308, 2345, 2356, 2389,
|
||||
2403, 2424, 2501, 2504, 2506, 2520, 2570, 2593, 2616, 2624, 2630, 2646, 2669, 2700, 2714, 2746,
|
||||
2754, 2795, 2824, 2835, 2839, 2874, 2882, 2905, 2984, 3028, 3042, 3092, 3108, 3110, 3124, 3153,
|
||||
3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610,
|
||||
3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992,
|
||||
};
|
||||
const int kmap_size = 4096;
|
||||
const int nwant = 2;
|
||||
const uint16_t * kgrid = kgrid_256;
|
||||
uint32_t * kgrid_q3xs;
|
||||
int * kmap_q3xs;
|
||||
uint16_t * kneighbors_q3xs;
|
||||
|
||||
printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
||||
uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
|
||||
for (int k = 0; k < grid_size; ++k) {
|
||||
int8_t * pos = (int8_t *)(the_grid + k);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
int l = (kgrid[k] >> 3*i) & 0x7;
|
||||
pos[i] = 2*l + 1;
|
||||
}
|
||||
}
|
||||
kgrid_q3xs = the_grid;
|
||||
iq3_data[gindex].grid = the_grid;
|
||||
kmap_q3xs = (int *)malloc(kmap_size*sizeof(int));
|
||||
iq3_data[gindex].map = kmap_q3xs;
|
||||
for (int i = 0; i < kmap_size; ++i) kmap_q3xs[i] = -1;
|
||||
uint32_t aux32;
|
||||
uint8_t * aux8 = (uint8_t *)&aux32;
|
||||
for (int i = 0; i < grid_size; ++i) {
|
||||
aux32 = kgrid_q3xs[i];
|
||||
uint16_t index = 0;
|
||||
for (int k=0; k<4; ++k) {
|
||||
uint16_t q = (aux8[k] - 1)/2;
|
||||
index |= (q << 3*k);
|
||||
}
|
||||
kmap_q3xs[index] = i;
|
||||
}
|
||||
int8_t pos[4];
|
||||
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
|
||||
int num_neighbors = 0, num_not_in_map = 0;
|
||||
for (int i = 0; i < kmap_size; ++i) {
|
||||
if (kmap_q3xs[i] >= 0) continue;
|
||||
++num_not_in_map;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
int l = (i >> 3*k) & 0x7;
|
||||
pos[k] = 2*l + 1;
|
||||
}
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
|
||||
int d2 = 0;
|
||||
for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
||||
dist2[2*j+0] = d2;
|
||||
dist2[2*j+1] = j;
|
||||
}
|
||||
qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
|
||||
int n = 0; int d2 = dist2[0];
|
||||
int nhave = 1;
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
if (dist2[2*j] > d2) {
|
||||
if (nhave == nwant) break;
|
||||
d2 = dist2[2*j];
|
||||
++nhave;
|
||||
}
|
||||
++n;
|
||||
}
|
||||
num_neighbors += n;
|
||||
}
|
||||
printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
||||
kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
||||
iq3_data[gindex].neighbours = kneighbors_q3xs;
|
||||
int counter = 0;
|
||||
for (int i = 0; i < kmap_size; ++i) {
|
||||
if (kmap_q3xs[i] >= 0) continue;
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
int l = (i >> 3*k) & 0x7;
|
||||
pos[k] = 2*l + 1;
|
||||
}
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q3xs + j);
|
||||
int d2 = 0;
|
||||
for (int k = 0; k < 4; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
||||
dist2[2*j+0] = d2;
|
||||
dist2[2*j+1] = j;
|
||||
}
|
||||
qsort(dist2, grid_size, 2*sizeof(int), iq3_compare_func);
|
||||
kmap_q3xs[i] = -(counter + 1);
|
||||
int d2 = dist2[0];
|
||||
uint16_t * start = &kneighbors_q3xs[counter++];
|
||||
int n = 0, nhave = 1;
|
||||
for (int j = 0; j < grid_size; ++j) {
|
||||
if (dist2[2*j] > d2) {
|
||||
if (nhave == nwant) break;
|
||||
d2 = dist2[2*j];
|
||||
++nhave;
|
||||
}
|
||||
kneighbors_q3xs[counter++] = dist2[2*j+1];
|
||||
++n;
|
||||
}
|
||||
        *start = n;
    }
    free(dist2);
}
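
A reading note on the tables built by iq3xs_init_impl (my summary, not part of the diff): kmap_q3xs[u] holds the grid position for on-grid packed indices u, and -(offset + 1) for off-grid ones, where offset points into kneighbors_q3xs and the first entry there is the neighbour count. A minimal sketch of how a consumer resolves an off-grid index, the same pattern quantize_row_iq3_xxs_impl uses further down:

#include <cstdint>

// sketch: map a packed 12-bit index u to a grid entry using the tables above
static int resolve_iq3_index(const int * kmap, const uint16_t * kneighbors, uint16_t u) {
    int grid_index = kmap[u];
    if (grid_index < 0) {
        // off-grid: kmap stores -(offset into kneighbors) - 1
        const uint16_t * neighbours = kneighbors - grid_index - 1;
        const int n = neighbours[0];             // number of candidate neighbours
        grid_index = n > 0 ? neighbours[1] : 0;  // e.g. take the closest one
    }
    return grid_index;
}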
|
||||
|
||||
void iq3xs_free_impl(int grid_size) {
|
||||
GGML_ASSERT(grid_size == 256);
|
||||
const int gindex = iq3_data_index(grid_size);
|
||||
if (iq3_data[gindex].grid) {
|
||||
free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL;
|
||||
free(iq3_data[gindex].map); iq3_data[gindex].map = NULL;
|
||||
free(iq3_data[gindex].neighbours); iq3_data[gindex].neighbours = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid,
|
||||
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
|
||||
int num_neighbors = neighbours[0];
|
||||
GGML_ASSERT(num_neighbors > 0);
|
||||
float best_d2 = FLT_MAX;
|
||||
int grid_index = -1;
|
||||
for (int j = 1; j <= num_neighbors; ++j) {
|
||||
const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
|
||||
float d2 = 0;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
float q = pg[i];
|
||||
float diff = scale*q - xval[i];
|
||||
d2 += weight[i]*diff*diff;
|
||||
}
|
||||
if (d2 < best_d2) {
|
||||
best_d2 = d2; grid_index = neighbours[j];
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(grid_index >= 0);
|
||||
const int8_t * pg = (const int8_t *)(grid + grid_index);
|
||||
for (int i = 0; i < 4; ++i) L[i] = (pg[i] - 1)/2;
|
||||
return grid_index;
|
||||
}
|
||||
|
||||
static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
||||
|
||||
const int gindex = iq3_data_index(256);
|
||||
|
||||
const uint32_t * kgrid_q3xs = iq3_data[gindex].grid;
|
||||
const int * kmap_q3xs = iq3_data[gindex].map;
|
||||
const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours;
|
||||
|
||||
//GGML_ASSERT(quant_weights && "missing quantization weights");
|
||||
GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
|
||||
const int kMaxQ = 8;
|
||||
|
||||
const int nbl = n/256;
|
||||
|
||||
block_iq3_xxs * y = vy;
|
||||
|
||||
float scales[QK_K/32];
|
||||
float weight[32];
|
||||
float xval[32];
|
||||
int8_t L[32];
|
||||
int8_t Laux[32];
|
||||
float waux[32];
|
||||
bool is_on_grid[8];
|
||||
bool is_on_grid_aux[8];
|
||||
uint8_t block_signs[8];
|
||||
uint8_t q3[3*(QK_K/8)];
|
||||
uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4);
|
||||
|
||||
for (int ibl = 0; ibl < nbl; ++ibl) {
|
||||
|
||||
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
||||
memset(q3, 0, 3*QK_K/8);
|
||||
|
||||
float max_scale = 0;
|
||||
|
||||
const float * xbl = x + QK_K*ibl;
|
||||
float sumx2 = 0;
|
||||
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
||||
float sigma2 = sumx2/QK_K;
|
||||
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
const float * xb = xbl + 32*ib;
|
||||
if (quant_weights) {
|
||||
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
||||
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
||||
} else {
|
||||
for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
|
||||
}
|
||||
for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
|
||||
for (int k = 0; k < 4; ++k) {
|
||||
int nflip = 0;
|
||||
uint8_t s = 0;
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
||||
else {
|
||||
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
|
||||
}
|
||||
}
|
||||
if (nflip%2) {
|
||||
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
||||
for (int i = 1; i < 8; ++i) {
|
||||
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
||||
if (ax < min) {
|
||||
min = ax; imin = i;
|
||||
}
|
||||
}
|
||||
xval[8*k+imin] = -xval[8*k+imin];
|
||||
s ^= (1 << imin);
|
||||
}
|
||||
block_signs[k] = s & 127;
|
||||
}
|
||||
float max = xval[0];
|
||||
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
||||
if (!max) {
|
||||
scales[ib] = 0;
|
||||
memset(L, 0, 32);
|
||||
continue;
|
||||
}
|
||||
float best = 0;
|
||||
float scale = max/(2*kMaxQ-1);
|
||||
for (int is = -15; is <= 15; ++is) {
|
||||
float id = (2*kMaxQ-1+is*0.2f)/max;
|
||||
float this_scale = 1/id;
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
||||
Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
||||
}
|
||||
uint16_t u = 0;
|
||||
for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i);
|
||||
int grid_index = kmap_q3xs[u];
|
||||
is_on_grid_aux[k] = true;
|
||||
if (grid_index < 0) {
|
||||
is_on_grid_aux[k] = false;
|
||||
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
||||
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k);
|
||||
}
|
||||
}
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
float w = weight[i];
|
||||
float q = 2*Laux[i] + 1;
|
||||
sumqx += w*xval[i]*q;
|
||||
sumq2 += w*q*q;
|
||||
}
|
||||
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
||||
scale = sumqx/sumq2; best = scale*sumqx;
|
||||
for (int i = 0; i < 32; ++i) L[i] = Laux[i];
|
||||
for (int k = 0; k < 8; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
||||
}
|
||||
}
|
||||
int n_not_ongrid = 0;
|
||||
for (int k = 0; k < 8; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
||||
if (n_not_ongrid > 0 && scale > 0) {
|
||||
float id = 1/scale;
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
if (is_on_grid[k]) continue;
|
||||
uint16_t u = 0;
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
int l = nearest_int(0.5f*(id*xval[4*k+i]-1));
|
||||
l = MAX(0, MIN(kMaxQ-1, l));
|
||||
u |= (l << 3*i);
|
||||
}
|
||||
int grid_index = kmap_q3xs[u];
|
||||
if (grid_index < 0) {
|
||||
const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1;
|
||||
grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k);
|
||||
}
|
||||
const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index);
|
||||
for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2;
|
||||
}
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
float w = weight[i];
|
||||
float q = 2*L[i] + 1;
|
||||
sumqx += w*xval[i]*q;
|
||||
sumq2 += w*q*q;
|
||||
}
|
||||
if (sumq2 > 0) scale = sumqx/sumq2;
|
||||
}
|
||||
if (scale < 0) {
|
||||
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
||||
// and correspondingly flip quant signs.
|
||||
scale = -scale;
|
||||
for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
||||
}
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
uint16_t u = 0;
|
||||
for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i);
|
||||
int grid_index = kmap_q3xs[u];
|
||||
if (grid_index < 0) {
|
||||
printf("Oops: found point %u not on grid:", u);
|
||||
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
|
||||
printf("\n");
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
q3[8*ib+k] = grid_index;
|
||||
}
|
||||
scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21);
|
||||
GGML_ASSERT(scale >= 0);
|
||||
scales[ib] = scale;
|
||||
max_scale = MAX(max_scale, scale);
|
||||
}
|
||||
|
||||
if (!max_scale) {
|
||||
memset(y[ibl].qs, 0, 3*QK_K/8);
|
||||
continue;
|
||||
}
|
||||
|
||||
float d = max_scale/31;
|
||||
y[ibl].d = GGML_FP32_TO_FP16(d);
|
||||
float id = 1/d;
|
||||
float sumqx = 0, sumq2 = 0;
|
||||
for (int ib = 0; ib < QK_K/32; ++ib) {
|
||||
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
||||
l = MAX(0, MIN(15, l));
|
||||
scales_and_signs[ib] |= ((uint32_t)l << 28);
|
||||
if (false) {
|
||||
const float * xb = xbl + 32*ib;
|
||||
if (quant_weights) {
|
||||
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
||||
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
||||
} else {
|
||||
for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i];
|
||||
}
|
||||
const float db = 0.25f * d * (1 + 2*l);
|
||||
for (int k = 0; k < 8; ++k) {
|
||||
const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2);
|
||||
const float * xk = xb + 4*k;
|
||||
const float * wk = weight + 4*k;
|
||||
//const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]);
|
||||
const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]);
|
||||
float best_mse = 0; int best_index = q3[8*ib+k];
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
float diff = db * grid[j] * signs[j] - xk[j];
|
||||
best_mse += wk[j] * diff * diff;
|
||||
}
|
||||
for (int idx = 0; idx < 256; ++idx) {
|
||||
//grid = (const uint8_t *)(kgrid_q3xs + idx);
|
||||
grid = (const uint8_t *)(iq3xxs_grid + idx);
|
||||
float mse = 0;
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
float diff = db * grid[j] * signs[j] - xk[j];
|
||||
mse += wk[j] * diff * diff;
|
||||
}
|
||||
if (mse < best_mse) {
|
||||
best_mse = mse; best_index = idx;
|
||||
}
|
||||
}
|
||||
q3[8*ib+k] = best_index;
|
||||
//grid = (const uint8_t *)(kgrid_q3xs + best_index);
|
||||
grid = (const uint8_t *)(iq3xxs_grid + best_index);
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
float q = db * grid[j] * signs[j];
|
||||
sumqx += wk[j] * q * xk[j];
|
||||
sumq2 += wk[j] * q * q;
|
||||
}
|
||||
}
|
||||
if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
|
||||
}
|
||||
}
|
||||
memcpy(y[ibl].qs, q3, 3*QK_K/8);
|
||||
}
|
||||
}
|
||||
|
||||
size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    (void)hist;
    GGML_ASSERT(n_per_row%QK_K == 0);
    int nblock = n_per_row/QK_K;
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_iq3_xxs_impl(src, qrow, n_per_row, quant_weights);
        src += n_per_row;
        qrow += nblock*sizeof(block_iq3_xxs);
    }
    return nrow * nblock * sizeof(block_iq3_xxs);
}
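
A hedged usage sketch for the row-wise entry point above, not taken from the diff: the buffer sizing follows the same nrow * nblock * sizeof(block_iq3_xxs) arithmetic as the return value, the ggml headers are assumed to be included, QK_K is assumed to be 256, and the grid tables must have been set up first (here via iq3xs_init_impl(256)).

#include <vector>

// sketch: quantize a few rows of floats into iq3_xxs blocks
static void quantize_rows_example(const float * src, int nrow, int n_per_row) {
    iq3xs_init_impl(256);                                   // build the grid/neighbour tables once
    const int nblock = n_per_row / 256;
    std::vector<uint8_t> dst(nrow * nblock * sizeof(block_iq3_xxs));
    const size_t written = quantize_iq3_xxs(src, dst.data(), nrow, n_per_row,
                                            /*hist=*/NULL, /*imatrix=*/NULL);
    GGML_ASSERT(written == dst.size());
}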
|
||||
|
||||
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
block_iq3_xxs * restrict y = vy;
|
||||
quantize_row_iq3_xxs_reference(x, y, k);
|
||||
}
|
||||
|
||||
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) {
|
||||
assert(k % QK_K == 0);
|
||||
quantize_row_iq3_xxs_impl(x, y, k, NULL);
|
||||
}
|
||||
@ -166,7 +166,7 @@ typedef struct {
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");

// (Almost) "true" 2-bit quantization.
// Due to the need to use blocks as per ggml dsign, it ends up using
// Due to the need to use blocks as per ggml design, it ends up using
// 2.0625 bpw because of the 16-bit scale for each block of 256.
typedef struct {
    ggml_fp16_t d;
@ -182,6 +182,15 @@ typedef struct {
} block_iq2_xs;
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");

// (Almost) "true" 3-bit quantization.
// Due to the need to use blocks as per ggml design, it ends up using
// 3.0625 bpw because of the 16-bit scale for each block of 256.
typedef struct {
    ggml_fp16_t d;
    uint8_t qs[3*QK_K/8];
} block_iq3_xxs;
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
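
As a quick sanity check on the 3.0625 bpw figure quoted in the comment above (this arithmetic is mine, not part of the diff, and assumes QK_K == 256):

#include <cstdio>

int main() {
    const int QK_K     = 256;                        // assumed super-block size
    const int block_sz = 2 /* fp16 d */ + 3*QK_K/8;  // 98 bytes, matches the static_assert above
    printf("iq3_xxs: %.4f bits per weight\n", 8.0 * block_sz / QK_K);  // prints 3.0625
    return 0;
}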
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
|
||||
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
|
||||
|
@ -196,6 +205,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
|
|||
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
||||
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
||||
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
||||
void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k);
|
||||
|
||||
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
|
||||
|
@ -210,6 +220,7 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
|||
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
||||
void quantize_row_iq3_xxs(const float * restrict x, void * restrict y, int k);
|
||||
|
||||
// Dequantization
|
||||
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
|
||||
|
@ -227,6 +238,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
|
|||
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k);
|
||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
|
@ -242,12 +254,14 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx,
|
|||
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
|
||||
//
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
//
|
||||
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
|
@ -260,3 +274,5 @@ size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row,
|
|||
|
||||
void iq2xs_init_impl(int grid_size);
|
||||
void iq2xs_free_impl(int grid_size);
|
||||
void iq3xs_init_impl(int grid_size);
|
||||
void iq3xs_free_impl(int grid_size);
|
||||
|
|
|
@ -1,7 +1,14 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//

//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//

#include <algorithm>
#include <assert.h>
|
||||
|
|
|
@ -1,7 +1,8 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//

#pragma once
|
||||
|
||||
File diff suppressed because it is too large
437
ggml-vulkan.cpp
@ -116,7 +116,7 @@ struct vk_device {
|
|||
vk_queue transfer_queue;
|
||||
uint32_t descriptor_set_mode;
|
||||
uint32_t subgroup_size;
|
||||
bool is_igpu;
|
||||
bool uma;
|
||||
};
|
||||
|
||||
struct vk_op_push_constants {
|
||||
|
@ -675,7 +675,7 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_
|
|||
|
||||
vk::PhysicalDeviceMemoryProperties mem_props = vk_device.physical_device.getMemoryProperties();
|
||||
|
||||
uint32_t memory_type_index = uint32_t(~0);
|
||||
uint32_t memory_type_index = UINT32_MAX;
|
||||
|
||||
for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
|
||||
vk::MemoryType memory_type = mem_props.memoryTypes[i];
|
||||
|
@ -685,7 +685,18 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_
|
|||
}
|
||||
}
|
||||
|
||||
if (memory_type_index >= mem_props.memoryTypeCount) {
|
||||
throw vk::OutOfDeviceMemoryError("No suitable memory type found");
|
||||
}
|
||||
|
||||
try {
|
||||
buf.device_memory = vk_device.device.allocateMemory({ mem_req.size, memory_type_index });
|
||||
} catch (const vk::SystemError& e) {
|
||||
// Out of Host/Device memory, clean up buffer
|
||||
vk_device.device.destroyBuffer(buf.buffer);
|
||||
buf.size = 0;
|
||||
throw e;
|
||||
}
|
||||
buf.memory_property_flags = req_flags;
|
||||
buf.ptr = nullptr;
|
||||
|
||||
|
@ -700,6 +711,47 @@ static vk_buffer ggml_vk_create_buffer(size_t size, vk::MemoryPropertyFlags req_
|
|||
return buf;
|
||||
}
|
||||
|
||||
static vk_buffer ggml_vk_create_buffer_check(size_t size, vk::MemoryPropertyFlags req_flags) {
|
||||
try {
|
||||
return ggml_vk_create_buffer(size, req_flags);
|
||||
} catch (const vk::SystemError& e) {
|
||||
std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
static vk_buffer ggml_vk_create_buffer_device(size_t size) {
|
||||
vk_buffer buf;
|
||||
try {
|
||||
buf = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
} catch (const vk::SystemError& e) {
|
||||
if (vk_device.uma) {
|
||||
// Fall back to host memory type
|
||||
buf = ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
|
||||
} else {
|
||||
std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
static void ggml_vk_destroy_buffer(vk_buffer& buf) {
|
||||
if (buf.size == 0) {
|
||||
return;
|
||||
}
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_destroy_buffer(" << buf.size << ")" << std::endl;
|
||||
#endif
|
||||
|
||||
buf.size = 0;
|
||||
vk_device.device.freeMemory(buf.device_memory);
|
||||
vk_device.device.destroyBuffer(buf.buffer);
|
||||
}
|
||||
|
||||
static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
|
||||
return { buf, 0, VK_WHOLE_SIZE };
|
||||
}
|
||||
|
@ -738,19 +790,6 @@ static void ggml_vk_wait_events(vk::CommandBuffer& cmd_buffer, std::vector<vk::E
|
|||
);
|
||||
}
|
||||
|
||||
static void ggml_vk_destroy_buffer(vk_buffer& buf) {
|
||||
if (buf.size == 0) {
|
||||
return;
|
||||
}
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_destroy_buffer(" << buf.size << ")" << std::endl;
|
||||
#endif
|
||||
|
||||
buf.size = 0;
|
||||
vk_device.device.freeMemory(buf.device_memory);
|
||||
vk_device.device.destroyBuffer(buf.buffer);
|
||||
}
|
||||
|
||||
static bool ggml_vk_build_shader(ggml_type type) {
|
||||
switch(type) {
|
||||
case GGML_TYPE_F16:
|
||||
|
@ -778,7 +817,7 @@ static void ggml_vk_load_shaders() {
|
|||
// mulmat
|
||||
std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, vk_device.subgroup_size * 2, 64, 2, 4, 4, vk_device.subgroup_size };
|
||||
std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, vk_device.subgroup_size, 32, 2, 4, 2, vk_device.subgroup_size };
|
||||
std::initializer_list<uint32_t> warptile_s = { vk_device.subgroup_size, 32, 32, 8, 32, 32, 2, 2, 2, vk_device.subgroup_size };
|
||||
std::initializer_list<uint32_t> warptile_s = { vk_device.subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, vk_device.subgroup_size };
|
||||
|
||||
std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
|
||||
std::array<uint32_t, 3> m_wg_denoms = { 64, 64, 1 };
|
||||
|
@ -1015,7 +1054,7 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
|||
|
||||
vk_device.vendor_id = vk_device.properties.vendorID;
|
||||
vk_device.subgroup_size = subgroup_props.subgroupSize;
|
||||
vk_device.is_igpu = vk_device.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
vk_device.uma = vk_device.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
|
||||
|
||||
bool fp16_storage = false;
|
||||
bool fp16_compute = false;
|
||||
|
@ -1088,7 +1127,7 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
|||
if (vk_device.fp16) {
|
||||
device_extensions.push_back("VK_KHR_shader_float16_int8");
|
||||
}
|
||||
std::cerr << "ggml_vulkan: Using " << vk_device.properties.deviceName << " | fp16: " << vk_device.fp16 << " | warp size: " << vk_device.subgroup_size << std::endl;
|
||||
std::cerr << "ggml_vulkan: Using " << vk_device.properties.deviceName << " | uma: " << vk_device.uma << " | fp16: " << vk_device.fp16 << " | warp size: " << vk_device.subgroup_size << std::endl;
|
||||
device_create_info = {
|
||||
vk::DeviceCreateFlags(),
|
||||
device_queue_create_infos,
|
||||
|
@ -1210,7 +1249,7 @@ static vk_buffer ggml_vk_pool_malloc(size_t size) {
|
|||
ggml_vk_destroy_buffer(b);
|
||||
}
|
||||
|
||||
return ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
return ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
}
|
||||
|
||||
static void ggml_vk_pool_free(vk_buffer& buffer) {
|
||||
|
@ -1250,10 +1289,6 @@ static void * ggml_vk_host_malloc(size_t size) {
|
|||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
|
||||
#endif
|
||||
if (getenv("GGML_VK_NO_PINNED") != nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
vk_buffer buf = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
|
||||
if(!(buf.memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
|
||||
|
@ -1298,6 +1333,20 @@ static void ggml_vk_host_free(void* ptr) {
|
|||
vk_pinned_memory.erase(vk_pinned_memory.begin() + index);
|
||||
}
|
||||
|
||||
static void ggml_vk_host_get(const void * ptr, vk_buffer *& buf, size_t& buf_offset) {
    buf = nullptr;
    buf_offset = 0;
    for (size_t i = 0; i < vk_pinned_memory.size(); i++) {
        const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]);
        const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]);
        if (ptr >= addr && ptr < endr) {
            buf = &std::get<2>(vk_pinned_memory[i]);
            buf_offset = ((const uint8_t *)ptr) - addr;
            break;
        }
    }
}
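
For orientation, this is the pattern the UMA-aware paths below use with this helper (a condensed copy of code that appears later in this diff, not new API): a nullptr result means the pointer is not in pinned host memory, so the tensor still needs an explicit upload.

// sketch: decide whether src1 can be used in place (UMA) or must be uploaded
vk_buffer * d_Qy = nullptr;
size_t qy_buf_offset = 0;
if (vk_device.uma) {
    ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset);   // leaves d_Qy == nullptr on a miss
}
const bool src1_uma = d_Qy != nullptr;
const bool load_y   = src1->backend != GGML_BACKEND_GPU && !src1_uma;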
|
||||
|
||||
static vk_submission ggml_vk_begin_submission(vk_queue& q, bool one_time = true) {
|
||||
vk_submission s;
|
||||
s.buffer = ggml_vk_create_cmd_buffer(q);
|
||||
|
@ -1384,6 +1433,13 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
|
|||
}
|
||||
}
|
||||
|
||||
static void ensure_sync_staging_buffer(size_t size) {
|
||||
if (vk_sync_staging.size < size) {
|
||||
ggml_vk_destroy_buffer(vk_sync_staging);
|
||||
vk_sync_staging = ggml_vk_create_buffer_check(size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size_t offset, const ggml_tensor * tensor, bool sync_staging = false) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_buffer_write_nc_async(" << tensor << ")" << std::endl;
|
||||
|
@ -1391,21 +1447,13 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size
|
|||
GGML_ASSERT(!ggml_is_contiguous(tensor));
|
||||
// Buffer is already mapped
|
||||
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
|
||||
std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
|
||||
std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
// Check if src is pinned memory
|
||||
vk_buffer* buf = nullptr;
|
||||
size_t buf_offset = 0;
|
||||
for (size_t i = 0; i < vk_pinned_memory.size(); i++) {
|
||||
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]);
|
||||
const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]);
|
||||
if (tensor->data >= addr && tensor->data < endr) {
|
||||
buf = &std::get<2>(vk_pinned_memory[i]);
|
||||
buf_offset = ((const uint8_t *)tensor->data) - addr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
vk_buffer * buf = nullptr;
|
||||
size_t buf_offset;
|
||||
ggml_vk_host_get(tensor->data, buf, buf_offset);
|
||||
|
||||
const uint64_t ne0 = tensor->ne[0];
|
||||
const uint64_t ne1 = tensor->ne[1];
|
||||
|
@ -1463,10 +1511,7 @@ static void ggml_vk_buffer_write_nc_async(vk_context * ctx, vk_buffer* dst, size
|
|||
if (vk_staging.size < vk_staging_offset + copy_size) {
|
||||
if (sync_staging) {
|
||||
// Create temporary larger buffer
|
||||
if (vk_sync_staging.size < copy_size) {
|
||||
ggml_vk_destroy_buffer(vk_sync_staging);
|
||||
vk_sync_staging = ggml_vk_create_buffer(copy_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
ensure_sync_staging_buffer(copy_size);
|
||||
|
||||
staging = &vk_sync_staging;
|
||||
staging_offset = 0;
|
||||
|
@ -1512,17 +1557,9 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size
|
|||
GGML_ASSERT(false);
|
||||
}
|
||||
// Check if src is pinned memory
|
||||
vk_buffer* buf = nullptr;
|
||||
size_t buf_offset = 0;
|
||||
for (size_t i = 0; i < vk_pinned_memory.size(); i++) {
|
||||
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]);
|
||||
const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]);
|
||||
if (src >= addr && src < endr) {
|
||||
buf = &std::get<2>(vk_pinned_memory[i]);
|
||||
buf_offset = ((const uint8_t *)src) - addr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
vk_buffer * buf = nullptr;
|
||||
size_t buf_offset;
|
||||
ggml_vk_host_get(src, buf, buf_offset);
|
||||
|
||||
if (buf != nullptr) {
|
||||
// Memory is pinned, use as staging buffer
|
||||
|
@ -1555,10 +1592,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * ctx, vk_buffer* dst, size
|
|||
const size_t copy_size = width*height;
|
||||
if (vk_staging.size < vk_staging_offset + copy_size) {
|
||||
if (sync_staging) {
|
||||
if (vk_sync_staging.size < copy_size) {
|
||||
ggml_vk_destroy_buffer(vk_sync_staging);
|
||||
vk_sync_staging = ggml_vk_create_buffer(copy_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
ensure_sync_staging_buffer(copy_size);
|
||||
|
||||
staging = &vk_sync_staging;
|
||||
staging_offset = 0;
|
||||
|
@ -1633,17 +1667,9 @@ static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_
|
|||
GGML_ASSERT(height > 0);
|
||||
GGML_ASSERT(src->size > 0);
|
||||
// Check if dst is pinned memory
|
||||
vk_buffer* buf = nullptr;
|
||||
size_t buf_offset = 0;
|
||||
for (size_t i = 0; i < vk_pinned_memory.size(); i++) {
|
||||
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]);
|
||||
const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]);
|
||||
if (dst >= addr && dst < endr) {
|
||||
buf = &std::get<2>(vk_pinned_memory[i]);
|
||||
buf_offset = ((const uint8_t *)dst) - addr;
|
||||
break;
|
||||
}
|
||||
}
|
||||
vk_buffer * buf = nullptr;
|
||||
size_t buf_offset;
|
||||
ggml_vk_host_get(dst, buf, buf_offset);
|
||||
|
||||
std::vector<vk::BufferCopy> slices(1);
|
||||
if (width == spitch && width == dpitch) {
|
||||
|
@ -1677,10 +1703,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * ctx, vk_buffer* src, size_
|
|||
if (vk_staging.size < vk_staging_offset + copy_size) {
|
||||
if (sync_staging) {
|
||||
// Create temporary larger buffer
|
||||
if (vk_sync_staging.size < copy_size) {
|
||||
ggml_vk_destroy_buffer(vk_sync_staging);
|
||||
vk_sync_staging = ggml_vk_create_buffer(copy_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
ensure_sync_staging_buffer(copy_size);
|
||||
|
||||
staging = &vk_sync_staging;
|
||||
} else {
|
||||
|
@ -1819,7 +1842,7 @@ static void ggml_vk_d2h_tensor_2d(vk_context * ctx, vk_buffer * src, size_t offs
|
|||
|
||||
static uint32_t ggml_vk_guess_split_k(int m, int n, int k) {
|
||||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ", " << aligned << ")";
|
||||
std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")";
|
||||
#endif
|
||||
if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) {
|
||||
#ifdef VK_DEBUG
|
||||
|
@ -2003,8 +2026,27 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co
|
|||
const uint64_t r2 = ne12 / ne02;
|
||||
const uint64_t r3 = ne13 / ne03;
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer * d_Qx = nullptr;
|
||||
size_t qx_buf_offset = 0;
|
||||
vk_buffer * d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src0->data, d_Qx, qx_buf_offset);
|
||||
ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset);
|
||||
src0_uma = d_Qx != nullptr;
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
||||
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
||||
|
@ -2034,32 +2076,24 @@ static void ggml_vk_mul_mat_q_f16(vk_context * ctx, const ggml_tensor * src0, co
|
|||
const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer* d_D = &extra->buffer_gpu;
|
||||
const uint64_t d_buf_offset = extra->offset;
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03);
|
||||
vk_buffer * d_Qx;
|
||||
uint64_t qx_buf_offset = 0;
|
||||
vk_buffer * d_Qy;
|
||||
uint64_t qy_buf_offset = 0;
|
||||
vk_buffer* d_X;
|
||||
uint64_t x_buf_offset = 0;
|
||||
vk_buffer* d_Y;
|
||||
uint64_t y_buf_offset = 0;
|
||||
if (load_x) {
|
||||
d_Qx = &vk_prealloc_qx;
|
||||
} else {
|
||||
} else if (!src0_uma) {
|
||||
d_Qx = &extra_src0->buffer_gpu;
|
||||
qx_buf_offset = extra_src0->offset;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
}
|
||||
if (load_y) {
|
||||
d_Qy = &vk_prealloc_qy;
|
||||
} else {
|
||||
} else if (!src1_uma) {
|
||||
d_Qy = &extra_src1->buffer_gpu;
|
||||
qy_buf_offset = extra_src1->offset;
|
||||
GGML_ASSERT(d_Qy != nullptr);
|
||||
|
@ -2178,8 +2212,27 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0
|
|||
const uint64_t r2 = ne12 / ne02;
|
||||
const uint64_t r3 = ne13 / ne03;
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer * d_Qx = nullptr;
|
||||
size_t qx_buf_offset = 0;
|
||||
vk_buffer * d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src0->data, d_Qx, qx_buf_offset);
|
||||
ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset);
|
||||
src0_uma = d_Qx != nullptr;
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
||||
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
||||
|
@ -2199,31 +2252,23 @@ static void ggml_vk_mul_mat_vec_q_f16(vk_context * ctx, const ggml_tensor * src0
|
|||
const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer* d_D = &extra->buffer_gpu;
|
||||
const uint64_t d_buf_offset = extra->offset;
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
vk_buffer* d_Qx;
|
||||
uint32_t qx_buf_offset = 0;
|
||||
vk_buffer* d_Qy;
|
||||
uint32_t qy_buf_offset = 0;
|
||||
vk_buffer* d_X;
|
||||
uint64_t x_buf_offset = 0;
|
||||
vk_buffer* d_Y;
|
||||
uint64_t y_buf_offset = 0;
|
||||
if (load_x) {
|
||||
d_Qx = &vk_prealloc_qx;
|
||||
} else {
|
||||
} else if (!src0_uma) {
|
||||
d_Qx = &extra_src0->buffer_gpu;
|
||||
qx_buf_offset = extra_src0->offset;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
}
|
||||
if (load_y) {
|
||||
d_Qy = &vk_prealloc_qy;
|
||||
} else {
|
||||
} else if(!src1_uma) {
|
||||
d_Qy = &extra_src1->buffer_gpu;
|
||||
qy_buf_offset = extra_src1->offset;
|
||||
GGML_ASSERT(d_Qy != nullptr);
|
||||
|
@ -2345,7 +2390,21 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor
|
|||
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer * d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset);
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
const uint64_t x_ne = ne00 * ne01 * ne02;
|
||||
const uint64_t y_ne = ne10 * ne11 * ne12;
|
||||
|
@ -2355,22 +2414,15 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(vk_context * ctx, const ggml_tensor
|
|||
const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer* d_D = &extra->buffer_gpu;
|
||||
const uint64_t d_buf_offset = extra->offset;
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
vk_buffer* d_Qx;
|
||||
vk_buffer* d_Qx = &extra_src0->buffer_gpu;
|
||||
const uint64_t qx_buf_offset = extra_src0->offset;
|
||||
vk_buffer* d_Qy;
|
||||
uint64_t qy_buf_offset = 0;
|
||||
d_Qx = &extra_src0->buffer_gpu;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
if (load_y) {
|
||||
d_Qy = &vk_prealloc_qy;
|
||||
} else {
|
||||
} else if (!src1_uma) {
|
||||
d_Qy = &extra_src1->buffer_gpu;
|
||||
qy_buf_offset = extra_src1->offset;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
|
@ -2430,7 +2482,21 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor *
|
|||
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer * d_Qy = nullptr;
|
||||
size_t qy_buf_offset = 0;
|
||||
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src1->data, d_Qy, qy_buf_offset);
|
||||
src1_uma = d_Qy != nullptr;
|
||||
}
|
||||
|
||||
const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
const uint64_t d_ne = ne01 * ne11 * ne12;
|
||||
|
||||
|
@ -2441,18 +2507,11 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(vk_context * ctx, const ggml_tensor *
|
|||
const uint64_t qy_sz = ggml_nbytes(src1);
|
||||
const uint64_t d_sz = sizeof(float) * d_ne;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
|
||||
vk_buffer* d_D = &extra->buffer_gpu;
|
||||
const uint64_t d_buf_offset = extra->offset;
|
||||
GGML_ASSERT(d_D != nullptr);
|
||||
vk_buffer* d_Qx;
|
||||
vk_buffer* d_Qx = &extra_src0->buffer_gpu;
|
||||
const uint64_t qx_buf_offset = extra_src0->offset;
|
||||
vk_buffer* d_Qy;
|
||||
uint64_t qy_buf_offset = 0;
|
||||
d_Qx = &extra_src0->buffer_gpu;
|
||||
GGML_ASSERT(d_Qx != nullptr);
|
||||
if (load_y) {
|
||||
d_Qy = &vk_prealloc_qy;
|
||||
|
@ -2709,7 +2768,8 @@ static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
|
|||
}
|
||||
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name);
|
||||
static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name);
|
||||
static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor);
|
||||
#endif
|
||||
|
||||
template<typename PC>
|
||||
|
@ -2758,17 +2818,34 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm
|
|||
return;
|
||||
}
|
||||
|
||||
const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU;
|
||||
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU;
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
|
||||
vk_buffer * d_X = nullptr;
|
||||
size_t x_buf_offset = 0;
|
||||
vk_buffer * d_Y = nullptr;
|
||||
size_t y_buf_offset = 0;
|
||||
|
||||
bool src0_uma = false;
|
||||
bool src1_uma = false;
|
||||
|
||||
if (vk_device.uma) {
|
||||
ggml_vk_host_get(src0->data, d_X, x_buf_offset);
|
||||
src0_uma = d_X != nullptr;
|
||||
if (use_src1) {
|
||||
ggml_vk_host_get(src1->data, d_Y, y_buf_offset);
|
||||
src1_uma = d_Y != nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma;
|
||||
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma;
|
||||
|
||||
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, vk_device.properties.limits.minStorageBufferOffsetAlignment);
|
||||
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, vk_device.properties.limits.minStorageBufferOffsetAlignment) : 0;
|
||||
uint64_t d_sz = ggml_type_size(dst->type) * ne0;
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
|
||||
|
||||
// Workaround for tiny tensor inputs on ROPE
|
||||
if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > extra_src1->buffer_gpu.size) {
|
||||
y_sz = VK_WHOLE_SIZE;
|
||||
|
@ -2778,20 +2855,16 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm
|
|||
GGML_ASSERT(d_D != nullptr);
|
||||
uint64_t d_buf_offset = (extra->offset / vk_device.properties.limits.minStorageBufferOffsetAlignment) * vk_device.properties.limits.minStorageBufferOffsetAlignment;
|
||||
GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
|
||||
vk_buffer* d_X = nullptr;
|
||||
uint64_t x_buf_offset = 0;
|
||||
vk_buffer* d_Y = nullptr;
|
||||
uint64_t y_buf_offset = 0;
|
||||
if (transfer_src0) {
|
||||
d_X = &vk_prealloc_qx;
|
||||
} else {
|
||||
} else if(!src0_uma) {
|
||||
d_X = &extra_src0->buffer_gpu;
|
||||
x_buf_offset = extra_src0->offset;
|
||||
GGML_ASSERT(d_X != nullptr);
|
||||
}
|
||||
if (transfer_src1) {
|
||||
d_Y = &vk_prealloc_qy;
|
||||
} else if (use_src1) {
|
||||
} else if (use_src1 && !src1_uma) {
|
||||
d_Y = &extra_src1->buffer_gpu;
|
||||
y_buf_offset = extra_src1->offset;
|
||||
GGML_ASSERT(d_Y != nullptr);
|
||||
|
@ -2800,7 +2873,8 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm
|
|||
if (op == GGML_OP_CPY) {
|
||||
GGML_ASSERT(!transfer_src0);
|
||||
GGML_ASSERT(!transfer_src1);
|
||||
d_sz = dst->ne[1] * dst->nb[1];
|
||||
x_sz = ggml_nbytes(src0);
|
||||
d_sz = ggml_nbytes(dst);
|
||||
|
||||
if (extra->offset + d_sz >= d_D->size) {
|
||||
d_sz = VK_WHOLE_SIZE;
|
||||
|
@ -3148,13 +3222,13 @@ static void ggml_vk_test_matmul(size_t m, size_t n, size_t k, size_t batch, size
|
|||
if (vk_prealloc_split_k.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_split_k);
|
||||
}
|
||||
vk_prealloc_split_k = ggml_vk_create_buffer(sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_split_k = ggml_vk_create_buffer_check(sizeof(float) * d_ne * split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
}
|
||||
}
|
||||
|
||||
vk_buffer d_X = ggml_vk_create_buffer(sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_Y = ggml_vk_create_buffer(sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_D = ggml_vk_create_buffer(sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_X = ggml_vk_create_buffer_check(sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_Y = ggml_vk_create_buffer_check(sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer d_D = ggml_vk_create_buffer_check(sizeof(float) * d_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
|
||||
X_TYPE* x = (X_TYPE *) malloc(sizeof(X_TYPE) * x_ne);
|
||||
Y_TYPE* y = (Y_TYPE *) malloc(sizeof(Y_TYPE) * y_ne);
|
||||
|
@ -3315,6 +3389,10 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
|
|||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
||||
return;
|
||||
}
|
||||
i0 = std::max(i0, 5);
|
||||
i1 = std::max(i1, 5);
|
||||
i2 = std::max(i2, 0);
|
||||
i3 = std::max(i3, 0);
|
||||
fprintf(stderr, " ");
|
||||
for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
|
||||
fprintf(stderr, "%7d ", idx1);
|
||||
|
@ -3376,7 +3454,7 @@ static void ggml_vk_test_h2d_nc(size_t ne0, size_t ne1, size_t ne2, size_t ne3)
|
|||
vk_context * ctx = ggml_vk_create_context(vk_device.compute_queue);
|
||||
ggml_vk_ctx_begin(ctx);
|
||||
|
||||
vk_buffer buffer = ggml_vk_create_buffer(ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer buffer = ggml_vk_create_buffer_check(ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
|
||||
ggml_vk_h2d_tensor_2d(ctx, &buffer, 0, tensor, 0, 0, ggml_nrows(tensor));
|
||||
|
||||
|
@ -3439,7 +3517,7 @@ static void ggml_vk_test_transfer(size_t ne, bool pinned) {
|
|||
std::cerr << "ggml_vk_test_transfer(" << ne << ")" << std::endl;
|
||||
#endif
|
||||
// Check transfers are correct
|
||||
vk_buffer buffer = ggml_vk_create_buffer(sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer buffer = ggml_vk_create_buffer_check(sizeof(float) * ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
|
||||
float * x;
|
||||
float * y;
|
||||
|
@ -3666,7 +3744,7 @@ void ggml_vk_preallocate_buffers() {
|
|||
std::cerr << "qx_size: " << vk_prealloc_size_qx << " qy_size: " << vk_prealloc_size_qy << " x_size: " << vk_prealloc_size_x << " y_size: " << vk_prealloc_size_y << " split_k_size: " << vk_prealloc_size_split_k << std::endl;
|
||||
#endif
|
||||
#if defined(VK_RUN_TESTS)
|
||||
vk_staging = ggml_vk_create_buffer(100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
vk_staging = ggml_vk_create_buffer_check(100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
ggml_vk_test_transfer(8192 * 1000, false);
|
||||
ggml_vk_test_transfer(8192 * 1000, true);
|
||||
|
||||
|
@ -3712,42 +3790,42 @@ void ggml_vk_preallocate_buffers() {
|
|||
if (vk_prealloc_qx.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_qx);
|
||||
}
|
||||
vk_prealloc_qx = ggml_vk_create_buffer(vk_prealloc_size_qx, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_qx = ggml_vk_create_buffer_device(vk_prealloc_size_qx);
|
||||
}
|
||||
if (vk_prealloc_size_qy > 0 && vk_prealloc_qy.size < vk_prealloc_size_qy) {
|
||||
// Resize buffer
|
||||
if (vk_prealloc_qy.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_qy);
|
||||
}
|
||||
vk_prealloc_qy = ggml_vk_create_buffer(vk_prealloc_size_qy, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_qy = ggml_vk_create_buffer_device(vk_prealloc_size_qy);
|
||||
}
|
||||
if (vk_prealloc_size_x > 0 && vk_prealloc_x.size < vk_prealloc_size_x) {
|
||||
// Resize buffer
|
||||
if (vk_prealloc_x.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_x);
|
||||
}
|
||||
vk_prealloc_x = ggml_vk_create_buffer(vk_prealloc_size_x, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_x = ggml_vk_create_buffer_device(vk_prealloc_size_x);
|
||||
}
|
||||
if (vk_prealloc_size_y > 0 && vk_prealloc_y.size < vk_prealloc_size_y) {
|
||||
// Resize buffer
|
||||
if (vk_prealloc_y.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_y);
|
||||
}
|
||||
vk_prealloc_y = ggml_vk_create_buffer(vk_prealloc_size_y, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_y = ggml_vk_create_buffer_device(vk_prealloc_size_y);
|
||||
}
|
||||
if (vk_prealloc_size_split_k > 0 && vk_prealloc_split_k.size < vk_prealloc_size_split_k) {
|
||||
// Resize buffer
|
||||
if (vk_prealloc_split_k.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_prealloc_split_k);
|
||||
}
|
||||
vk_prealloc_split_k = ggml_vk_create_buffer(vk_prealloc_size_split_k, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_prealloc_split_k = ggml_vk_create_buffer_device(vk_prealloc_size_split_k);
|
||||
}
|
||||
if (vk_staging_size > 0 && vk_staging.size < vk_staging_size) {
|
||||
// Resize buffer
|
||||
if (vk_staging.size > 0) {
|
||||
ggml_vk_destroy_buffer(vk_staging);
|
||||
}
|
||||
vk_staging = ggml_vk_create_buffer(vk_staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
vk_staging = ggml_vk_create_buffer_check(vk_staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4138,6 +4216,7 @@ GGML_CALL static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
|
|||
|
||||
GGML_CALL static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
|
||||
ggml_vk_destroy_buffer(ctx->dev_buffer);
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
|
@ -4163,14 +4242,6 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
|
|||
extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
|
||||
}
|
||||
|
||||
if (extra->offset + ggml_nbytes(tensor) > extra->buffer_gpu.size) {
|
||||
std::cerr << "ERROR: Trying to assign tensor " << tensor << " outside of buffer size " << ctx->dev_buffer.size << " requested offset: " << extra->offset << " tensor size: " << ggml_nbytes(tensor) << std::endl;
|
||||
if (tensor->view_src != nullptr) {
|
||||
std::cerr << "view_src: " << tensor->view_src << " extra: " << tensor->view_src->extra << std::endl;
|
||||
}
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
tensor->backend = GGML_BACKEND_GPU;
|
||||
tensor->extra = extra;
|
||||
}
|
||||
|
@ -4248,7 +4319,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
|
|||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
|
||||
#endif
|
||||
vk_buffer dev_buffer = ggml_vk_create_buffer(size, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
||||
vk_buffer dev_buffer = ggml_vk_create_buffer_device(size);
|
||||
|
||||
ggml_backend_vk_buffer_context * ctx = new ggml_backend_vk_buffer_context(dev_buffer);
|
||||
|
||||
|
@ -4326,9 +4397,12 @@ GGML_CALL static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffe
|
|||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
void * ptr = ggml_vk_host_malloc(size);
|
||||
|
||||
if (ptr == nullptr) {
|
||||
void * ptr = nullptr;
|
||||
try {
|
||||
ptr = ggml_vk_host_malloc(size);
|
||||
} catch (vk::SystemError& e) {
|
||||
std::cerr << "ggml_vulkan: Failed to allocate pinned memory." << std::endl;
|
||||
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
|
||||
// fallback to cpu buffer
|
||||
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
||||
}
|
||||
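Note (reading of the change above, not part of the patch): when the driver cannot provide more pinned host memory, the vk::SystemError thrown by ggml_vk_host_malloc is now caught, a warning is printed, and allocation falls back to a plain CPU buffer instead of aborting the load.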
|
@ -4389,7 +4463,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
|
|||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_backend_vk_set_tensor_async(" << size << ")" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_vk_buffer_type() && "unsupported buffer type");
|
||||
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
@ -4409,7 +4483,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
|
|||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_backend_vk_get_tensor_async(" << size << ")" << std::endl;
|
||||
#endif
|
||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_vk_buffer_type() && "unsupported buffer type");
|
||||
GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type() || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
@ -4429,7 +4503,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
|
|||
#ifdef VK_DEBUG
|
||||
std::cerr << "ggml_backend_vk_cpy_tensor_async()" << std::endl;
|
||||
#endif
|
||||
if (dst->buffer->buft == ggml_backend_vk_buffer_type() && ggml_backend_buffer_is_vk(src->buffer)) {
|
||||
if ((dst->buffer->buft == ggml_backend_vk_buffer_type() || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) {
|
||||
ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
|
||||
|
||||
|
@ -4483,8 +4557,15 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
|
|||
}
|
||||
ggml_vk_preallocate_buffers();
|
||||
|
||||
int last_node = cgraph->n_nodes - 1;
|
||||
|
||||
// If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
|
||||
while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
|
||||
last_node -= 1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
|
||||
ggml_vk_build_graph(cgraph->nodes[i], i == last_node);
|
||||
}
|
||||
|
||||
ggml_compute_params params = {};
|
||||
|
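Note (not part of the patch): the last_node search matters when the tail of the graph runs on the CPU, for example trailing reshape or copy ops; flagging the last GPU-backed node as the submission point is what keeps its command buffer from being left open, per the comment above.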
@ -4499,7 +4580,6 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
|
|||
|
||||
bool ok = ggml_vk_compute_forward(¶ms, node);
|
||||
if (!ok) {
|
||||
std::cerr << "Vulkan disable: " << vk_disable << std::endl;
|
||||
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
||||
}
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
|
@ -4665,7 +4745,7 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
|
|||
// checks
|
||||
|
||||
#ifdef GGML_VULKAN_CHECK_RESULTS
|
||||
void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<const ggml_tensor *>& done, int level = 0) {
|
||||
static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<const ggml_tensor *>& done, int level = 0) {
|
||||
if (std::find(done.begin(), done.end(), tensor) != done.end() || level > 10) {
|
||||
return;
|
||||
}
|
||||
|
@ -4683,10 +4763,14 @@ void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<const gg
|
|||
}
|
||||
}
|
||||
|
||||
void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
|
||||
static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, int i0, int i1, int i2, int i3) {
|
||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
||||
return;
|
||||
}
|
||||
i0 = std::max(i0, 5);
|
||||
i1 = std::max(i1, 5);
|
||||
i2 = std::max(i2, 0);
|
||||
i3 = std::max(i3, 0);
|
||||
fprintf(stderr, " ");
|
||||
for (int idx1 = i1 - 5; idx1 < i1 + 5; idx1++) {
|
||||
fprintf(stderr, "%7d ", idx1);
|
||||
|
@ -4698,9 +4782,9 @@ void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, in
|
|||
if (idx0 >= 0 && idx0 < tensor->ne[0] && idx1 >= 0 && idx1 < tensor->ne[1] && i2 >= 0 && i2 < tensor->ne[2] && i3 >= 0 && i3 < tensor->ne[3]) {
|
||||
float val;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
val = *(float *) ((char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
||||
val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
||||
} else if (tensor->type == GGML_TYPE_F16) {
|
||||
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
|
||||
val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
|
||||
}
|
||||
fprintf(stderr, "% 7.2f ", val);
|
||||
} else {
|
||||
|
@ -4711,14 +4795,16 @@ void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * data, in
|
|||
}
|
||||
}
|
||||
|
||||
void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
|
||||
static void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
|
||||
void * tensor_data = tensor->data;
|
||||
|
||||
if (tensor->backend == GGML_BACKEND_GPU) {
|
||||
const size_t tensor_size = ggml_nbytes(tensor);
|
||||
tensor_data = malloc(tensor_size);
|
||||
|
||||
ggml_vk_buffer_read((vk_buffer *)tensor->data, 0, tensor_data, tensor_size, vk_device.transfer_queue);
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
||||
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size);
|
||||
}
|
||||
|
||||
std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
|
||||
|
@ -4730,10 +4816,10 @@ void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
|
|||
std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " backend=" << tensor->src[1]->backend << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
|
||||
}
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
ggml_vk_print_tensor_area(tensor, tensor->data, 5, 5, 0, 0);
|
||||
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
|
||||
std::cerr << std::endl;
|
||||
std::cerr << std::endl << "Result:" << std::endl;
|
||||
ggml_vk_print_tensor_area(tensor, tensor->data, 5, 5, 1, 0);
|
||||
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 1, 0);
|
||||
std::cerr << std::endl;
|
||||
std::vector<const ggml_tensor *> done;
|
||||
ggml_vk_print_graph_origin(tensor, done);
|
||||
|
@ -4743,7 +4829,7 @@ void ggml_vk_print_tensor(const ggml_tensor * tensor, const char * name) {
|
|||
}
|
||||
}
|
||||
|
||||
void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
|
||||
static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
|
||||
return;
|
||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU);
|
||||
if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
|
||||
|
@ -4779,7 +4865,7 @@ void * comp_result;
|
|||
size_t comp_size;
|
||||
size_t comp_nb[GGML_MAX_DIMS];
|
||||
size_t check_counter = 0;
|
||||
void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) {
|
||||
static void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
|
@ -4796,8 +4882,9 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
|||
ggml_tensor * src1 = tensor->src[1];
|
||||
|
||||
struct ggml_init_params iparams = {
|
||||
.mem_size = 1024*1024*1024,
|
||||
.mem_buffer = NULL,
|
||||
/*.mem_size =*/ 1024*1024*1024,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ false,
|
||||
};
|
||||
|
||||
struct ggml_context * ctx = ggml_init(iparams);
|
||||
|
@ -4829,7 +4916,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
|||
for (int i3 = 0; i3 < src0->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < src0->ne[2]; i2++) {
|
||||
const int idx = i3*src0->ne[2] + i2;
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1], vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4842,7 +4929,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
|||
if (offset + src0_size >= extra->buffer_gpu.size) {
|
||||
src0_size = extra->buffer_gpu.size - offset;
|
||||
}
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset, src0_clone->data, src0_size, vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset, src0_clone->data, src0_size);
|
||||
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
}
|
||||
} else {
|
||||
|
@ -4872,7 +4959,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
|||
for (int i3 = 0; i3 < src1->ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < src1->ne[2]; i2++) {
|
||||
const int idx = i3*src1->ne[2] + i2;
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1], vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4885,7 +4972,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
|||
if (offset + src1_size >= extra->buffer_gpu.size) {
|
||||
src1_size = extra->buffer_gpu.size - offset;
|
||||
}
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset, src1_clone->data, src1_size, vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, offset, src1_clone->data, src1_size);
|
||||
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
|
||||
}
|
||||
} else {
|
||||
|
@ -4969,7 +5056,7 @@ void ggml_vk_check_results_0(ggml_compute_params * params, ggml_tensor * tensor)
|
|||
} else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
|
||||
if (src1 == nullptr) {
|
||||
tensor_clone = ggml_dup(ctx, src0_clone);
|
||||
tensor_clone->type == tensor->type;
|
||||
tensor_clone->type = tensor->type;
|
||||
} else {
|
||||
tensor_clone = ggml_cpy(ctx, src0_clone, src1_clone);
|
||||
}
|
||||
|
@ -5046,7 +5133,7 @@ void ggml_vk_check_results_1(ggml_compute_params * params, ggml_tensor * tensor)
|
|||
tensor_size = extra->buffer_gpu.size - (extra->offset);
|
||||
}
|
||||
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size, vk_device.transfer_queue);
|
||||
ggml_vk_buffer_read(&extra->buffer_gpu, extra->offset, tensor_data, tensor_size);
|
||||
}
|
||||
|
||||
float first_error_result = -1.0f;
|
||||
|
|
333
ggml.c
|
@ -218,6 +218,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|||
break;
|
||||
}
|
||||
GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
|
||||
GGML_ASSERT(false);
|
||||
return NULL;
|
||||
}
|
||||
return aligned_memory;
|
||||
|
@ -230,6 +231,38 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
inline static void * ggml_malloc(size_t size) {
    if (size == 0) {
        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
        return NULL;
    }
    void * result = malloc(size);
    if (result == NULL) {
        GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
        GGML_ASSERT(false);
    }
    return result;
}

// calloc
inline static void * ggml_calloc(size_t num, size_t size) {
    if (num == 0 || size == 0) {
        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
        return NULL;
    }
    void * result = calloc(num, size);
    if (result == NULL) {
        GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
        GGML_ASSERT(false);
    }
    return result;
}

#define GGML_MALLOC(size) ggml_malloc(size)
#define GGML_CALLOC(num, size) ggml_calloc(num, size)

#define GGML_FREE(ptr) free(ptr)

#define UNUSED GGML_UNUSED
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
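Note (not part of the patch): these wrappers back the GGML_MALLOC / GGML_CALLOC / GGML_FREE macros that the hash-set and GGUF code later in this diff switches to, so a failed allocation aborts with a size message instead of returning NULL unchecked, e.g.:

// before: result.keys = malloc(sizeof(struct ggml_tensor *) * size);
// after:  result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size); // aborts via GGML_ASSERT on failure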
|
||||
|
||||
|
@ -599,6 +632,17 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
},
|
||||
    [GGML_TYPE_IQ3_XXS] = {
        .type_name            = "iq3_xxs",
        .blck_size            = QK_K,
        .type_size            = sizeof(block_iq3_xxs),
        .is_quantized         = true,
        .to_float             = (ggml_to_float_t) dequantize_row_iq3_xxs,
        .from_float           = quantize_row_iq3_xxs,
        .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
        .vec_dot              = ggml_vec_dot_iq3_xxs_q8_K,
        .vec_dot_type         = GGML_TYPE_Q8_K,
    },
|
||||
[GGML_TYPE_Q8_K] = {
|
||||
.type_name = "q8_K",
|
||||
.blck_size = QK_K,
|
||||
|
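Note (illustrative sketch, not part of the patch; the helper name is hypothetical): registering the type in this table is what makes the generic helpers work for it. For example, with blck_size = QK_K and type_size = sizeof(block_iq3_xxs), the byte size of an IQ3_XXS row follows directly:

// n must be a multiple of QK_K
static size_t iq3_xxs_row_bytes(int64_t n) {
    return ggml_row_size(GGML_TYPE_IQ3_XXS, n); // == (n / QK_K) * sizeof(block_iq3_xxs)
}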
@ -2206,6 +2250,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|||
case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
|
||||
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
||||
}
|
||||
|
@ -5040,6 +5085,7 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
|||
bool inplace) {
|
||||
GGML_ASSERT(ggml_is_contiguous(a));
|
||||
if (mask) {
|
||||
GGML_ASSERT(mask->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(ggml_is_contiguous(mask));
|
||||
GGML_ASSERT(mask->ne[2] == 1);
|
||||
GGML_ASSERT(mask->ne[3] == 1);
|
||||
|
@ -5368,7 +5414,7 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
|
|||
int s0,
|
||||
int p0,
|
||||
int d0) {
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
|
||||
|
||||
struct ggml_tensor * result =
|
||||
ggml_mul_mat(ctx,
|
||||
|
@ -5446,16 +5492,15 @@ struct ggml_tensor * ggml_conv_depthwise_2d(
|
|||
int p1,
|
||||
int d0,
|
||||
int d1) {
|
||||
|
||||
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
|
||||
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
|
||||
s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
|
||||
|
||||
struct ggml_tensor * result =
|
||||
ggml_mul_mat(ctx,
|
||||
ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
|
||||
ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
|
||||
s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
|
||||
struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
|
||||
|
||||
new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
|
||||
struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
|
||||
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
|
||||
|
||||
return result;
|
||||
|
@ -5476,7 +5521,8 @@ struct ggml_tensor * ggml_im2col(
|
|||
int p1,
|
||||
int d0,
|
||||
int d1,
|
||||
bool is_2D) {
|
||||
bool is_2D,
|
||||
enum ggml_type dst_type) {
|
||||
|
||||
if(is_2D) {
|
||||
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
||||
|
@ -5500,7 +5546,7 @@ struct ggml_tensor * ggml_im2col(
|
|||
is_2D ? b->ne[3] : 1,
|
||||
};
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
|
||||
int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
|
||||
|
@ -5525,7 +5571,7 @@ struct ggml_tensor * ggml_conv_2d(
|
|||
int p1,
|
||||
int d0,
|
||||
int d1) {
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N, OH, OW, IC * KH * KW]
|
||||
|
||||
struct ggml_tensor * result =
|
||||
ggml_mul_mat(ctx,
|
||||
|
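Note (hedged sketch, not part of the patch): with the new dst_type argument a caller can request the im2col result directly in F32, which dispatches to the ggml_compute_forward_im2col_f32 path added further below; argument names follow ggml_conv_2d above:

struct ggml_tensor * cols = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, /*is_2D=*/true, GGML_TYPE_F32);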
@ -5651,12 +5697,13 @@ struct ggml_tensor * ggml_pool_2d(
|
|||
is_node = true;
|
||||
}
|
||||
|
||||
struct ggml_tensor * result;
|
||||
const int64_t ne[3] = {
|
||||
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
|
||||
ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
|
||||
a->ne[2],
|
||||
};
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
||||
result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
||||
|
||||
int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
|
@ -5664,7 +5711,6 @@ struct ggml_tensor * ggml_pool_2d(
|
|||
result->op = GGML_OP_POOL_2D;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -5809,6 +5855,8 @@ struct ggml_tensor * ggml_flash_attn_ext(
|
|||
GGML_ASSERT(ggml_is_contiguous(mask));
|
||||
GGML_ASSERT(mask->ne[2] == 1);
|
||||
GGML_ASSERT(mask->ne[3] == 1);
|
||||
GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
|
||||
"the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
|
||||
//GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
|
||||
}
|
||||
|
||||
|
@ -7652,6 +7700,7 @@ static void ggml_compute_forward_add(
|
|||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
|
||||
} break;
|
||||
|
@ -7918,6 +7967,7 @@ static void ggml_compute_forward_add1(
|
|||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
|
||||
} break;
|
||||
|
@ -8037,6 +8087,7 @@ static void ggml_compute_forward_acc(
|
|||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
|
@ -10788,6 +10839,7 @@ static void ggml_compute_forward_out_prod(
|
|||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
|
||||
} break;
|
||||
|
@ -10967,6 +11019,7 @@ static void ggml_compute_forward_set(
|
|||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
|
@ -11163,6 +11216,7 @@ static void ggml_compute_forward_get_rows(
|
|||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
ggml_compute_forward_get_rows_q(params, src0, src1, dst);
|
||||
} break;
|
||||
|
@ -11501,12 +11555,14 @@ static void ggml_compute_forward_soft_max_f32(
|
|||
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
||||
|
||||
// broadcast the mask across rows
|
||||
float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
|
||||
ggml_fp16_t * mp = src1 ? (ggml_fp16_t *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
|
||||
|
||||
ggml_vec_cpy_f32 (nc, wp, sp);
|
||||
ggml_vec_scale_f32(nc, wp, scale);
|
||||
if (mp) {
|
||||
ggml_vec_acc_f32(nc, wp, mp);
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
wp[i] += GGML_FP16_TO_FP32(mp[i]);
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
@ -11810,6 +11866,7 @@ static void ggml_compute_forward_alibi(
|
|||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_Q8_K:
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
|
@ -11886,6 +11943,7 @@ static void ggml_compute_forward_clamp(
|
|||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_Q8_K:
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
|
@ -12555,6 +12613,92 @@ static void ggml_compute_forward_conv_transpose_1d(
|
|||
}
|
||||
}
|
||||
|
||||
// src0: kernel [OC, IC, KH, KW]
|
||||
// src1: image [N, IC, IH, IW]
|
||||
// dst: result [N, OH, OW, IC*KH*KW]
|
||||
static void ggml_compute_forward_im2col_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
|
||||
const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
|
||||
const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
|
||||
const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
|
||||
const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
|
||||
const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t N = is_2D ? ne13 : ne12;
|
||||
const int64_t IC = is_2D ? ne12 : ne11;
|
||||
const int64_t IH = is_2D ? ne11 : 1;
|
||||
const int64_t IW = ne10;
|
||||
|
||||
const int64_t KH = is_2D ? ne01 : 1;
|
||||
const int64_t KW = ne00;
|
||||
|
||||
const int64_t OH = is_2D ? ne2 : 1;
|
||||
const int64_t OW = ne1;
|
||||
|
||||
int ofs0 = is_2D ? nb13 : nb12;
|
||||
int ofs1 = is_2D ? nb12 : nb11;
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
|
||||
{
|
||||
float * const wdata = (float *) dst->data;
|
||||
|
||||
for (int64_t in = 0; in < N; in++) {
|
||||
for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
|
||||
for (int64_t iow = 0; iow < OW; iow++) {
|
||||
for (int64_t iic = ith; iic < IC; iic += nth) {
|
||||
|
||||
// micro kernel
|
||||
float * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
|
||||
const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
|
||||
|
||||
for (int64_t ikh = 0; ikh < KH; ikh++) { // 1
|
||||
for (int64_t ikw = 0; ikw < KW; ikw++) {
|
||||
const int64_t iiw = iow*s0 + ikw*d0 - p0;
|
||||
const int64_t iih = ioh*s1 + ikh*d1 - p1;
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
|
||||
} else {
|
||||
dst_data[iic*(KH*KW) + ikh*KW + ikw] = (src_data[iih*IW + iiw]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// src0: kernel [OC, IC, KH, KW]
|
||||
// src1: image [N, IC, IH, IW]
|
||||
// dst: result [N, OH, OW, IC*KH*KW]
|
||||
|
@ -12645,14 +12789,14 @@ static void ggml_compute_forward_im2col(
|
|||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
switch (src0->type) {
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_im2col_f16(params, src0, src1, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
ggml_compute_forward_im2col_f32(params, src0, src1, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
|
@ -12843,8 +12987,8 @@ static void ggml_compute_forward_pool_2d(
|
|||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src,
|
||||
struct ggml_tensor * dst) {
|
||||
assert(src->type == GGML_TYPE_F32);
|
||||
assert(params->ith == 0);
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(params->ith == 0);
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
|
@ -13621,7 +13765,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
|||
|
||||
memset(V16, 0, D*sizeof(ggml_fp16_t));
|
||||
|
||||
const float * mp = mask ? (float *)((char *) mask->data + (ir%mask->ne[1])*mask->nb[1]) : NULL;
|
||||
const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
|
||||
|
||||
// k indices
|
||||
const int ik3 = iq3 / rk3;
|
||||
|
@ -13635,7 +13779,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
|||
// loop over n_kv and n_head_kv
|
||||
// ref: https://arxiv.org/pdf/2112.05682.pdf
|
||||
for (int64_t ic = 0; ic < nek1; ++ic) {
|
||||
const float mv = mp ? mp[ic] : 0.0f;
|
||||
const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
|
||||
if (mv == -INFINITY) {
|
||||
continue;
|
||||
}
|
||||
|
@ -15439,13 +15583,13 @@ struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
|||
size = ggml_hash_size(size);
|
||||
struct ggml_hash_set result;
|
||||
result.size = size;
|
||||
result.keys = malloc(sizeof(struct ggml_tensor *) * size);
|
||||
result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
|
||||
memset(result.keys, 0, sizeof(struct ggml_tensor *) * size);
|
||||
return result;
|
||||
}
|
||||
|
||||
static void ggml_hash_set_free(struct ggml_hash_set hash_set) {
|
||||
free(hash_set.keys);
|
||||
GGML_FREE(hash_set.keys);
|
||||
}
|
||||
|
||||
struct hash_map {
|
||||
|
@ -15454,17 +15598,17 @@ struct hash_map {
|
|||
};
|
||||
|
||||
static struct hash_map * ggml_new_hash_map(size_t size) {
|
||||
struct hash_map * result = malloc(sizeof(struct hash_map));
|
||||
struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
|
||||
result->set = ggml_hash_set_new(size);
|
||||
result->vals = malloc(sizeof(struct ggml_tensor *) * result->set.size);
|
||||
result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size);
|
||||
memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size);
|
||||
return result;
|
||||
}
|
||||
|
||||
static void ggml_hash_map_free(struct hash_map * map) {
|
||||
ggml_hash_set_free(map->set);
|
||||
free(map->vals);
|
||||
free(map);
|
||||
GGML_FREE(map->vals);
|
||||
GGML_FREE(map);
|
||||
}
|
||||
|
||||
// gradient checkpointing
|
||||
|
@ -17244,12 +17388,16 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|||
struct ggml_cplan cplan;
|
||||
memset(&cplan, 0, sizeof(struct ggml_cplan));
|
||||
|
||||
int max_tasks = 1;
|
||||
|
||||
// thread scheduling for the different operations + work buffer size estimation
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
||||
|
||||
max_tasks = MAX(max_tasks, n_tasks);
|
||||
|
||||
size_t cur = 0;
|
||||
|
||||
switch (node->op) {
|
||||
|
@ -17422,7 +17570,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|||
work_size += CACHE_LINE_SIZE*(n_threads - 1);
|
||||
}
|
||||
|
||||
cplan.n_threads = n_threads;
|
||||
cplan.n_threads = MIN(max_tasks, n_threads);
|
||||
cplan.work_size = work_size;
|
||||
cplan.work_data = NULL;
|
||||
|
||||
|
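Note (not part of the patch): capping the thread count at max_tasks means that, for example, a graph whose widest node only splits into 4 tasks now runs a 16-thread plan with 4 worker threads instead of 16.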
@ -19145,6 +19293,7 @@ void ggml_quantize_init(enum ggml_type type) {
|
|||
switch (type) {
|
||||
case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
|
||||
case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
|
||||
case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
|
||||
default: // nothing
|
||||
break;
|
||||
}
|
||||
|
@ -19407,6 +19556,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|||
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||
GGML_ASSERT(result == row_size * nrows);
|
||||
} break;
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
{
|
||||
GGML_ASSERT(start % QK_K == 0);
|
||||
GGML_ASSERT(start % n_per_row == 0);
|
||||
size_t start_row = start / n_per_row;
|
||||
size_t row_size = ggml_row_size(type, n_per_row);
|
||||
result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||
GGML_ASSERT(result == row_size * nrows);
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
size_t elemsize = sizeof(ggml_fp16_t);
|
||||
|
@ -19533,6 +19691,25 @@ struct gguf_context {
|
|||
void * data;
|
||||
};
|
||||
|
||||
static size_t gguf_type_size(enum gguf_type type) {
    GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
    return GGUF_TYPE_SIZE[type];
}

static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
    GGML_ASSERT(info->n_dims <= GGML_MAX_DIMS);
    GGML_ASSERT(0 <= info->type && info->type < GGML_TYPE_COUNT);

    for (uint32_t i = 0; i < info->n_dims; ++i) {
        GGML_ASSERT(info->ne[i] > 0);
    }

    // prevent overflow for total number of elements
    GGML_ASSERT(INT64_MAX/info->ne[1] > info->ne[0]);
    GGML_ASSERT(INT64_MAX/info->ne[2] > info->ne[0]*info->ne[1]);
    GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]);
}
|
||||
|
||||
static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
|
||||
const size_t n = fread(dst, 1, size, file);
|
||||
*offset += n;
|
||||
|
@ -19545,7 +19722,16 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
|
|||
|
||||
bool ok = true;
|
||||
|
||||
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
|
||||
ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
|
||||
|
||||
// early exit if string length is invalid, prevents from integer overflow
|
||||
if (p->n == SIZE_MAX) {
|
||||
fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
|
||||
return false;
|
||||
}
|
||||
|
||||
p->data = GGML_CALLOC(p->n + 1, 1);
|
||||
|
||||
ok = ok && gguf_fread_el(file, p->data, p->n, offset);
|
||||
|
||||
return ok;
|
||||
|
@ -19618,6 +19804,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
return NULL;
|
||||
}
|
||||
|
||||
// sanity-checks to prevent from integer/buffer overflows
|
||||
|
||||
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
|
||||
ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
|
||||
ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv));
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read header\n", __func__);
|
||||
fclose(file);
|
||||
|
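Note (illustrative sketch, not part of the patch; the helper name is hypothetical): the sanity checks above all follow the same shape, rejecting a count before the count * element-size multiplication that feeds the allocator can wrap:

static bool gguf_alloc_would_overflow(uint64_t n, size_t sz) {
    return sz != 0 && n >= SIZE_MAX / sz;
}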
@ -19628,7 +19820,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
|
||||
// read the kv pairs
|
||||
{
|
||||
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||
|
||||
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
|
||||
struct gguf_kv * kv = &ctx->kv[i];
|
||||
|
@ -19671,21 +19863,39 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
case GGUF_TYPE_FLOAT64:
|
||||
case GGUF_TYPE_BOOL:
|
||||
{
|
||||
kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
|
||||
// prevent from integer overflow in the malloc below
|
||||
if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
|
||||
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
|
||||
|
||||
ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
|
||||
} break;
|
||||
case GGUF_TYPE_STRING:
|
||||
{
|
||||
kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
|
||||
// prevent from integer overflow in the malloc below
|
||||
if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
|
||||
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
|
||||
fclose(file);
|
||||
gguf_free(ctx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
|
||||
|
||||
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
||||
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||
default: GGML_ASSERT(false && "invalid type"); break;
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
||||
default: GGML_ASSERT(false && "invalid type");
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
|
@ -19703,7 +19913,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
|
||||
// read the tensor infos
|
||||
{
|
||||
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
||||
ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
||||
|
||||
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||
|
@ -19714,12 +19924,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
|
||||
ok = ok && gguf_fread_str(file, &info->name, &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
|
||||
|
||||
ok = ok && (info->n_dims <= GGML_MAX_DIMS);
|
||||
|
||||
for (uint32_t j = 0; j < info->n_dims; ++j) {
|
||||
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
|
||||
}
|
||||
|
||||
ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
|
||||
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
|
||||
|
||||
gguf_tensor_info_sanitize(info);
|
||||
|
||||
if (!ok) {
|
||||
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
|
||||
fclose(file);
|
||||
|
@ -19873,12 +20089,12 @@ void gguf_free(struct gguf_context * ctx) {
|
|||
struct gguf_kv * kv = &ctx->kv[i];
|
||||
|
||||
if (kv->key.data) {
|
||||
free(kv->key.data);
|
||||
GGML_FREE(kv->key.data);
|
||||
}
|
||||
|
||||
if (kv->type == GGUF_TYPE_STRING) {
|
||||
if (kv->value.str.data) {
|
||||
free(kv->value.str.data);
|
||||
GGML_FREE(kv->value.str.data);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -19888,16 +20104,16 @@ void gguf_free(struct gguf_context * ctx) {
|
|||
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
||||
struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
|
||||
if (str->data) {
|
||||
free(str->data);
|
||||
GGML_FREE(str->data);
|
||||
}
|
||||
}
|
||||
}
|
||||
free(kv->value.arr.data);
|
||||
GGML_FREE(kv->value.arr.data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free(ctx->kv);
|
||||
GGML_FREE(ctx->kv);
|
||||
}
|
||||
|
||||
if (ctx->infos) {
|
||||
|
@ -19905,11 +20121,11 @@ void gguf_free(struct gguf_context * ctx) {
|
|||
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||
|
||||
if (info->name.data) {
|
||||
free(info->name.data);
|
||||
GGML_FREE(info->name.data);
|
||||
}
|
||||
}
|
||||
|
||||
free(ctx->infos);
|
||||
GGML_FREE(ctx->infos);
|
||||
}
|
||||
|
||||
GGML_ALIGNED_FREE(ctx);
|
||||
|
@ -20210,8 +20426,8 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
|
|||
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
||||
ctx->kv[idx].value.arr.type = type;
|
||||
ctx->kv[idx].value.arr.n = n;
|
||||
ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
|
||||
memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
|
||||
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
|
||||
memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
|
||||
}
|
||||
|
||||
void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
|
||||
|
@ -20220,7 +20436,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
|
|||
ctx->kv[idx].type = GGUF_TYPE_ARRAY;
|
||||
ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
|
||||
ctx->kv[idx].value.arr.n = n;
|
||||
ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
|
||||
ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
|
||||
for (int i = 0; i < n; i++) {
|
||||
struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
|
||||
str->n = strlen(data[i]);
|
||||
|
@ -20247,19 +20463,19 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
|
|||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
|
||||
const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
|
||||
const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
|
||||
for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
|
||||
data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
|
||||
}
|
||||
gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
|
||||
free((void *)data);
|
||||
GGML_FREE((void *)data);
|
||||
} else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
|
||||
GGML_ASSERT(false && "nested arrays not supported");
|
||||
} else {
|
||||
gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||
default: GGML_ASSERT(false && "invalid type"); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -20335,7 +20551,7 @@ struct gguf_buf {
|
|||
|
||||
static struct gguf_buf gguf_buf_init(size_t size) {
|
||||
struct gguf_buf buf = {
|
||||
/*buf.data =*/ size == 0 ? NULL : malloc(size),
|
||||
/*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size),
|
||||
/*buf.size =*/ size,
|
||||
/*buf.offset =*/ 0,
|
||||
};
|
||||
|
@ -20345,7 +20561,7 @@ static struct gguf_buf gguf_buf_init(size_t size) {
|
|||
|
||||
static void gguf_buf_free(struct gguf_buf buf) {
|
||||
if (buf.data) {
|
||||
free(buf.data);
|
||||
GGML_FREE(buf.data);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -20426,7 +20642,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
|
|||
case GGUF_TYPE_FLOAT64:
|
||||
case GGUF_TYPE_BOOL:
|
||||
{
|
||||
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
|
||||
gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
|
||||
} break;
|
||||
case GGUF_TYPE_STRING:
|
||||
{
|
||||
|
@ -20435,10 +20651,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
|
|||
}
|
||||
} break;
|
||||
case GGUF_TYPE_ARRAY:
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
|
||||
default: GGML_ASSERT(false && "invalid type"); break;
|
||||
}
|
||||
} break;
|
||||
case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
|
||||
default: GGML_ASSERT(false && "invalid type");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -20670,6 +20886,14 @@ int ggml_cpu_has_vulkan(void) {
|
|||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_kompute(void) {
#if defined(GGML_USE_KOMPUTE)
    return 1;
#else
    return 0;
#endif
}
|
||||
|
||||
int ggml_cpu_has_sycl(void) {
|
||||
#if defined(GGML_USE_SYCL)
|
||||
return 1;
|
||||
|
@ -20679,7 +20903,8 @@ int ggml_cpu_has_sycl(void) {
|
|||
}
|
||||
|
||||
int ggml_cpu_has_gpublas(void) {
|
||||
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_sycl();
|
||||
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
|
||||
ggml_cpu_has_sycl();
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sse3(void) {
|
||||
|
|
10
ggml.h
|
@ -353,6 +353,7 @@ extern "C" {
|
|||
GGML_TYPE_Q8_K = 15,
|
||||
GGML_TYPE_IQ2_XXS = 16,
|
||||
GGML_TYPE_IQ2_XS = 17,
|
||||
GGML_TYPE_IQ3_XXS = 18,
|
||||
GGML_TYPE_I8,
|
||||
GGML_TYPE_I16,
|
||||
GGML_TYPE_I32,
|
||||
|
@ -389,6 +390,7 @@ extern "C" {
|
|||
GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
|
@ -1494,7 +1496,8 @@ extern "C" {
|
|||
int p1,
|
||||
int d0,
|
||||
int d1,
|
||||
bool is_2D);
|
||||
bool is_2D,
|
||||
enum ggml_type dst_type);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
|
||||
struct ggml_context * ctx,
|
||||
|
@ -1643,10 +1646,12 @@ extern "C" {
|
|||
struct ggml_tensor * v,
|
||||
bool masked);
|
||||
|
||||
#define GGML_KQ_MASK_PAD 32
|
||||
|
||||
// q: [n_embd, n_batch, n_head, 1]
|
||||
// k: [n_embd, n_kv, n_head_kv, 1]
|
||||
// v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
|
||||
// mask: [n_kv, n_batch, 1, 1]
|
||||
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
||||
// res: [n_embd, n_head, n_batch, 1] !! permuted !!
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
|
||||
struct ggml_context * ctx,
|
||||
|
@ -2282,6 +2287,7 @@ extern "C" {
|
|||
GGML_API int ggml_cpu_has_cublas (void);
|
||||
GGML_API int ggml_cpu_has_clblast (void);
|
||||
GGML_API int ggml_cpu_has_vulkan (void);
|
||||
GGML_API int ggml_cpu_has_kompute (void);
|
||||
GGML_API int ggml_cpu_has_gpublas (void);
|
||||
GGML_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||
|
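
Note (not part of the diff): the updated mask shape for ggml_flash_attn_ext() requires the batch dimension to be rounded up to a multiple of GGML_KQ_MASK_PAD (32). A minimal sketch of that padding rule, assuming ggml.h's GGML_PAD rounding macro; the helper name kq_mask_rows is illustrative, not from the commit:

    #include "ggml.h"

    // Rows of the KQ mask: n_batch rounded up to a multiple of GGML_KQ_MASK_PAD.
    static int64_t kq_mask_rows(int64_t n_batch) {
        return GGML_PAD(n_batch, GGML_KQ_MASK_PAD); // e.g. 1 -> 32, 33 -> 64
    }
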
@@ -19,8 +19,8 @@ shader_int8_ext = """

# Type-specific defines
shader_f16_defines = """
#define QUANT_K 32
#define QUANT_R 2
#define QUANT_K 1
#define QUANT_R 1

#define A_TYPE float16_t
"""
1
kompute
Submodule
@@ -0,0 +1 @@
Subproject commit 4565194ed7c32d1d2efa32ceab4d3c6cae006306
102
kompute-shaders/common.comp
Normal file
@@ -0,0 +1,102 @@
#extension GL_EXT_shader_16bit_storage: require
|
||||
#extension GL_EXT_shader_8bit_storage: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
|
||||
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
|
||||
#extension GL_EXT_control_flow_attributes: enable
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#extension GL_EXT_debug_printf : enable
|
||||
|
||||
#define QK4_0 32
|
||||
#define QK4_1 32
|
||||
|
||||
#define GELU_COEF_A 0.044715
|
||||
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
|
||||
#define TWOPI_F 6.283185307179586f
|
||||
|
||||
#define QK_K 256
|
||||
|
||||
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
|
||||
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
|
||||
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
|
||||
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
|
||||
|
||||
#define sizeof_block_q4_0 0x12
|
||||
struct block_q4_0 {
|
||||
float16_t d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
};
|
||||
mat4 dequantize_q4_0(const block_q4_0 xb, uint il) {
|
||||
const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float md = -8.f * xb.d;
|
||||
const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
|
||||
const uint16_t mask1 = mask0 << 8;
|
||||
|
||||
mat4 reg;
|
||||
for (int i=0;i<8;i++) {
|
||||
uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
|
||||
reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md;
|
||||
reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
#define sizeof_block_q4_1 0x14
|
||||
struct block_q4_1 {
|
||||
float16_t d;
|
||||
float16_t m;
|
||||
uint8_t qs[QK4_1 / 2];
|
||||
};
|
||||
mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
|
||||
const float d1 = il != 0 ? (xb.d / 16.f) : xb.d;
|
||||
const float d2 = d1 / 256.f;
|
||||
const float m = xb.m;
|
||||
const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F);
|
||||
const uint16_t mask1 = mask0 << 8;
|
||||
|
||||
mat4 reg;
|
||||
for (int i=0;i<8;i++) {
|
||||
uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]);
|
||||
reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m;
|
||||
reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
#define sizeof_block_q6_k 210
|
||||
struct block_q6_k {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
||||
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
||||
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
||||
float16_t d; // super-block scale
|
||||
};
|
||||
mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
|
||||
const float16_t d_all = xb.d;
|
||||
|
||||
const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
|
||||
const uint qhIndex = 32*(il/8) + 16*(il&1);
|
||||
float16_t sc = xb.scales[(il%2) + 2 * ((il/2))];
|
||||
il = (il/2) & 3;
|
||||
|
||||
const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3);
|
||||
const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F);
|
||||
const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f);
|
||||
const float16_t ml = float16_t(d_all * sc * 32.f);
|
||||
const float16_t dl = float16_t(d_all * sc * coef);
|
||||
mat4 reg;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2))
|
||||
: ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4));
|
||||
reg[i/4][i%4] = dl * q - ml;
|
||||
}
|
||||
return reg;
|
||||
}
|
||||
|
||||
|
||||
#define QK8_0 32
|
||||
// struct block_q8_0 {
|
||||
// float16_t d; // delta
|
||||
// int8_t qs[QK8_0]; // quants
|
||||
// };
|
||||
#define sizeof_block_q8_0 34
|
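
Note (not part of the diff): as a reading aid for dequantize_q4_0 above, here is a plain C sketch of the same q4_0 block layout, i.e. one per-block scale d followed by QK4_0/2 bytes holding two 4-bit quants each, decoded as d * (q - 8). Names ending in _ref are illustrative, and the scale is kept as float for clarity (it is fp16 in the real block):

    #include <stdint.h>

    #define QK4_0_REF 32

    typedef struct {
        float   d;                    // per-block scale (fp16 in the real block)
        uint8_t qs[QK4_0_REF / 2];    // 32 quants packed two per byte
    } block_q4_0_ref;

    static void dequantize_q4_0_ref(const block_q4_0_ref * b, float * out) {
        for (int i = 0; i < QK4_0_REF / 2; ++i) {
            const int lo = (b->qs[i] & 0x0F) - 8; // low nibble
            const int hi = (b->qs[i] >>   4) - 8; // high nibble
            out[i]               = b->d * lo;
            out[i + QK4_0_REF/2] = b->d * hi;
        }
    }
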
58
kompute-shaders/op_add.comp
Normal file
@@ -0,0 +1,58 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb00;
|
||||
int nb01;
|
||||
int nb02;
|
||||
int nb03;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
int ne13;
|
||||
int nb10;
|
||||
int nb11;
|
||||
int nb12;
|
||||
int nb13;
|
||||
int ne0;
|
||||
int nb0;
|
||||
int nb1;
|
||||
int nb2;
|
||||
int nb3;
|
||||
//int offs; // TODO: needed for GGML_OP_ACC, see metal code
|
||||
} pcs;
|
||||
|
||||
// general-purpose kernel for addition of two tensors
|
||||
// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
|
||||
// cons: not very efficient
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint i13 = i03 % pcs.ne13;
|
||||
const uint i12 = i02 % pcs.ne12;
|
||||
const uint i11 = i01 % pcs.ne11;
|
||||
|
||||
int offs = 0; // TMP (see above)
|
||||
|
||||
uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4);
|
||||
uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 ) / 4);
|
||||
uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + offs) / 4);
|
||||
|
||||
for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
|
||||
const uint i10 = i0 % pcs.ne10;
|
||||
out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10];
|
||||
}
|
||||
}
|
25
kompute-shaders/op_addrow.comp
Normal file
@@ -0,0 +1,25 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
uint row;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 4;
|
||||
|
||||
for (uint x = 0; x < 4; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
|
||||
}
|
||||
}
|
52
kompute-shaders/op_cpy_f16_f16.comp
Normal file
@@ -0,0 +1,52 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float16_t
|
||||
#define IN_TYPE_SIZE 2
|
||||
#define OUT_TYPE float16_t
|
||||
#define OUT_TYPE_SIZE 2
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
52
kompute-shaders/op_cpy_f16_f32.comp
Normal file
@@ -0,0 +1,52 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float16_t
|
||||
#define IN_TYPE_SIZE 2
|
||||
#define OUT_TYPE float
|
||||
#define OUT_TYPE_SIZE 4
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
52
kompute-shaders/op_cpy_f32_f16.comp
Normal file
@@ -0,0 +1,52 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float
|
||||
#define IN_TYPE_SIZE 4
|
||||
#define OUT_TYPE float16_t
|
||||
#define OUT_TYPE_SIZE 2
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
52
kompute-shaders/op_cpy_f32_f32.comp
Normal file
@@ -0,0 +1,52 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define IN_TYPE float
|
||||
#define IN_TYPE_SIZE 4
|
||||
#define OUT_TYPE float
|
||||
#define OUT_TYPE_SIZE 4
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
|
||||
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne2;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
|
||||
|
||||
const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
|
||||
const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
|
||||
const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
|
||||
const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
|
||||
|
||||
const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
|
||||
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
|
||||
out_[dst_data+i00] = OUT_TYPE(in_[src]);
|
||||
}
|
||||
}
|
30
kompute-shaders/op_diagmask.comp
Normal file
@@ -0,0 +1,30 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
uint n_past;
|
||||
int ne00;
|
||||
int ne01;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i02 = gl_WorkGroupID.z;
|
||||
const uint i01 = gl_WorkGroupID.y;
|
||||
const uint i00 = gl_WorkGroupID.x;
|
||||
|
||||
const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00;
|
||||
|
||||
if (i00 > pcs.n_past + i01) {
|
||||
out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000);
|
||||
} else {
|
||||
out_[index + pcs.outOff] = in_[index + pcs.inOff];
|
||||
}
|
||||
}
|
22
kompute-shaders/op_gelu.comp
Normal file
@@ -0,0 +1,22 @@
#version 450

#include "common.comp"

layout(local_size_x = 1) in;

layout(binding = 0) buffer restrict readonly  tensorIn  { float in_[];  };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
} pcs;

void main() {
    const uint baseIndex = gl_WorkGroupID.x * 8;

    for (uint x = 0; x < 8; x++) {
        const uint i = baseIndex + x;
        const float y = in_[i + pcs.inOff];
        out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0)));
    }
}
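
Note (not part of the diff): op_gelu.comp evaluates the usual tanh approximation of GELU with the constants defined in common.comp. A scalar C sketch of the same expression, clamp included:

    #include <math.h>

    #define GELU_COEF_A    0.044715f
    #define SQRT_2_OVER_PI 0.79788456080286535587989211986876f

    static float gelu_ref(float y) {
        float t = SQRT_2_OVER_PI * y * (1.0f + GELU_COEF_A * y * y);
        if (t >  15.0f) t =  15.0f; // same clamp as the shader, keeps tanhf well-behaved
        if (t < -15.0f) t = -15.0f;
        return 0.5f * y * (1.0f + tanhf(t));
    }
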
17
kompute-shaders/op_getrows.comp
Normal file
@@ -0,0 +1,17 @@
void main() {
    const uint i = gl_WorkGroupID.x;
    const int r = inB[i + pcs.inBOff];

    int z = 0;
    for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) {
        const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK;
        const mat4 result = dequantize_block(inIndex, ind%NL);
        for (uint j = 0; j < 4; ++j) {
            for (uint k = 0; k < 4; ++k) {
                const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z;
                out_[outIndex] = result[j][k];
                ++z;
            }
        }
    }
}
31
kompute-shaders/op_getrows_f16.comp
Normal file
@@ -0,0 +1,31 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
|
||||
for (int j = 0; j < k; j++) {
|
||||
out_[y + j] = inA[x + j];
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
const uint i = gl_WorkGroupID.x;
|
||||
const int r = inB[i + pcs.inBOff];
|
||||
|
||||
dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00);
|
||||
}
|
38
kompute-shaders/op_getrows_q4_0.comp
Normal file
@@ -0,0 +1,38 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 2
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_0
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q4_0 get_unaligned_block_q4_0(uint index) {
|
||||
block_q4_0 fres;
|
||||
fres.d = u8BufToFloat16(inA, index);
|
||||
[[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
|
||||
fres.qs[it] = inA[index+2+it];
|
||||
}
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q4_0 block = get_unaligned_block_q4_0(index);
|
||||
return dequantize_q4_0(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
39
kompute-shaders/op_getrows_q4_1.comp
Normal file
@@ -0,0 +1,39 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 2
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_1
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q4_1 get_unaligned_block_q4_1(uint index) {
|
||||
block_q4_1 fres;
|
||||
fres.d = u8BufToFloat16(inA, index);
|
||||
fres.m = u8BufToFloat16(inA, index+2);
|
||||
[[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
|
||||
fres.qs[it] = inA[index+4+it];
|
||||
}
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q4_1 block = get_unaligned_block_q4_1(index);
|
||||
return dequantize_q4_1(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
44
kompute-shaders/op_getrows_q6_k.comp
Normal file
@@ -0,0 +1,44 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define NL 16
|
||||
#define BYTES_FOR_TYPE 4 /*bytes for float*/
|
||||
#define SIZE_OF_BLOCK sizeof_block_q6_k
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb01;
|
||||
int nb1;
|
||||
} pcs;
|
||||
|
||||
block_q6_k get_unaligned_block_q6_k(uint index) {
|
||||
block_q6_k fres;
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 2; it++) {
|
||||
fres.ql[it] = inA[index + it];
|
||||
}
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 4; it++) {
|
||||
fres.qh[it] = inA[index + QK_K/2 + it];
|
||||
}
|
||||
[[unroll]] for (uint it = 0; it != QK_K / 16; it++) {
|
||||
fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]);
|
||||
}
|
||||
fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16);
|
||||
return fres;
|
||||
}
|
||||
|
||||
mat4 dequantize_block(uint index, uint il) {
|
||||
const block_q6_k block = get_unaligned_block_q6_k(index);
|
||||
return dequantize_q6_k(block, il);
|
||||
}
|
||||
|
||||
#include "op_getrows.comp"
|
52
kompute-shaders/op_mul.comp
Normal file
@@ -0,0 +1,52 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1024) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int nb00;
|
||||
int nb01;
|
||||
int nb02;
|
||||
int nb03;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
int ne13;
|
||||
int nb10;
|
||||
int nb11;
|
||||
int nb12;
|
||||
int nb13;
|
||||
int ne0;
|
||||
int nb0;
|
||||
int nb1;
|
||||
int nb2;
|
||||
int nb3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint i13 = i03 % pcs.ne13;
|
||||
const uint i12 = i02 % pcs.ne12;
|
||||
const uint i11 = i01 % pcs.ne11;
|
||||
|
||||
uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4);
|
||||
uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4);
|
||||
uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1) / 4);
|
||||
|
||||
for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) {
|
||||
const uint i10 = i0 % pcs.ne10;
|
||||
out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10];
|
||||
}
|
||||
}
|
67
kompute-shaders/op_mul_mat_f16.comp
Normal file
@@ -0,0 +1,67 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
int ne10;
|
||||
int ne11;
|
||||
int ne12;
|
||||
uint nb10;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
int ne0;
|
||||
int ne1;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
||||
|
||||
#define N_F16_F32 4
|
||||
|
||||
void main() {
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint rb = gl_WorkGroupID.y*N_F16_F32;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb02*pcs.ne02;
|
||||
|
||||
const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
|
||||
|
||||
for (uint row = 0; row < N_F16_F32; ++row) {
|
||||
uint r1 = rb + row;
|
||||
if (r1 >= pcs.ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // Based from inB
|
||||
|
||||
float sumf = 0;
|
||||
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
|
||||
sumf += float(inA[x+i]) * float(inB[y+i]);
|
||||
}
|
||||
|
||||
const float all_sum = subgroupAdd(sumf);
|
||||
if (subgroupElect()) {
|
||||
out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
51
kompute-shaders/op_mul_mat_mat_f32.comp
Normal file
@@ -0,0 +1,51 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic : require
|
||||
#extension GL_EXT_debug_printf : enable
|
||||
|
||||
// device subgroup size
|
||||
layout (local_size_x_id = 0) in;
|
||||
|
||||
layout(binding = 0) readonly buffer tensorInA { float inA[]; };
|
||||
layout(binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne11;
|
||||
int ne12;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb11;
|
||||
uint nb12;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
}
|
||||
pcs;
|
||||
|
||||
|
||||
void main() {
|
||||
uvec3 gid = gl_WorkGroupID;
|
||||
|
||||
uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z;
|
||||
uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z;
|
||||
|
||||
const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
|
||||
const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB
|
||||
float sum = 0.0f;
|
||||
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
|
||||
sum += float(inA[x+i]) * float(inB[y+i]);
|
||||
}
|
||||
|
||||
const float all_sum = subgroupAdd(sum);
|
||||
if (subgroupElect()) {
|
||||
out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
33
kompute-shaders/op_mul_mat_q4_0.comp
Normal file
@@ -0,0 +1,33 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define BLOCKS_IN_QUANT QK4_0
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_0
|
||||
#define N_ROWS 4
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
// The q4_0 version of this function
|
||||
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||
vec2 acc = vec2(0.0, 0.0);
|
||||
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
float d = float(u8BufToFloat16(inA, index));
|
||||
float sumy = 0.0f;
|
||||
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||
const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
|
||||
|
||||
const float yl0 = inB[yb + i];
|
||||
const float yl1 = inB[yb + i + 1];
|
||||
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||
|
||||
sumy += yl0 + yl1 + yl8 + yl9;
|
||||
|
||||
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||
}
|
||||
return d * (sumy * -8.f + acc[0] + acc[1]);
|
||||
}
|
||||
|
||||
#include "op_mul_mv_q_n.comp"
|
35
kompute-shaders/op_mul_mat_q4_1.comp
Normal file
@@ -0,0 +1,35 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define BLOCKS_IN_QUANT QK4_1
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_1
|
||||
#define N_ROWS 4
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
// The q4_1 version of this function
|
||||
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
|
||||
vec2 acc = vec2(0.0, 0.0);
|
||||
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
float d = float(u8BufToFloat16(inA, index));
|
||||
float m = float(u8BufToFloat16(inA, index+2));
|
||||
|
||||
float sumy = 0.0f;
|
||||
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
|
||||
const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
|
||||
|
||||
const float yl0 = inB[yb + i];
|
||||
const float yl1 = inB[yb + i + 1];
|
||||
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
|
||||
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
|
||||
|
||||
sumy += yl0 + yl1 + yl8 + yl9;
|
||||
|
||||
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
|
||||
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
|
||||
}
|
||||
return d * (acc[0] + acc[1]) + sumy * m;
|
||||
}
|
||||
|
||||
#include "op_mul_mv_q_n.comp"
|
94
kompute-shaders/op_mul_mat_q6_k.comp
Normal file
@@ -0,0 +1,94 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define SIZE_OF_BLOCK sizeof_block_q6_k
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
layout(local_size_y_id = 1) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne10;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne01;
|
||||
int gqa;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint8_t kmask1 = uint8_t(0x03);
|
||||
const uint8_t kmask2 = uint8_t(0x0C);
|
||||
const uint8_t kmask3 = uint8_t(0x30);
|
||||
const uint8_t kmask4 = uint8_t(0xC0);
|
||||
|
||||
const uint nb = pcs.ne00/QK_K;
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint r2 = gl_WorkGroupID.z;
|
||||
|
||||
const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
|
||||
const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
|
||||
const uint x = row * nb + offset0; // Based from inA without base offset
|
||||
const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
// bits of invocation ID for gl_SubgroupSize=32:
|
||||
// x x x x x
|
||||
// 4 3 2 1 0
|
||||
// ( tid ) ix
|
||||
// ip ( il )
|
||||
|
||||
const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes
|
||||
const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0
|
||||
const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1
|
||||
const uint ip = tid/8; // first or second half of block (0 or 1)
|
||||
const uint il = tid%8; // each half has 8 parts, one per scale
|
||||
const uint n = 4; // 4 scales at a time (and 4 sums)
|
||||
const uint l0 = n*il; // offset into half-block, 0..28
|
||||
const uint is = 8*ip + l0/16; // 0, 1, 8, 9
|
||||
|
||||
const uint y_offset = 128*ip + l0;
|
||||
const uint q_offset_l = 64*ip + l0;
|
||||
const uint q_offset_h = 32*ip + l0;
|
||||
|
||||
for (uint i = ix; i < nb; i += block_stride) {
|
||||
|
||||
const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
|
||||
|
||||
const uint qlIndex = q_offset_l;
|
||||
const uint q2Index = qlIndex + QK_K/8;
|
||||
const uint qhIndex = q_offset_h;
|
||||
const uint y = yy + i * QK_K + y_offset;
|
||||
|
||||
float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
for (uint l = 0; l < n; ++l) {
|
||||
const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
|
||||
const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
|
||||
const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l];
|
||||
|
||||
sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
|
||||
sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
|
||||
sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
|
||||
sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
|
||||
}
|
||||
|
||||
float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
|
||||
sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
|
||||
}
|
||||
|
||||
const float tot = subgroupAdd(sumf);
|
||||
if (subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
|
||||
}
|
||||
}
|
73
kompute-shaders/op_mul_mat_q8_0.comp
Normal file
@@ -0,0 +1,73 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#include "op_mul_mv_q_n_pre.comp"
|
||||
|
||||
#define SIZE_OF_D 2
|
||||
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
||||
|
||||
#define NB_Q8_0 8
|
||||
|
||||
void main() {
|
||||
// NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
|
||||
if (gl_SubgroupInvocationID > 31)
|
||||
return;
|
||||
|
||||
const int nr = N_DST;
|
||||
const int nsg = N_SIMDGROUP;
|
||||
const int nw = N_SIMDWIDTH;
|
||||
|
||||
const int nb = pcs.ne00/QK8_0;
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint first_row = (r0 * nsg + gl_SubgroupID) * nr;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
|
||||
|
||||
const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA
|
||||
const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB
|
||||
|
||||
float yl[NB_Q8_0];
|
||||
float sumf[N_DST]={0.f, 0.f, 0.f, 0.f};
|
||||
|
||||
const uint ix = gl_SubgroupInvocationID.x/4;
|
||||
const uint il = gl_SubgroupInvocationID.x%4;
|
||||
|
||||
uint yb = y + ix * QK8_0 + NB_Q8_0*il;
|
||||
|
||||
// each thread in a SIMD group deals with NB_Q8_0 quants at a time
|
||||
for (uint ib = ix; ib < nb; ib += nw/4) {
|
||||
for (int i = 0; i < NB_Q8_0; ++i) {
|
||||
yl[i] = inB[yb + i];
|
||||
}
|
||||
|
||||
for (int row = 0; row < nr; row++) {
|
||||
const uint block_offset = (ib+row*nb) * sizeof_block_q8_0;
|
||||
float sumq = 0.f;
|
||||
for (int iq = 0; iq < NB_Q8_0; ++iq) {
|
||||
const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]);
|
||||
sumq += qs_iq * yl[iq];
|
||||
}
|
||||
const float16_t d = u8BufToFloat16(inA, x + block_offset);
|
||||
sumf[row] += sumq*d;
|
||||
}
|
||||
|
||||
yb += NB_Q8_0 * nw;
|
||||
}
|
||||
|
||||
for (int row = 0; row < nr; ++row) {
|
||||
const float tot = subgroupAdd(sumf[row]);
|
||||
if (subgroupElect() && first_row + row < pcs.ne01) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row] = tot;
|
||||
}
|
||||
}
|
||||
}
|
48
kompute-shaders/op_mul_mv_q_n.comp
Normal file
@@ -0,0 +1,48 @@
void main() {
|
||||
// NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
|
||||
if (gl_SubgroupInvocationID > 31)
|
||||
return;
|
||||
|
||||
const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
|
||||
|
||||
const uint x = offset0; // Based from inA without base offset
|
||||
const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
|
||||
|
||||
float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
|
||||
const uint ix = gl_SubgroupInvocationID/2;
|
||||
const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
|
||||
|
||||
uint yb = y + ix * BLOCKS_IN_QUANT + il;
|
||||
|
||||
//debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
|
||||
// gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
|
||||
// gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
|
||||
|
||||
for (uint ib = ix; ib < nb; ib += 16) {
|
||||
for (int row = 0; row < N_ROWS; row++) {
|
||||
const uint block_index = x + ib + row * nb;
|
||||
sumf[row] += block_q_n_dot_y(block_index, yb, il);
|
||||
}
|
||||
|
||||
yb += BLOCKS_IN_QUANT * 16;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_ROWS; ++row) {
|
||||
const float tot = subgroupAdd(sumf[row]);
|
||||
if (first_row + row < pcs.ne01 && subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
|
||||
}
|
||||
}
|
||||
}
|
22
kompute-shaders/op_mul_mv_q_n_pre.comp
Normal file
@@ -0,0 +1,22 @@
layout(local_size_x_id = 0) in;
|
||||
layout(local_size_y = 1) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne10;
|
||||
int ne12;
|
||||
int ne0;
|
||||
int ne1;
|
||||
uint r2;
|
||||
uint r3;
|
||||
} pcs;
|
84
kompute-shaders/op_norm.comp
Normal file
@@ -0,0 +1,84 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 256) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
uint ne00;
|
||||
uint nb01;
|
||||
float eps;
|
||||
} pcs;
|
||||
|
||||
shared float sum[gl_WorkGroupSize.x];
|
||||
|
||||
void main() {
|
||||
const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
|
||||
// MEAN
|
||||
// parallel sum
|
||||
sum[gl_LocalInvocationID.x] = 0.0;
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
sum[gl_LocalInvocationID.x] += in_[x+i00];
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
sum[0] /= float(pcs.ne00);
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
const float mean = sum[0];
|
||||
|
||||
// recenter
|
||||
const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
out_[y+i00] = in_[x+i00] - mean;
|
||||
}
|
||||
|
||||
// VARIANCE
|
||||
// parallel sum
|
||||
sum[gl_LocalInvocationID.x] = 0.0;
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
sum[0] /= float(pcs.ne00);
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
const float variance = sum[0];
|
||||
|
||||
const float scale = 1.0f/sqrt(variance + pcs.eps);
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
out_[y+i00] *= scale;
|
||||
}
|
||||
}
|
21
kompute-shaders/op_relu.comp
Normal file
@@ -0,0 +1,21 @@
#version 450

#include "common.comp"

layout(local_size_x = 1) in;

layout(binding = 0) buffer restrict readonly  tensorIn  { float in_[];  };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
} pcs;

void main() {
    const uint baseIndex = gl_WorkGroupID.x * 4;

    for (uint x = 0; x < 4; x++) {
        const uint i = baseIndex + x;
        out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]);
    }
}
53
kompute-shaders/op_rmsnorm.comp
Normal file
@@ -0,0 +1,53 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 512) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
uint ne00;
|
||||
uint nb01;
|
||||
float eps;
|
||||
} pcs;
|
||||
|
||||
shared float sum[gl_WorkGroupSize.x];
|
||||
|
||||
void main() {
|
||||
const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_
|
||||
|
||||
// parallel sum
|
||||
sum[gl_LocalInvocationID.x] = 0.0;
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00];
|
||||
}
|
||||
|
||||
// reduce
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
[[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
|
||||
if (gl_LocalInvocationID.x < i) {
|
||||
sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
}
|
||||
|
||||
// broadcast
|
||||
if (gl_LocalInvocationID.x == 0) {
|
||||
sum[0] /= float(pcs.ne00);
|
||||
}
|
||||
barrier();
|
||||
memoryBarrierShared();
|
||||
|
||||
const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
|
||||
|
||||
const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
|
||||
for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
|
||||
out_[y+i00] = in_[x+i00] * scale;
|
||||
}
|
||||
}
|
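
Note (not part of the diff): op_rmsnorm.comp above scales each row element by 1/sqrt(mean(x^2) + eps); the shared-memory loop is simply a parallel reduction of the sum of squares. A scalar C sketch of the same computation:

    #include <math.h>
    #include <stddef.h>

    static void rms_norm_ref(const float * x, float * y, size_t n, float eps) {
        float sum = 0.0f;
        for (size_t i = 0; i < n; ++i) {
            sum += x[i] * x[i];             // the "parallel sum" / reduce steps in the shader
        }
        const float scale = 1.0f / sqrtf(sum / (float) n + eps);
        for (size_t i = 0; i < n; ++i) {
            y[i] = x[i] * scale;
        }
    }
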
73
kompute-shaders/op_rope_f16.comp
Normal file
@@ -0,0 +1,73 @@
#version 450
|
||||
|
||||
#include "rope_common.comp"
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; };
|
||||
|
||||
void main() {
|
||||
const uint i3 = gl_WorkGroupID.z;
|
||||
const uint i2 = gl_WorkGroupID.y;
|
||||
const uint i1 = gl_WorkGroupID.x;
|
||||
|
||||
const bool is_neox = (pcs.mode & 2) != 0;
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
|
||||
|
||||
const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
|
||||
|
||||
const int p = inB[pcs.inBOff + i2];
|
||||
|
||||
float theta = float(p);
|
||||
|
||||
if (!is_neox) {
|
||||
for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
theta *= theta_scale;
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = float(inA[src]);
|
||||
const float x1 = float(inA[src+1]);
|
||||
|
||||
out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
|
||||
out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
|
||||
}
|
||||
} else {
|
||||
const float inv_ndims = -1.f/pcs.n_dims;
|
||||
for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
|
||||
const uint cur_rot = ic;
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
theta *= theta_scale;
|
||||
|
||||
const uint i0 = ic/2;
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = float(inA[src]);
|
||||
const float x1 = float(inA[src+pcs.n_dims/2]);
|
||||
|
||||
out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
|
||||
out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
|
||||
}
|
||||
|
||||
for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
|
||||
const uint i0 = ic;
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
|
||||
|
||||
out_[dst_data + 0] = inA[src + 0];
|
||||
out_[dst_data + 1] = inA[src + 1];
|
||||
}
|
||||
}
|
||||
}
|
73
kompute-shaders/op_rope_f32.comp
Normal file
@@ -0,0 +1,73 @@
#version 450
|
||||
|
||||
#include "rope_common.comp"
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
void main() {
|
||||
const uint i3 = gl_WorkGroupID.z;
|
||||
const uint i2 = gl_WorkGroupID.y;
|
||||
const uint i1 = gl_WorkGroupID.x;
|
||||
|
||||
const bool is_neox = (pcs.mode & 2) != 0;
|
||||
|
||||
float corr_dims[2];
|
||||
rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
|
||||
|
||||
const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
|
||||
|
||||
const int p = inB[pcs.inBOff + i2];
|
||||
|
||||
float theta = float(p);
|
||||
|
||||
if (!is_neox) {
|
||||
for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
theta *= theta_scale;
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = inA[src];
|
||||
const float x1 = inA[src+1];
|
||||
|
||||
out_[dst_data] = x0*cos_theta - x1*sin_theta;
|
||||
out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
} else {
|
||||
const float inv_ndims = -1.f/pcs.n_dims;
|
||||
for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
|
||||
const uint cur_rot = ic;
|
||||
|
||||
float cos_theta, sin_theta;
|
||||
rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
|
||||
|
||||
theta *= theta_scale;
|
||||
|
||||
const uint i0 = ic/2;
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
const float x0 = inA[src];
|
||||
const float x1 = inA[src+pcs.n_dims/2];
|
||||
|
||||
out_[dst_data] = x0*cos_theta - x1*sin_theta;
|
||||
out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
|
||||
const uint i0 = ic;
|
||||
|
||||
const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
|
||||
const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
|
||||
|
||||
out_[dst_data + 0] = inA[src + 0];
|
||||
out_[dst_data + 1] = inA[src + 1];
|
||||
}
|
||||
}
|
||||
}
|
19
kompute-shaders/op_scale.comp
Normal file
@@ -0,0 +1,19 @@
#version 450

#include "common.comp"

layout(local_size_x = 1) in;

layout(binding = 0) buffer restrict readonly  tensorIn  { float in_[];  };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };

layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
    float scale;
} pcs;

void main() {
    const uint i = gl_WorkGroupID.x;
    out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
}
23
kompute-shaders/op_scale_8.comp
Normal file
@@ -0,0 +1,23 @@
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
|
||||
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inOff;
|
||||
uint outOff;
|
||||
float scale;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint baseIndex = gl_WorkGroupID.x * 8;
|
||||
|
||||
for (uint x = 0; x < 8; x++) {
|
||||
const uint i = baseIndex + x;
|
||||
out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale;
|
||||
}
|
||||
}
|
22
kompute-shaders/op_silu.comp
Normal file
@@ -0,0 +1,22 @@
#version 450

#include "common.comp"

layout(local_size_x = 1) in;

layout(binding = 0) buffer restrict readonly  tensorIn  { float in_[];  };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
    uint inOff;
    uint outOff;
} pcs;

void main() {
    const uint baseIndex = gl_WorkGroupID.x * 4;

    for (uint x = 0; x < 4; x++) {
        const uint i = baseIndex + x;
        const float y = in_[i + pcs.inOff];
        out_[i + pcs.outOff] = y / (1.0 + exp(-y));
    }
}
56
kompute-shaders/op_softmax.comp
Normal file
@@ -0,0 +1,56 @@
// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)
|
||||
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
layout(local_size_x_id = 0) in;
|
||||
|
||||
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
|
||||
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
|
||||
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
|
||||
|
||||
layout(push_constant) uniform PushConstants {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne01;
|
||||
int ne02;
|
||||
float scale;
|
||||
int mask;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
if (gl_SubgroupInvocationID > 31)
|
||||
return;
|
||||
|
||||
const uint i03 = gl_WorkGroupID.z;
|
||||
const uint i02 = gl_WorkGroupID.y;
|
||||
const uint i01 = gl_WorkGroupID.x;
|
||||
|
||||
const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
|
||||
const uint psrc0 = extra_off + pcs.inAOff; // Based from inA
|
||||
const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
|
||||
const uint pdst = extra_off + pcs.outOff; // Based from out_
|
||||
|
||||
// parallel max
|
||||
float localMax = uintBitsToFloat(0xFF800000);
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
|
||||
localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f));
|
||||
}
|
||||
float max_ = subgroupMax(localMax);
|
||||
|
||||
// parallel sum
|
||||
float localSum = 0.0f;
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
|
||||
const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f) - max_);
|
||||
localSum += exp_psrc0;
|
||||
out_[pdst + i00] = exp_psrc0;
|
||||
}
|
||||
|
||||
const float sum = subgroupAdd(localSum);
|
||||
for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
|
||||
out_[pdst + i00] /= sum;
|
||||
}
|
||||
}
|
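
Note (not part of the diff): op_softmax.comp computes a scaled, optionally masked softmax over each row with the standard max-subtraction trick; the subgroup operations implement the max and sum reductions. A scalar C sketch of the same per-row computation:

    #include <math.h>
    #include <stddef.h>

    static void softmax_row_ref(const float * src, const float * mask, float * dst,
                                size_t n, float scale) {
        float max_v = -INFINITY;
        for (size_t i = 0; i < n; ++i) {
            const float v = src[i] * scale + (mask ? mask[i] : 0.0f);
            if (v > max_v) max_v = v;       // "parallel max" in the shader
        }
        float sum = 0.0f;
        for (size_t i = 0; i < n; ++i) {
            const float e = expf(src[i] * scale + (mask ? mask[i] : 0.0f) - max_v);
            dst[i] = e;
            sum   += e;                     // "parallel sum" in the shader
        }
        for (size_t i = 0; i < n; ++i) {
            dst[i] /= sum;
        }
    }
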
67
kompute-shaders/rope_common.comp
Normal file
@@ -0,0 +1,67 @@
#include "common.comp"
|
||||
|
||||
// TODO: use a local size of 32 or more (Metal uses 1024)
|
||||
layout(local_size_x = 1) in;
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int n_dims;
|
||||
int mode;
|
||||
int n_orig_ctx;
|
||||
float freq_base;
|
||||
float freq_scale;
|
||||
float ext_factor;
|
||||
float attn_factor;
|
||||
float beta_fast;
|
||||
float beta_slow;
|
||||
uint nb00;
|
||||
uint nb01;
|
||||
uint nb02;
|
||||
uint nb03;
|
||||
int ne0;
|
||||
uint nb0;
|
||||
uint nb1;
|
||||
uint nb2;
|
||||
uint nb3;
|
||||
} pcs;
|
||||
|
||||
float rope_yarn_ramp(const float low, const float high, const float i0) {
|
||||
const float y = (i0 / 2 - low) / max(0.001f, high - low);
|
||||
return 1.0f - min(1.0f, max(0.0f, y));
|
||||
}
|
||||
|
||||
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
|
||||
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
||||
void rope_yarn(
|
||||
float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale,
|
||||
out float cos_theta, out float sin_theta
|
||||
) {
|
||||
// Get n-d rotational scaling corrected for extrapolation
|
||||
float theta_interp = freq_scale * theta_extrap;
|
||||
float theta = theta_interp;
|
||||
if (ext_factor != 0.0f) {
|
||||
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
|
||||
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
||||
|
||||
// Get n-d magnitude scaling corrected for interpolation
|
||||
mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
|
||||
}
|
||||
cos_theta = cos(theta) * mscale;
|
||||
sin_theta = sin(theta) * mscale;
|
||||
}
|
||||
|
||||
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
||||
// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
||||
float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
|
||||
return n_dims * log(n_orig_ctx / (n_rot * TWOPI_F)) / (2 * log(base));
|
||||
}
|
||||
|
||||
void rope_yarn_corr_dims(
|
||||
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, out float dims[2]
|
||||
) {
|
||||
// start and end correction dims
|
||||
dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
|
||||
dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
|
||||
}
|
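The correction-factor comment above can be unpacked as follows (an illustrative derivation in the YaRN paper's terms, with max_pos_emb playing the role of n_orig_ctx): dimension pair d of a RoPE embedding rotates with wavelength lambda_d and completes r(d) full rotations over the original context, so setting r(d) = n_rot and solving for d yields exactly rope_yarn_corr_factor:

\lambda_d = 2\pi \cdot \mathrm{base}^{2d/n_\mathrm{dims}},
\qquad
r(d) = \frac{\mathrm{max\_pos\_emb}}{\lambda_d}
     = \frac{\mathrm{max\_pos\_emb}}{2\pi \cdot \mathrm{base}^{2d/n_\mathrm{dims}}}

r(d) = n_\mathrm{rot}
\;\Longrightarrow\;
d = \frac{n_\mathrm{dims}\,\ln\!\bigl(\mathrm{max\_pos\_emb} / (2\pi\, n_\mathrm{rot})\bigr)}{2\,\ln(\mathrm{base})}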
382  llama.cpp
|
@ -15,6 +15,8 @@
|
|||
# include "ggml-vulkan.h"
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
# include "ggml-sycl.h"
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
# include "ggml-kompute.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
|
@ -1160,10 +1162,10 @@ struct llama_mlock {
|
|||
#ifdef __APPLE__
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
|
||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
|
||||
#else
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
|
||||
"Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
|
||||
#endif
|
||||
|
||||
bool raw_lock(const void * addr, size_t size) const {
|
||||
|
@ -1315,6 +1317,11 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
|||
buft = ggml_backend_sycl_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
buft = ggml_backend_opencl_buffer_type();
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
buft = ggml_backend_kompute_buffer_type(gpu);
|
||||
if (buft == nullptr) {
|
||||
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (buft == nullptr) {
|
||||
|
@ -2362,6 +2369,7 @@ struct llama_model_loader {
|
|||
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
|
||||
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
|
||||
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
|
||||
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
|
@ -2707,9 +2715,10 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
|
||||
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
|
||||
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
|
@ -4109,7 +4118,7 @@ static bool llm_load_tensors(
|
|||
}
|
||||
|
||||
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
|
||||
static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
|
||||
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
|
||||
try {
|
||||
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
||||
|
||||
|
@ -4130,6 +4139,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
|
|||
return 0;
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
if (params.n_gpu_layers > 0 && (
|
||||
!(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
|
||||
|| !(
|
||||
model.ftype == LLAMA_FTYPE_ALL_F32 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
||||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
|
||||
)
|
||||
)) {
|
||||
// TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
|
||||
LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
|
||||
params.n_gpu_layers = 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!llm_load_tensors(
|
||||
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
|
||||
params.progress_callback, params.progress_callback_user_data
|
||||
|
@ -4677,126 +4702,6 @@ struct llm_build_context {
|
|||
ctx0 = nullptr;
|
||||
}
|
||||
}
|
||||
struct ggml_cgraph * build_orion() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
// if (model.layers[il].bq) {
|
||||
// Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
// cb(Qcur, "Qcur", il);
|
||||
// }
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
// if (model.layers[il].bk) {
|
||||
// Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
// cb(Kcur, "Kcur", il);
|
||||
// }
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
// if (model.layers[il].bv) {
|
||||
// Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
// cb(Vcur, "Vcur", il);
|
||||
// }
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, model.output_norm_b,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
|
||||
|
||||
struct ggml_cgraph * build_llama() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
@ -4816,7 +4721,7 @@ struct llm_build_context {
|
|||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
|
@ -5000,7 +4905,7 @@ struct llm_build_context {
|
|||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
|
@ -5121,7 +5026,7 @@ struct llm_build_context {
|
|||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
|
@ -5243,7 +5148,7 @@ struct llm_build_context {
|
|||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
||||
|
@ -5340,7 +5245,7 @@ struct llm_build_context {
|
|||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
if (do_rope_shift) {
|
||||
|
@ -5543,7 +5448,7 @@ struct llm_build_context {
|
|||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
@ -5633,7 +5538,7 @@ struct llm_build_context {
|
|||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
inpL = llm_build_norm(ctx0, inpL, hparams,
|
||||
|
@ -5726,7 +5631,7 @@ struct llm_build_context {
|
|||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
@ -5826,7 +5731,7 @@ struct llm_build_context {
|
|||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
|
@ -5949,7 +5854,7 @@ struct llm_build_context {
|
|||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
|
@ -6063,7 +5968,7 @@ struct llm_build_context {
|
|||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
|
@ -6184,7 +6089,7 @@ struct llm_build_context {
|
|||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
|
@ -6306,7 +6211,7 @@ struct llm_build_context {
|
|||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
|
@ -6413,7 +6318,7 @@ struct llm_build_context {
|
|||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
||||
|
@ -6511,7 +6416,7 @@ struct llm_build_context {
|
|||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
|
@ -6600,6 +6505,125 @@ struct llm_build_context {
|
|||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_orion() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, model.layers[il].attn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
// if (model.layers[il].bq) {
|
||||
// Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
// cb(Qcur, "Qcur", il);
|
||||
// }
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
// if (model.layers[il].bk) {
|
||||
// Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
// cb(Kcur, "Kcur", il);
|
||||
// }
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
// if (model.layers[il].bv) {
|
||||
// Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
// cb(Vcur, "Vcur", il);
|
||||
// }
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, model.output_norm_b,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph(
|
||||
|
@ -6889,11 +6913,6 @@ static int llama_decode_internal(
|
|||
n_threads = std::min(4, n_threads);
|
||||
}
|
||||
|
||||
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
|
||||
if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) {
|
||||
n_threads = 1;
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
||||
|
@ -9250,6 +9269,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
else if (new_type != GGML_TYPE_Q8_0) {
|
||||
new_type = GGML_TYPE_Q6_K;
|
||||
}
|
||||
} else if (name == "token_embd.weight") {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
||||
new_type = GGML_TYPE_Q2_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
||||
if (name.find("attn_v.weight") != std::string::npos) {
|
||||
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
||||
|
@ -9260,7 +9286,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
|
||||
++qs.i_ffn_down;
|
||||
}
|
||||
else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
|
||||
} else if (name.find("attn_v.weight") != std::string::npos) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
|
||||
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
|
||||
|
@ -9268,6 +9293,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 4) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
||||
}
|
||||
|
@ -9305,6 +9333,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
|
||||
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
//else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
// if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
|
||||
//}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
||||
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
|
||||
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
|
||||
|
@ -9336,13 +9367,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
} else if (name.find("attn_output.weight") != std::string::npos) {
|
||||
if (arch != LLM_ARCH_FALCON) {
|
||||
if (qs.model.hparams.n_expert == 8) {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
|
||||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
||||
new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
} else {
|
||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
||||
}
|
||||
|
@ -9385,7 +9417,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
bool convert_incompatible_tensor = false;
|
||||
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
||||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
|
||||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
|
||||
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
|
||||
new_type == GGML_TYPE_IQ3_XXS) {
|
||||
int nx = tensor->ne[0];
|
||||
int ny = tensor->ne[1];
|
||||
if (nx % QK_K != 0) {
|
||||
|
@ -9399,6 +9432,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|||
switch (new_type) {
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ2_XS:
|
||||
case GGML_TYPE_IQ3_XXS:
|
||||
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
|
||||
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
|
||||
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
||||
|
@ -9440,6 +9474,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break;
|
||||
|
||||
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||
}
|
||||
|
@ -10090,18 +10125,45 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|||
return result;
|
||||
}
|
||||
|
||||
int32_t llama_max_devices(void) {
|
||||
return LLAMA_MAX_DEVICES;
|
||||
size_t llama_max_devices(void) {
|
||||
#if defined(GGML_USE_METAL)
|
||||
return 1;
|
||||
#elif defined(GGML_USE_CUBLAS)
|
||||
return GGML_CUDA_MAX_DEVICES;
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
return GGML_SYCL_MAX_DEVICES;
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool llama_mmap_supported(void) {
|
||||
bool llama_supports_mmap(void) {
|
||||
return llama_mmap::SUPPORTED;
|
||||
}
|
||||
|
||||
bool llama_mlock_supported(void) {
|
||||
bool llama_supports_mlock(void) {
|
||||
return llama_mlock::SUPPORTED;
|
||||
}
|
||||
|
||||
bool llama_supports_gpu_offload(void) {
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
// deprecated:
|
||||
bool llama_mmap_supported(void) {
|
||||
return llama_supports_mmap();
|
||||
}
|
||||
|
||||
bool llama_mlock_supported(void) {
|
||||
return llama_supports_mlock();
|
||||
}
|
||||
|
||||
void llama_backend_init(bool numa) {
|
||||
ggml_time_init();
|
||||
|
||||
|
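The hunk above replaces the LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD compile-time macros with runtime queries and keeps the old names as deprecated wrappers. A minimal caller-side sketch of the new API (illustrative program, not part of the patch):

#include "llama.h"
#include <cstdio>

int main(void) {
    // Build capabilities are now queried at runtime instead of via macros.
    std::printf("max devices : %zu\n", llama_max_devices());
    std::printf("mmap        : %s\n", llama_supports_mmap()        ? "yes" : "no");
    std::printf("mlock       : %s\n", llama_supports_mlock()       ? "yes" : "no");
    std::printf("GPU offload : %s\n", llama_supports_gpu_offload() ? "yes" : "no");
    return 0;
}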
@ -10188,7 +10250,10 @@ struct llama_context * llama_new_context_with_model(
|
|||
const auto & hparams = model->hparams;
|
||||
auto & cparams = ctx->cparams;
|
||||
|
||||
cparams.n_batch = params.n_batch;
|
||||
// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
|
||||
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
|
||||
cparams.n_batch = std::max((uint32_t) GGML_KQ_MASK_PAD, params.n_batch);
|
||||
|
||||
cparams.n_threads = params.n_threads;
|
||||
cparams.n_threads_batch = params.n_threads_batch;
|
||||
cparams.yarn_ext_factor = params.yarn_ext_factor;
|
||||
|
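The n_batch clamp in the hunk above exists because the KQ mask is padded up to a multiple of GGML_KQ_MASK_PAD before GPU kernels such as ggml_flash_attn_ext read it. A small sketch of the round-up arithmetic this implies (the pad value of 32 is assumed here purely for illustration; the real constant comes from ggml.h):

#include <cstdint>

// Round x up to the next multiple of n, as GGML_PAD does for the mask rows.
static uint32_t round_up(uint32_t x, uint32_t n) {
    return ((x + n - 1) / n) * n;
}

// With an assumed GGML_KQ_MASK_PAD of 32:
//   round_up( 1, 32) == 32  -> a single-token batch still gets 32 mask rows
//   round_up(33, 32) == 64
// and n_batch itself is clamped to at least the pad, as in the hunk above.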
@ -10295,6 +10360,16 @@ struct llama_context * llama_new_context_with_model(
|
|||
}
|
||||
ctx->backends.push_back(backend);
|
||||
}
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
if (model->n_gpu_layers > 0) {
|
||||
auto * backend = ggml_backend_kompute_init(model->main_gpu);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
}
|
||||
#endif
|
||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||
if (ctx->backend_cpu == nullptr) {
|
||||
|
@ -10358,6 +10433,9 @@ struct llama_context * llama_new_context_with_model(
|
|||
|
||||
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
||||
|
||||
// zero-out the input buffer to prevent NaNs in padded tensors
|
||||
ggml_backend_buffer_clear(ctx->buf_input, 0);
|
||||
|
||||
LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buffer_name(ctx->buf_input),
|
||||
ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
|
||||
|
|
29  llama.h
|
@ -3,15 +3,7 @@
|
|||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
#include "ggml-sycl.h"
|
||||
#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
|
||||
#else
|
||||
#define LLAMA_MAX_DEVICES 1
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
@ -49,11 +41,6 @@
|
|||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 4
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
@ -111,6 +98,7 @@ extern "C" {
|
|||
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
@ -199,7 +187,7 @@ extern "C" {
|
|||
// LLAMA_SPLIT_LAYER: ignored
|
||||
int32_t main_gpu;
|
||||
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
||||
const float * tensor_split;
|
||||
|
||||
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
||||
|
@ -336,9 +324,14 @@ extern "C" {
|
|||
|
||||
LLAMA_API int64_t llama_time_us(void);
|
||||
|
||||
LLAMA_API int32_t llama_max_devices(void);
|
||||
LLAMA_API bool llama_mmap_supported (void);
|
||||
LLAMA_API bool llama_mlock_supported(void);
|
||||
LLAMA_API size_t llama_max_devices(void);
|
||||
|
||||
LLAMA_API bool llama_supports_mmap (void);
|
||||
LLAMA_API bool llama_supports_mlock (void);
|
||||
LLAMA_API bool llama_supports_gpu_offload(void);
|
||||
|
||||
LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
|
||||
LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
|
||||
|
||||
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
|
||||
|
||||
|
|
19  scripts/install-oneapi.bat  Normal file
@@ -0,0 +1,19 @@
:: MIT license
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT


set URL=%1
set COMPONENTS=%2

curl.exe --output %TEMP%\webimage.exe --url %URL% --retry 5 --retry-delay 5
start /b /wait %TEMP%\webimage.exe -s -x -f webimage_extracted --log extract.log
del %TEMP%\webimage.exe
if "%COMPONENTS%"=="" (
  webimage_extracted\bootstrapper.exe -s --action install --eula=accept -p=NEED_VS2017_INTEGRATION=0 -p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=.
) else (
  webimage_extracted\bootstrapper.exe -s --action install --components=%COMPONENTS% --eula=accept -p=NEED_VS2017_INTEGRATION=0 -p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=.
)
set installer_exit_code=%ERRORLEVEL%
rd /s/q "webimage_extracted"
exit /b %installer_exit_code%
|
@@ -1 +1 @@
f2a9472b23cf27e672ed70a2a6eb078f7b060f18
475cbad5c1c834e31e26a2283bc1413181644360
|
|
@ -227,6 +227,14 @@ static std::string var_to_str(ggml_type type) {
|
|||
return ggml_type_name(type);
|
||||
}
|
||||
|
||||
static std::string var_to_str(ggml_op_pool pool) {
|
||||
switch (pool) {
|
||||
case GGML_OP_POOL_AVG: return "avg";
|
||||
case GGML_OP_POOL_MAX: return "max";
|
||||
default: return std::to_string(pool);
|
||||
}
|
||||
}
|
||||
|
||||
#define VARS_TO_STR1(a) VAR_TO_STR(a)
|
||||
#define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b)
|
||||
#define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c)
|
||||
|
@ -238,6 +246,7 @@ static std::string var_to_str(ggml_type type) {
|
|||
#define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i)
|
||||
#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
|
||||
#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
|
||||
#define VARS_TO_STR12(a, b, c, d, e, f, g, h, i, j, k, l) VAR_TO_STR(a) + "," + VARS_TO_STR11(b, c, d, e, f, g, h, i, j, k, l)
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
static bool inline _isinf(float f) {
|
||||
|
@ -370,12 +379,15 @@ struct test_case {
|
|||
printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
|
||||
fflush(stdout);
|
||||
|
||||
// check if backends support op
|
||||
// check if the backends support the ops
|
||||
bool supported = true;
|
||||
for (ggml_backend_t backend : {backend1, backend2}) {
|
||||
if (!ggml_backend_supports_op(backend, out)) {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
if (!ggml_backend_supports_op(backend, t)) {
|
||||
printf("not supported [%s] ", ggml_backend_name(backend));
|
||||
supported = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!supported) {
|
||||
|
@ -626,6 +638,13 @@ struct test_unary : public test_case {
|
|||
ggml_tensor * out = ggml_unary(ctx, in, op);
|
||||
return out;
|
||||
}
|
||||
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
// test extended range of values to check for NaNs in GELU
|
||||
init_tensor_uniform(t, -150.f, 150.f);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_GET_ROWS
|
||||
|
@ -1066,18 +1085,24 @@ struct test_diag_mask_inf : public test_case {
|
|||
struct test_soft_max : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
const float scale;
|
||||
const bool mask;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR2(type, ne);
|
||||
return VARS_TO_STR4(type, ne, scale, mask);
|
||||
}
|
||||
|
||||
test_soft_max(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 10, 10, 10})
|
||||
: type(type), ne(ne) {}
|
||||
std::array<int64_t, 4> ne = {10, 10, 10, 10},
|
||||
float scale = 1.0f,
|
||||
bool mask = false)
|
||||
: type(type), ne(ne), scale(scale), mask(mask) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * out = ggml_soft_max(ctx, a);
|
||||
ggml_tensor * b = nullptr;
|
||||
if (mask) { b = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, ne[0], ne[1]); }
|
||||
ggml_tensor * out = ggml_soft_max_ext(ctx, a, b, scale);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
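With the extra scale and mask parameters, masked and scaled variants can be registered the same way as the default case; the values below are illustrative, not taken from the actual test list:

test_cases.emplace_back(new test_soft_max()); // f32, {10,10,10,10}, scale 1.0, no mask
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 16, 1, 1}, 0.1f, true)); // scaled + masked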
@ -1146,10 +1171,45 @@ struct test_alibi : public test_case {
|
|||
}
|
||||
};
|
||||
|
||||
// GGML_OP_POOL2D
|
||||
struct test_pool2d : public test_case {
|
||||
enum ggml_op_pool pool_type;
|
||||
const ggml_type type_input;
|
||||
const std::array<int64_t, 4> ne_input;
|
||||
// kernel size
|
||||
const int k0;
|
||||
const int k1;
|
||||
// stride
|
||||
const int s0;
|
||||
const int s1;
|
||||
// padding
|
||||
const int p0;
|
||||
const int p1;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR9(pool_type, type_input, ne_input, k0, k1, s0, s1, p0, p1);
|
||||
}
|
||||
|
||||
test_pool2d(ggml_op_pool pool_type = GGML_OP_POOL_AVG,
|
||||
ggml_type type_input = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
|
||||
int k0 = 3, int k1 = 3,
|
||||
int s0 = 1, int s1 = 1,
|
||||
int p0 = 1, int p1 = 1)
|
||||
: pool_type(pool_type), type_input(type_input), ne_input(ne_input), k0(k0), k1(k1), s0(s0), s1(s1), p0(p0), p1(p1) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
|
||||
ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_IM2COL
|
||||
struct test_im2col : public test_case {
|
||||
const ggml_type type_input;
|
||||
const ggml_type type_kernel;
|
||||
const ggml_type dst_type;
|
||||
const std::array<int64_t, 4> ne_input;
|
||||
const std::array<int64_t, 4> ne_kernel;
|
||||
// stride
|
||||
|
@ -1165,22 +1225,22 @@ struct test_im2col : public test_case {
|
|||
const bool is_2D;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR11(type_input, type_kernel, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
|
||||
return VARS_TO_STR12(type_input, type_kernel, dst_type, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
|
||||
}
|
||||
|
||||
test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16,
|
||||
test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
|
||||
std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
|
||||
int s0 = 1, int s1 = 1,
|
||||
int p0 = 1, int p1 = 1,
|
||||
int d0 = 1, int d1 = 1,
|
||||
bool is_2D = true)
|
||||
: type_input(type_input), type_kernel(type_kernel), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}
|
||||
: type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
|
||||
ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
|
||||
ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D);
|
||||
ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type);
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
@ -1412,7 +1472,7 @@ struct test_flash_attn_ext : public test_case {
|
|||
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
|
||||
ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
||||
ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
||||
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, kv, nb, 1, 1);
|
||||
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
|
||||
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs));
|
||||
return out;
|
||||
}
|
||||
|
@ -1446,7 +1506,7 @@ struct test_attn : public test_case {
|
|||
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
|
||||
ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
||||
ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, hs, nh, 1); // transposed
|
||||
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, kv, nb, 1, 1);
|
||||
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, nb, 1, 1);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
|
@ -1544,6 +1604,393 @@ struct test_moe : public test_case {
|
|||
}
|
||||
};
|
||||
|
||||
|
||||
enum llm_norm_type {
|
||||
LLM_NORM,
|
||||
LLM_NORM_RMS,
|
||||
};
|
||||
|
||||
struct llama_hparams {
|
||||
uint32_t n_vocab;
|
||||
uint32_t n_embd;
|
||||
uint32_t n_head;
|
||||
uint32_t n_head_kv;
|
||||
static constexpr uint32_t n_layer = 1;
|
||||
uint32_t n_rot;
|
||||
uint32_t n_embd_head; // dimension of values (d_v)
|
||||
uint32_t n_ff;
|
||||
|
||||
float f_norm_eps;
|
||||
float f_norm_rms_eps;
|
||||
|
||||
// cparams
|
||||
static constexpr uint32_t n_ctx = 512; // user-specified context size
|
||||
static constexpr uint32_t n_orig_ctx = n_ctx;
|
||||
|
||||
// batch
|
||||
int32_t n_tokens;
|
||||
|
||||
// llm_build_context
|
||||
static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx
|
||||
static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache
|
||||
|
||||
uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads
|
||||
return n_embd_head * n_head_kv;
|
||||
}
|
||||
};
|
||||
|
||||
// LLM base class
|
||||
struct test_llm : public test_case {
|
||||
llama_hparams hp;
|
||||
|
||||
protected:
|
||||
test_llm(llama_hparams hp)
|
||||
: hp(std::move(hp)) {
|
||||
}
|
||||
|
||||
public:
|
||||
struct ggml_tensor * llm_build_norm(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * cur,
|
||||
struct ggml_tensor * mw,
|
||||
struct ggml_tensor * mb,
|
||||
llm_norm_type type) {
|
||||
switch (type) {
|
||||
case LLM_NORM: cur = ggml_norm (ctx, cur, hp.f_norm_eps); break;
|
||||
case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); break;
|
||||
}
|
||||
cur = ggml_mul(ctx, cur, mw);
|
||||
if (mb) {
|
||||
cur = ggml_add(ctx, cur, mb);
|
||||
}
|
||||
return cur;
|
||||
}
|
||||
|
||||
void llm_build_kv_store(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * k_l,
|
||||
struct ggml_tensor * v_l,
|
||||
struct ggml_tensor * k_cur,
|
||||
struct ggml_tensor * v_cur) {
|
||||
// compute the transposed [n_tokens, n_embd] V matrix
|
||||
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, hp.n_embd_gqa(), hp.n_tokens));
|
||||
|
||||
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens*hp.n_embd_gqa(),
|
||||
(ggml_row_size(k_l->type, hp.n_embd_gqa()))*hp.kv_head);
|
||||
|
||||
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(),
|
||||
( hp.n_ctx)*ggml_element_size(v_l),
|
||||
(hp.kv_head)*ggml_element_size(v_l));
|
||||
|
||||
// important: storing RoPE-ed version of K in the KV cache!
|
||||
ggml_cpy(ctx, k_cur, k_cache_view);
|
||||
ggml_cpy(ctx, v_cur_t, v_cache_view);
|
||||
}
|
||||
|
||||
// if max_alibi_bias > 0 then apply ALiBi
|
||||
struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * k_l,
|
||||
struct ggml_tensor * v_l,
|
||||
struct ggml_tensor * q_cur,
|
||||
struct ggml_tensor * kq_mask,
|
||||
float kq_scale) {
|
||||
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
|
||||
|
||||
struct ggml_tensor * k =
|
||||
ggml_view_3d(ctx, k_l,
|
||||
hp.n_embd_head, hp.n_kv, hp.n_head_kv,
|
||||
ggml_row_size(k_l->type, hp.n_embd_gqa()),
|
||||
ggml_row_size(k_l->type, hp.n_embd_head),
|
||||
0);
|
||||
|
||||
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||
|
||||
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
|
||||
|
||||
// split cached v into n_head heads
|
||||
struct ggml_tensor * v =
|
||||
ggml_view_3d(ctx, v_l,
|
||||
hp.n_kv, hp.n_embd_head, hp.n_head_kv,
|
||||
ggml_element_size(v_l)*hp.n_ctx,
|
||||
ggml_element_size(v_l)*hp.n_ctx*hp.n_embd_head,
|
||||
0);
|
||||
|
||||
struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
|
||||
|
||||
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
||||
|
||||
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head*hp.n_head, hp.n_tokens);
|
||||
|
||||
struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
|
||||
cur = ggml_mul_mat(ctx, wo, cur);
|
||||
|
||||
return cur;
|
||||
}
|
||||
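For reference, llm_build_kqv above assembles standard scaled-dot-product attention per head, with m the broadcast KQ_mask and s the kq_scale argument (a sketch of the math, not text from the patch):

\mathrm{head}_h = V_h \,\operatorname{softmax}\!\left(s\, K_h\, q_h + m\right),
\qquad
\mathrm{cur} = W_o \,\bigl[\mathrm{head}_1; \dots; \mathrm{head}_{n_\mathrm{head}}\bigr]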
|
||||
void initialize_tensors(ggml_context * ctx) override {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
if (t->type == GGML_TYPE_I32) {
|
||||
// pos
|
||||
std::vector<int> data(hp.n_tokens);
|
||||
for (int i = 0; i < hp.n_tokens; i++) {
|
||||
data[i] = rand() % hp.n_ctx;
|
||||
}
|
||||
ggml_backend_tensor_set(t, data.data(), 0, hp.n_tokens * sizeof(int));
|
||||
} else {
|
||||
init_tensor_uniform(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Llama
|
||||
struct test_llama : public test_llm {
|
||||
static constexpr float freq_base = 10000.0f;
|
||||
static constexpr float freq_scale = 1.0f;
|
||||
static constexpr float ext_factor = 0.0f;
|
||||
static constexpr float attn_factor = 1.0f;
|
||||
static constexpr float beta_fast = 32.0f;
|
||||
static constexpr float beta_slow = 1.0f;
|
||||
|
||||
std::string op_desc(ggml_tensor * t) override {
|
||||
GGML_UNUSED(t);
|
||||
return "LLAMA";
|
||||
}
|
||||
|
||||
std::string vars() override {
|
||||
auto n_tokens = hp.n_tokens;
|
||||
return VARS_TO_STR1(n_tokens);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 2e-3;
|
||||
}
|
||||
|
||||
test_llama(int n_tokens = 1)
|
||||
: test_llm({
|
||||
/*n_vocab =*/ 32000,
|
||||
/*n_embd =*/ 3200,
|
||||
/*n_head =*/ 32,
|
||||
/*n_head_kv =*/ 32,
|
||||
/*n_rot =*/ 100,
|
||||
/*n_embd_head =*/ 100,
|
||||
/*n_ff =*/ 8640,
|
||||
/*f_norm_eps =*/ 0.f,
|
||||
/*f_norm_rms_eps =*/ 1e-5f,
|
||||
/*n_tokens =*/ n_tokens,
|
||||
}) {
|
||||
}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
|
||||
|
||||
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
||||
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
||||
|
||||
for (uint32_t il = 0; il < hp.n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
||||
cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
|
||||
ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
|
||||
ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx, wq, cur);
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos,
|
||||
hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos,
|
||||
hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
||||
llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
|
||||
|
||||
cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA);
|
||||
|
||||
// feed-forward network
|
||||
ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
||||
cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS);
|
||||
|
||||
ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
||||
ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
|
||||
ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
|
||||
struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur);
|
||||
cur = ggml_mul_mat(ctx, ffn_gate, cur);
|
||||
cur = ggml_silu(ctx, cur);
|
||||
cur = ggml_mul(ctx, cur, tmp);
|
||||
cur = ggml_mul_mat(ctx, ffn_down, cur);
|
||||
|
||||
cur = ggml_add(ctx, cur, ffn_inp);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
|
||||
cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS);
|
||||
|
||||
// lm_head
|
||||
ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_vocab);
|
||||
cur = ggml_mul_mat(ctx, output, cur);
|
||||
|
||||
return cur;
|
||||
}
|
||||
};
|
||||
|
||||
// Falcon
|
||||
struct test_falcon : public test_llm {
|
||||
static constexpr float freq_base = 10000.0f;
|
||||
static constexpr float freq_scale = 1.0f;
|
||||
static constexpr float ext_factor = 0.0f;
|
||||
static constexpr float attn_factor = 1.0f;
|
||||
static constexpr float beta_fast = 32.0f;
|
||||
static constexpr float beta_slow = 1.0f;
|
||||
|
||||
std::string op_desc(ggml_tensor * t) override {
|
||||
GGML_UNUSED(t);
|
||||
return "FALCON";
|
||||
}
|
||||
|
||||
std::string vars() override {
|
||||
auto n_tokens = hp.n_tokens;
|
||||
return VARS_TO_STR1(n_tokens);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 2e-3;
|
||||
}
|
||||
|
||||
test_falcon(int n_tokens = 1)
|
||||
: test_llm({
|
||||
/*n_vocab =*/ 32000,
|
||||
/*n_embd =*/ 3200,
|
||||
/*n_head =*/ 50,
|
||||
/*n_head_kv =*/ 1,
|
||||
/*n_rot =*/ 64,
|
||||
/*n_embd_head =*/ 64,
|
||||
/*n_ff =*/ 8640,
|
||||
/*f_norm_eps =*/ 1e-5f,
|
||||
/*f_norm_rms_eps =*/ 0.f,
|
||||
/*n_tokens =*/ n_tokens,
|
||||
}) {
|
||||
}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
|
||||
|
||||
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
||||
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
||||
|

        for (uint32_t il = 0; il < hp.n_layer; ++il) {
            // norm
            ggml_tensor * attn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
            ggml_tensor * attn_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
            ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM);

            // self-attention
            {
                cur = attn_norm;

                ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2*hp.n_embd_gqa());

                cur = ggml_mul_mat(ctx, wqkv, cur);

                struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd, hp.n_tokens, cur->nb[1], 0*sizeof(float)*(hp.n_embd)));
                struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd)));
                struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd + hp.n_embd_gqa())));

                Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens);
                Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);

                // using mode = 2 for neox mode
                Qcur = ggml_rope_custom(
                    ctx, Qcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                );

                Kcur = ggml_rope_custom(
                    ctx, Kcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                );
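                // The two ggml_rope_custom calls above apply rotary embeddings to Q and K
                // using NeoX ordering (mode = 2); with ext_factor = 0 the YaRN-style
                // extension terms drop out, so the constexpr freq_base/freq_scale above
                // are the effective scaling parameters.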

                llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);

                cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
            }

            struct ggml_tensor * ffn_inp = cur;

            // feed forward
            {
                ggml_tensor * ffn_up   = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
                ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
                cur = attn_norm;
                cur = ggml_mul_mat(ctx, ffn_up, cur);
                cur = ggml_gelu(ctx, cur);
                cur = ggml_mul_mat(ctx, ffn_down, cur);
            }

            cur = ggml_add(ctx, cur, ffn_inp);

            cur = ggml_add(ctx, cur, inpL);
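            // The two adds above give Falcon's parallel block layout: attention and the
            // feed-forward network both read from attn_norm, and their outputs are summed
            // with the residual inpL rather than being chained sequentially.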

            // input for next layer
            inpL = cur;
        }

        cur = inpL;

        ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
        ggml_tensor * output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
        cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM);

        // lm_head
        ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, hp.n_embd, hp.n_vocab);
        cur = ggml_mul_mat(ctx, output, cur);

        return cur;
    }
};

static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
    std::vector<std::unique_ptr<test_case>> test_cases;
    std::default_random_engine rng(0);

@@ -1557,6 +2004,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
        GGML_TYPE_Q6_K,
        GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
        GGML_TYPE_IQ3_XXS,
    };

    // unary ops

@@ -1578,6 +2026,27 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        }
    }

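    // sweep 2D pooling over every combination of kernel size, stride and padding,
    // for both average and max pooling, on a fixed 10x10x3 input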
    for (ggml_type type_input : {GGML_TYPE_F32}) {
        for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
            for (int k0 : {1, 3}) {
                for (int k1 : {1, 3}) {
                    for (int s0 : {1, 2}) {
                        for (int s1 : {1, 2}) {
                            for (int p0 : {0, 1}) {
                                for (int p1 : {0, 1}) {
                                    test_cases.emplace_back(new test_pool2d(pool_type, type_input, {10, 10, 3, 1}, k0, k1, s0, s1, p0, p1));
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32));
    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16));

    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1}));
    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1}));

@@ -1593,8 +2062,10 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3}));
    test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3}));

    for (ggml_type type : all_types) {
        test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
    for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
        for (ggml_type type_dst : all_types) {
            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
        }
    }

    test_cases.emplace_back(new test_cont());

@@ -1696,6 +2167,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        exponent <<= 1;
    }

    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, 0.1f));
    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, 0.1f, true));

    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
        test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512)); // llama 7B
        test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512)); // llama 13B

@@ -1710,7 +2184,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }

    test_cases.emplace_back(new test_alibi());
    test_cases.emplace_back(new test_im2col());
    test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
    test_cases.emplace_back(new test_concat(GGML_TYPE_I32));

@@ -1726,6 +2199,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_pad());
    test_cases.emplace_back(new test_leaky_relu());

#if 0
    for (int hs : { 64, 80, 96, 112, 128, 256, }) {
        for (int nh : { 32, }) {
            for (int kv : { 512, 1024, 2048, 4096, }) {

@@ -1736,6 +2210,18 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
            }
        }
    }
#else
    for (int hs : { 128, }) {
        for (int nh : { 32, }) {
            for (int kv : { 512, 1024, }) {
                for (int nb : { 1, 2, 4, 8, 512 }) {
                    test_cases.emplace_back(new test_attn (hs, nh, kv, nb));
                    test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb));
                }
            }
        }
    }
#endif
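
    // test_attn presumably composes the attention block out of existing mul_mat/soft_max
    // ops, while test_flash_attn_ext exercises the fused flash-attention path this branch
    // adds; running both over the same head sizes (hs), KV lengths (kv) and batch sizes
    // (nb) lets backends be compared for matching results. The larger #if 0 sweep above
    // is presumably kept around for ad-hoc debugging.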

#if !defined(__SANITIZE_THREAD__)
    // FIXME: these tests use too much memory with thread sanitizer
@@ -1743,6 +2229,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
#endif

    // these tests are disabled to save execution time, but they can be handy for debugging
#if 0
    test_cases.emplace_back(new test_llama(1));
    test_cases.emplace_back(new test_llama(2));
    test_cases.emplace_back(new test_falcon(1));
    test_cases.emplace_back(new test_falcon(2));
#endif

    // run tests
    if (mode == MODE_TEST) {
        ggml_backend_t backend_cpu = ggml_backend_cpu_init();

@@ -1,3 +1,7 @@
#include "llama.h"

#ifdef GGML_USE_KOMPUTE
#include "ggml-kompute.h"
#endif

int main(void) {}

@@ -17,7 +17,9 @@ constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f;
constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
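// The *_3BITS_XXS and *_LOWBIT thresholds give the IQ3_XXS and IQ2 types their own,
// looser error budgets (see their use in main below); the exact values appear to be
// tuned empirically.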

static const char* RESULT_STR[] = {"ok", "FAILED"};

@@ -135,18 +137,21 @@ int main(int argc, char * argv[]) {
        }

        const ggml_type ei = (ggml_type)i;

        if (ei == GGML_TYPE_IQ2_XXS || ei == GGML_TYPE_IQ2_XS) {
            printf("Skip %s due to missing quantization functionality\n", ggml_type_name(ei));
            continue;
        }

        printf("Testing %s\n", ggml_type_name((ggml_type) i));
        ggml_quantize_init(ei);

        if (qfns.from_float && qfns.to_float) {
            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
            const float max_quantization_error =
                type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
                type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : MAX_QUANTIZATION_TOTAL_ERROR;
                type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
                type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR;
            failed = !(total_error < max_quantization_error);
            num_failed += failed;
            if (failed || verbose) {

@@ -161,7 +166,9 @@ int main(int argc, char * argv[]) {
        }

        const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
        failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR);
        const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
                                        type == GGML_TYPE_IQ3_XXS ? MAX_DOT_PRODUCT_ERROR_LOWBIT : MAX_DOT_PRODUCT_ERROR;
        failed = !(vec_dot_error < max_allowed_error);
        num_failed += failed;
        if (failed || verbose) {
            printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);

@@ -278,6 +278,8 @@ int main(int argc, char * argv[]) {
        if (qfns.from_float && qfns.to_float) {
            printf("%s\n", ggml_type_name(type));

            ggml_quantize_init(type);

            if (params.op_quantize_row_q_reference) {
                printf("  quantize_row_q_reference\n");
                for (size_t size : params.test_sizes) {