Merge branch 'sycl_win_build' of https://github.com/NeoZhangJianyu/llama.cpp into sycl_win_build

This commit is contained in:
Zhang 2024-01-30 17:40:53 +08:00
commit 2f1262f46e
52 changed files with 4467 additions and 71 deletions

1
.ecrc
View file

@ -1,4 +1,5 @@
{
"Exclude": ["^\\.gitmodules$"],
"Disable": {
"IndentSize": true
}

View file

@ -337,6 +337,7 @@ jobs:
OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.33.0-2024-01-07
VULKAN_VERSION: 1.3.261.1
strategy:
matrix:
@ -353,6 +354,8 @@ jobs:
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
steps:
- name: Clone
@ -361,6 +364,12 @@ jobs:
with:
fetch-depth: 0
- name: Clone Kompute submodule
id: clone_kompute
if: ${{ matrix.build == 'kompute' }}
run: |
git submodule update --init kompute
- name: Download OpenCL SDK
id: get_opencl
if: ${{ matrix.build == 'clblast' }}
@ -395,6 +404,15 @@ jobs:
$lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
& $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
# Only runs for the 'kompute' matrix entry. Downloads the LunarG Vulkan SDK installer for the
# pinned VULKAN_VERSION into RUNNER_TEMP, runs it with the non-interactive flags shown below,
# then exports VULKAN_SDK and appends the SDK bin directory to PATH for the following steps.
# NOTE(review): assumes the installer's target directory is C:\VulkanSDK\<version> — confirm.
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'kompute' }}
run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
- name: Build
id: cmake_build
run: |
@ -432,7 +450,8 @@ jobs:
- name: Test
id: cmake_test
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
# not all machines have native AVX-512
if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900

3
.gitmodules vendored Normal file
View file

@ -0,0 +1,3 @@
[submodule "kompute"]
path = kompute
url = https://github.com/nomic-ai/kompute.git

View file

@ -103,6 +103,7 @@ option(LLAMA_VULKAN "llama: use Vulkan"
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
@ -422,7 +423,13 @@ if (LLAMA_VULKAN)
if (Vulkan_FOUND)
message(STATUS "Vulkan found")
set(GGML_HEADERS_VULKAN ggml-vulkan.h)
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
if (BUILD_SHARED_LIBS)
set_target_properties(ggml-vulkan PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
add_compile_definitions(GGML_USE_VULKAN)
@ -478,7 +485,6 @@ if (LLAMA_HIPBLAS)
endif()
endif()
if (LLAMA_SYCL)
if ( NOT DEFINED ENV{ONEAPI_ROOT})
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
@ -508,6 +514,160 @@ if (LLAMA_SYCL)
endif()
endif()
if (LLAMA_KOMPUTE)
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
find_package(Vulkan COMPONENTS glslc REQUIRED)
# Locate the GLSL -> SPIR-V compiler used by compile_shader().
# find_program(HINTS ...) expects directories, so the imported target name
# Vulkan::glslc is not a usable hint. FindVulkan reports the compiler it found
# in Vulkan_GLSLC_EXECUTABLE; hint with that binary's directory and fall back
# to the default search (PATH etc.) when the variable is not set.
set(glslc_hint_dirs)
if (Vulkan_GLSLC_EXECUTABLE)
    get_filename_component(glslc_hint_dir "${Vulkan_GLSLC_EXECUTABLE}" DIRECTORY)
    list(APPEND glslc_hint_dirs "${glslc_hint_dir}")
endif()
find_program(glslc_executable NAMES glslc HINTS ${glslc_hint_dirs})
if (NOT glslc_executable)
    message(FATAL_ERROR "glslc not found")
endif()
# compile_shader(SOURCES <shader.comp>...)
#
# For each listed GLSL compute shader (path relative to the current source dir)
# two build rules are registered:
#   1. glslc compiles the shader into <file>.spv
#   2. xxd -i dumps that SPIR-V file into shader<name>.h, wrapped in an include
#      guard and the kp::shader_data namespace
# Every shader also depends on the shared include files listed in DEPENDS.
function(compile_shader)
    cmake_parse_arguments(compile_shader "" "" "SOURCES" ${ARGN})

    # The two generator cases differ only in where the xxd binary is expected:
    # Visual Studio uses a per-configuration subdirectory.
    if(CMAKE_GENERATOR MATCHES "Visual Studio")
        set(xxd_tool "${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd")
    else()
        set(xxd_tool "${CMAKE_BINARY_DIR}/bin/xxd")
    endif()

    foreach(shader_path ${compile_shader_SOURCES})
        # --- step 1: GLSL -> SPIR-V ---
        get_filename_component(shader_file ${shader_path} NAME)
        set(spirv_file ${shader_file}.spv)
        add_custom_command(
            OUTPUT ${spirv_file}
            COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spirv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${shader_path}
            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${shader_path}
                    ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
                    ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
                    ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
                    ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
            COMMENT "Compiling ${shader_path} to ${spirv_file}"
        )

        # --- step 2: SPIR-V -> C header ---
        # e.g. op_scale.comp.spv -> shaderop_scale.h guarded by SHADEROP_SCALE_H
        set(prefixed_name "shader${spirv_file}")
        string(REPLACE ".comp.spv" ".h" header_file ${prefixed_name})
        string(TOUPPER ${header_file} include_guard)
        string(REPLACE "." "_" include_guard "${include_guard}")
        message(STATUS "${header_file} generating ${include_guard}")

        add_custom_command(
            OUTPUT ${header_file}
            COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${header_file}
            COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${include_guard}\" >> ${header_file}
            COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${include_guard}\" >> ${header_file}
            COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${header_file}
            COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${header_file}
            COMMAND ${xxd_tool} -i ${spirv_file} >> ${header_file}
            COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${header_file}
            COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${include_guard}\" >> ${header_file}
            DEPENDS ${spirv_file} xxd
            COMMENT "Converting to hpp: ${prefixed_name} ${xxd_tool}"
        )
    endforeach()
endfunction()
if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
    message(STATUS "Kompute found")
    set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
    add_subdirectory(kompute)

    # Base names of the compute shaders. Both the shader sources
    # (kompute-shaders/<name>.comp) and the generated headers (shader<name>.h)
    # are derived from this single list so the two cannot drift apart.
    set(kompute_shader_names
        op_scale
        op_scale_8
        op_add
        op_addrow
        op_mul
        op_silu
        op_relu
        op_gelu
        op_softmax
        op_norm
        op_rmsnorm
        op_diagmask
        op_mul_mat_mat_f32
        op_mul_mat_f16
        op_mul_mat_q8_0
        op_mul_mat_q4_0
        op_mul_mat_q4_1
        op_mul_mat_q6_k
        op_getrows_f16
        op_getrows_q4_0
        op_getrows_q4_1
        op_getrows_q6_k
        op_rope_f16
        op_rope_f32
        op_cpy_f16_f16
        op_cpy_f16_f32
        op_cpy_f32_f16
        op_cpy_f32_f32
    )
    set(kompute_shader_sources)
    set(kompute_shader_headers)
    foreach(kompute_shader_name ${kompute_shader_names})
        list(APPEND kompute_shader_sources kompute-shaders/${kompute_shader_name}.comp)
        list(APPEND kompute_shader_headers shader${kompute_shader_name}.h)
    endforeach()

    # Register the per-shader build rules, then group all generated headers
    # under one target.
    compile_shader(SOURCES ${kompute_shader_sources})
    add_custom_target(generated_shaders DEPENDS ${kompute_shader_headers})

    # The stamp file is produced only after generated_shaders; listing it among
    # the ggml sources/headers ties ggml-kompute.cpp to the shader generation.
    set(kompute_stamp_file ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
    add_custom_command(
        OUTPUT ${kompute_stamp_file}
        COMMAND ${CMAKE_COMMAND} -E touch ${kompute_stamp_file}
        DEPENDS generated_shaders
        COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
    )
    set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${kompute_stamp_file})
    set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${kompute_stamp_file})

    add_compile_definitions(GGML_USE_KOMPUTE)
    list(APPEND LLAMA_EXTRA_LIBS kompute)
    list(APPEND LLAMA_EXTRA_INCLUDES ${CMAKE_BINARY_DIR})
else()
    message(WARNING "Kompute not found")
endif()
endif()
function(get_flags CCID CCVER)
set(C_FLAGS "")
set(CXX_FLAGS "")
@ -850,12 +1010,14 @@ add_library(ggml OBJECT
ggml-backend.h
ggml-quants.c
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
)
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
@ -932,7 +1094,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" "${GGML_HEADERS_VULKAN}"
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")

View file

@ -334,7 +334,10 @@ class Params:
class BpeVocab:
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
self.vocab = self.bpe_tokenizer["model"]["vocab"]
try:
self.vocab = self.bpe_tokenizer["model"]["vocab"]
except KeyError:
self.vocab = self.bpe_tokenizer
added_tokens: dict[str, int]
if fname_added_tokens is not None:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.

View file

@ -4,34 +4,35 @@ This example demonstrates a simple HTTP API server and a simple web front end to
Command line options:
- `--threads N`, `-t N`: Set the number of threads to use during generation.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--numa`: Attempt optimizations that help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
- `--port`: Set the port to listen. Default: `8080`.
- `--path`: path from which to serve static files (default examples/server/public)
- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
- `--embedding`: Enable embedding extraction, Default: disabled.
- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
- `--threads N`, `-t N`: Set the number of threads to use during generation.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--numa`: Attempt optimizations that help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
- `--port`: Set the port to listen. Default: `8080`.
- `--path`: path from which to serve static files (default examples/server/public)
- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s.
- `--embedding`: Enable embedding extraction, Default: disabled.
- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
- `-spf FNAME`, `--system-prompt-file FNAME`: Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
## Build
The server is built alongside everything else from the root of the project.
@ -52,21 +53,23 @@ server is build alongside everything else from the root of the project
To get started right away, run the following command, making sure to use the correct path for the model you have:
### Unix-based systems (Linux, macOS, etc.):
### Unix-based systems (Linux, macOS, etc.)
```bash
./server -m models/7B/ggml-model.gguf -c 2048
```
### Windows:
### Windows
```powershell
server.exe -m models\7B\ggml-model.gguf -c 2048
```
The above command will start a server that by default listens on `127.0.0.1:8080`.
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
### Docker:
### Docker
```bash
docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
@ -120,12 +123,13 @@ node index.js
```
## API Endpoints
- **GET** `/health`: Returns the current state of the server:
- `{"status": "loading model"}` if the model is still being loaded.
- `{"status": "error"}` if the model failed to load.
- `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
- **GET** `/health`: Returns the current state of the server:
- `{"status": "loading model"}` if the model is still being loaded.
- `{"status": "error"}` if the model failed to load.
- `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
*Options:*
@ -189,14 +193,13 @@ node index.js
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
### Result JSON:
* Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
### Result JSON
- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
```
```json
{
"content": "<the token selected by the model>",
"probs": [
@ -212,6 +215,7 @@ node index.js
]
},
```
Notice that each `probs` is an array of length `n_probs`.
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
@ -228,7 +232,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
- **POST** `/tokenize`: Tokenize a given text.
- **POST** `/tokenize`: Tokenize a given text.
*Options:*
@ -236,13 +240,13 @@ Notice that each `probs` is an array of length `n_probs`.
Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
- **POST** `/detokenize`: Convert tokens to text.
- **POST** `/detokenize`: Convert tokens to text.
*Options:*
`tokens`: Set the tokens to detokenize.
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
*Options:*
@ -250,7 +254,7 @@ Notice that each `probs` is an array of length `n_probs`.
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
*Options:*
@ -260,9 +264,9 @@ Notice that each `probs` is an array of length `n_probs`.
It also accepts all the options of `/completion` except `stream` and `prompt`.
- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.
*Options:*
@ -290,6 +294,7 @@ Notice that each `probs` is an array of length `n_probs`.
print(completion.choices[0].message)
```
... or raw HTTP requests:
```shell
@ -311,6 +316,40 @@ Notice that each `probs` is an array of length `n_probs`.
}'
```
- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API.
*Options:*
See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
*Examples:*
- `input` as string
```shell
curl http://localhost:8080/v1/embeddings \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"input": "hello",
"model":"GPT-4",
"encoding_format": "float"
}'
```
- `input` as string array
```shell
curl http://localhost:8080/v1/embeddings \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"input": ["hello", "world"],
"model":"GPT-4",
"encoding_format": "float"
}'
```
## More examples
### Change system prompt on runtime
@ -362,6 +401,7 @@ python api_like_OAI.py
```
After running the API server, you can use it in Python by setting the API base URL.
```python
openai.api_base = "http://<Your api-server IP>:port"
```

View file

@ -206,3 +206,18 @@ inline static std::vector<json> format_partial_response_oaicompat(const task_res
return std::vector<json>({ret});
}
// Builds the OpenAI-style response envelope for the embeddings endpoint.
//   request    - parsed request body; its "model" entry is looked up through
//                json_value with DEFAULT_OAICOMPAT_MODEL as the fallback
//   embeddings - value stored unchanged under "data"
// Both usage counters are always reported as 0.
inline static json format_embeddings_response_oaicompat(const json &request, const json &embeddings)
{
    json usage;
    usage["prompt_tokens"] = 0;
    usage["total_tokens"]  = 0;

    json response;
    response["model"]  = json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
    response["object"] = "list";
    response["usage"]  = usage;
    response["data"]   = embeddings;
    return response;
}

View file

@ -2929,6 +2929,66 @@ int main(int argc, char **argv)
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
});
// OpenAI-compatible embeddings endpoint.
// "input" may be an array (one embedding task per element, processed one after
// another) or any other JSON value (a single task). A missing "input" is
// treated as an empty string prompt.
svr.Post("/v1/embeddings", [&llama](const httplib::Request &req, httplib::Response &res)
        {
            res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
            const json body = json::parse(req.body);

            // Queues one embedding request, receives its result and wraps the
            // vector as an "embedding" list item carrying the given index.
            const auto embed_one = [&llama](const json &text, int index) -> json
            {
                const int task_id = llama.queue_tasks.get_new_id();
                llama.queue_results.add_waiting_task_id(task_id);
                llama.request_completion(task_id, { {"prompt", text}, { "n_predict", 0} }, false, true, -1);

                // get the result
                task_result result = llama.queue_results.recv(task_id);
                llama.queue_results.remove_waiting_task_id(task_id);

                return json{
                    {"embedding", json_value(result.result_json, "embedding", json::array())},
                    {"index", index},
                    {"object", "embedding"}
                };
            };

            json data = json::array();
            const bool has_input = body.count("input") != 0;
            if (has_input && body["input"].is_array())
            {
                // batch: indices follow the order of the input array
                int index = 0;
                for (const json &elem : body["input"])
                {
                    data.push_back(embed_one(elem, index++));
                }
            }
            else
            {
                const json prompt = has_input ? body["input"] : json("");
                data.push_back(embed_one(prompt, 0));
            }

            // send the result
            const json root = format_embeddings_response_oaicompat(body, data);
            return res.set_content(root.dump(), "application/json; charset=utf-8");
        });
// GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!?
// "Bus error: 10" - this is on macOS, it does not crash on Linux
//std::thread t2([&]()

View file

@ -791,7 +791,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free(*buffers[i]);
}
free(buffers);
free(*buffers);
return false;
}

View file

@ -373,6 +373,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
extern GGML_CALL int ggml_backend_vk_reg_devices(void);
ggml_backend_vk_reg_devices();
#endif
#ifdef GGML_USE_KOMPUTE
extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
ggml_backend_kompute_reg_devices();
#endif
}
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {

1990
ggml-kompute.cpp Normal file

File diff suppressed because it is too large Load diff

46
ggml-kompute.h Normal file
View file

@ -0,0 +1,46 @@
// Public interface of the Kompute backend: device enumeration helpers plus the
// ggml backend entry points. The declarations are wrapped in extern "C" when
// the header is included from C++.
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
// Description of a single device.
// NOTE(review): apart from `type`, the field meanings below are inferred from
// their names only — confirm against ggml-kompute.cpp.
struct ggml_vk_device {
int index; // presumably the device's position in the enumeration — TODO confirm
int type; // same as VkPhysicalDeviceType
size_t heapSize; // presumably a memory heap size in bytes — TODO confirm
const char * name;
const char * vendor;
int subgroupSize;
uint64_t bufferAlignment;
uint64_t maxAlloc;
};
// Returns a pointer to device descriptions and writes a count through `count`.
// NOTE(review): ownership of the returned memory and the exact meaning of
// `memoryRequired` are not visible in this header — confirm before relying on them.
struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
// Fills `device` and reports success through the return value.
// NOTE(review): presumably selects a device matching `name` with at least
// `memoryRequired` memory — confirm.
bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
bool ggml_vk_has_vulkan(void);
bool ggml_vk_has_device(void);
struct ggml_vk_device ggml_vk_current_device(void);
//
// backend API
//
// forward declaration
typedef struct ggml_backend * ggml_backend_t;
// `device` is an integer device selector.
// NOTE(review): presumably matches ggml_vk_device::index — confirm.
GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
#ifdef __cplusplus
}
#endif

View file

@ -2375,6 +2375,16 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backen
UNUSED(buft);
}
// Buffer-type callback: reports the largest buffer the Metal device allows.
// The value is read from the device's maxBufferLength property between a
// ggml_backend_metal_get_device() / ggml_backend_metal_free_device() pair.
GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
    id<MTLDevice> mtl_device = ggml_backend_metal_get_device();
    const size_t limit = [mtl_device maxBufferLength];
    ggml_backend_metal_free_device();

    return limit;

    UNUSED(buft);
}
GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
@ -2393,7 +2403,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
/* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,

View file

@ -2125,6 +2125,15 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
GGML_UNUSED(buffer_type);
}
// Buffer-type callback: reports the largest single allocation the OpenCL
// device supports (CL_DEVICE_MAX_MEM_ALLOC_SIZE).
//
// The value is queried once and cached in a function-local static. While the
// cache holds (size_t)-1 the query is retried on the next call; that same
// value is what callers see if the query fails.
static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
    static size_t max_size = -1;
    if (max_size == (size_t)-1) {
        ggml_cl_init();

        // CL_DEVICE_MAX_MEM_ALLOC_SIZE is specified as cl_ulong. Querying it
        // into a size_t fails with CL_INVALID_VALUE when size_t is narrower
        // than 64 bits, so read it with the documented type and narrow after.
        cl_ulong max_alloc = 0;
        const cl_int err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_alloc), &max_alloc, NULL);
        if (err == CL_SUCCESS) {
            // clamp instead of truncating when the device limit exceeds size_t
            max_size = max_alloc > (cl_ulong)(size_t)-1 ? (size_t)-1 : (size_t)max_alloc;
        }
    }
    return max_size;

    GGML_UNUSED(buffer_type);
}
static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
//return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
return ggml_backend_is_cpu(backend);
@ -2136,7 +2145,7 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // TODO: return from device info
/* .get_max_size = */ ggml_backend_opencl_buffer_type_get_max_size,
/* .get_alloc_size = */ NULL,
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
/* .is_host = */ NULL,

1
kompute Submodule

@ -0,0 +1 @@
Subproject commit 4565194ed7c32d1d2efa32ceab4d3c6cae006306

102
kompute-shaders/common.comp Normal file
View file

@ -0,0 +1,102 @@
#extension GL_EXT_shader_16bit_storage: require
#extension GL_EXT_shader_8bit_storage: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
#extension GL_EXT_control_flow_attributes: enable
#extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_EXT_debug_printf : enable
#define QK4_0 32
#define QK4_1 32
#define GELU_COEF_A 0.044715
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
#define TWOPI_F 6.283185307179586f
#define QK_K 256
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
#define sizeof_block_q4_0 0x12
struct block_q4_0 {
float16_t d;
uint8_t qs[QK4_0 / 2];
};
// Expand 16 values of a q4_0 block into a 4x4 float matrix.
// il == 0 decodes the low nibble of every quant byte, il != 0 the high nibble.
// Each decoded value is d * (nibble - 8).
mat4 dequantize_q4_0(const block_q4_0 xb, uint il) {
    const bool use_high_nibble = il != 0;

    // Nibbles are masked in place (not shifted down), so the scale absorbs the
    // bit position: /16 for a high nibble, a further /256 for the upper byte
    // of each 16-bit pair.
    const float scale_lo = use_high_nibble ? (xb.d / 16.f) : xb.d;
    const float scale_hi = scale_lo / 256.f;
    const float bias     = -8.f * xb.d;

    const uint16_t mask_lo = use_high_nibble ? uint16_t(0x00F0) : uint16_t(0x000F);
    const uint16_t mask_hi = mask_lo << 8;

    mat4 reg;
    for (int pair = 0; pair < 8; pair++) {
        // two consecutive quant bytes packed little-endian into 16 bits
        const uint16_t packed = (uint16_t(xb.qs[2 * pair + 1]) << 8) | uint16_t(xb.qs[2 * pair]);
        const int c = pair / 2;
        const int r = 2 * (pair % 2);
        reg[c][r + 0] = scale_lo * (packed & mask_lo) + bias;
        reg[c][r + 1] = scale_hi * (packed & mask_hi) + bias;
    }
    return reg;
}
#define sizeof_block_q4_1 0x14
struct block_q4_1 {
float16_t d;
float16_t m;
uint8_t qs[QK4_1 / 2];
};
// Expand 16 values of a q4_1 block into a 4x4 float matrix.
// il == 0 decodes the low nibble of every quant byte, il != 0 the high nibble.
// Each decoded value is nibble * d + m.
mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
    const bool use_high_nibble = il != 0;

    // Nibbles are masked in place (not shifted down), so the scale absorbs the
    // bit position: /16 for a high nibble, a further /256 for the upper byte
    // of each 16-bit pair.
    const float scale_lo = use_high_nibble ? (xb.d / 16.f) : xb.d;
    const float scale_hi = scale_lo / 256.f;
    const float offset   = xb.m;

    const uint16_t mask_lo = use_high_nibble ? uint16_t(0x00F0) : uint16_t(0x000F);
    const uint16_t mask_hi = mask_lo << 8;

    mat4 reg;
    for (int pair = 0; pair < 8; pair++) {
        // two consecutive quant bytes packed little-endian into 16 bits
        const uint16_t packed = (uint16_t(xb.qs[2 * pair + 1]) << 8) | uint16_t(xb.qs[2 * pair]);
        const int c = pair / 2;
        const int r = 2 * (pair % 2);
        reg[c][r + 0] = ((packed & mask_lo) * scale_lo) + offset;
        reg[c][r + 1] = ((packed & mask_hi) * scale_hi) + offset;
    }
    return reg;
}
#define sizeof_block_q6_k 210
struct block_q6_k {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
float16_t d; // super-block scale
};
// Expand one 16-value slice of a q6_k super-block into a 4x4 float matrix.
// `il` selects the slice; the get-rows shader in this change set calls it with
// il in [0, 16) (NL = 16). Each output is d * scale * (q6 - 32), where q6 is
// the 6-bit quant assembled from 4 bits of `ql` and 2 bits of `qh`.
mat4 dequantize_q6_k(const block_q6_k xb, uint il) {
const float16_t d_all = xb.d;
// byte offsets of this slice inside ql[] and qh[]
const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
const uint qhIndex = 32*(il/8) + 16*(il&1);
// (il%2) + 2*(il/2) == il for unsigned il: one scale per 16-value slice
float16_t sc = xb.scales[(il%2) + 2 * ((il/2))];
// from here on `il` is reused as a 2-bit selector (0..3) for the bit fields
il = (il/2) & 3;
// kmask1 picks the 2-bit field of qh: 0x03, 0x0C, 0x30, 0xC0 for il = 0..3
const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3);
// kmask2 picks the low (il = 0,1) or high (il = 2,3) nibble of ql
const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F);
// the high nibble is used unshifted (value * 16), so scale it back by 1/16
const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f);
// ml is the "- 32" bias pre-multiplied by the scales; dl is the per-quant scale
const float16_t ml = float16_t(d_all * sc * 32.f);
const float16_t dl = float16_t(d_all * sc * coef);
mat4 reg;
for (int i = 0; i < 16; ++i) {
// shift the selected qh field so it lands directly above the selected ql nibble
const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2))
: ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4));
reg[i/4][i%4] = dl * q - ml;
}
return reg;
}
#define QK8_0 32
// struct block_q8_0 {
// float16_t d; // delta
// int8_t qs[QK8_0]; // quants
// };
#define sizeof_block_q8_0 34

View file

@ -0,0 +1,58 @@
#version 450
#include "common.comp"
layout(local_size_x = 1024) in;
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inAOff;
uint inBOff;
uint outOff;
int ne00;
int nb00;
int nb01;
int nb02;
int nb03;
int ne10;
int ne11;
int ne12;
int ne13;
int nb10;
int nb11;
int nb12;
int nb13;
int ne0;
int nb0;
int nb1;
int nb2;
int nb3;
//int offs; // TODO: needed for GGML_OP_ACC, see metal code
} pcs;
// general-purpose kernel for addition of two tensors
// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
// cons: not very efficient
void main() {
    // one workgroup per row: (x, y, z) = (i01, i02, i03)
    const uvec3 wg = gl_WorkGroupID;

    // src1 coordinates wrap around its extents, which is what provides the
    // broadcast across dims 1, 2 and 3
    const uint b3 = wg.z % pcs.ne13;
    const uint b2 = wg.y % pcs.ne12;
    const uint b1 = wg.x % pcs.ne11;

    const int offs = 0; // TMP (see above)

    // byte strides -> float element offsets, with the buffer offsets folded in
    const uint a_row = pcs.inAOff + uint((wg.z*pcs.nb03 + wg.y*pcs.nb02 + wg.x*pcs.nb01 + offs) / 4);
    const uint b_row = pcs.inBOff + uint((b3*pcs.nb13 + b2*pcs.nb12 + b1*pcs.nb11) / 4);
    const uint d_row = pcs.outOff + uint((wg.z*pcs.nb3 + wg.y*pcs.nb2 + wg.x*pcs.nb1 + offs) / 4);

    // the invocations of the workgroup stride over the row
    for (uint col = gl_LocalInvocationID.x; col < pcs.ne0; col += gl_WorkGroupSize.x) {
        const uint b_col = col % pcs.ne10;
        out_[d_row + col] = inA[a_row + col] + inB[b_row + b_col];
    }
}

View file

@ -0,0 +1,25 @@
#version 450
#include "common.comp"
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inAOff;
uint inBOff;
uint outOff;
uint row;
} pcs;
void main() {
    // each workgroup adds 4 consecutive elements; inB is indexed modulo
    // pcs.row, i.e. the same row of inB is re-used for every row of inA
    const uint first = gl_WorkGroupID.x * 4;

    for (uint k = 0; k < 4; ++k) {
        const uint elem  = first + k;
        const uint b_idx = (elem % pcs.row) + pcs.inBOff;
        out_[elem + pcs.outOff] = inA[elem + pcs.inAOff] + inB[b_idx];
    }
}

View file

@ -0,0 +1,52 @@
#version 450
#include "common.comp"
#define IN_TYPE float16_t
#define IN_TYPE_SIZE 2
#define OUT_TYPE float16_t
#define OUT_TYPE_SIZE 2
layout(local_size_x = 1024) in;
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
layout (push_constant) uniform parameter {
uint inOff;
uint outOff;
int ne00;
int ne01;
int ne02;
uint nb00;
uint nb01;
uint nb02;
uint nb03;
int ne0;
int ne1;
int ne2;
uint nb0;
uint nb1;
uint nb2;
uint nb3;
} pcs;
void main() {
    // workgroup (x, y, z) addresses source row (i01, i02, i03)
    const uvec3 wg = gl_WorkGroupID;

    // flat element index of the first element of this source row
    const int n = int(wg.z)*pcs.ne02*pcs.ne01*pcs.ne00 + int(wg.y)*pcs.ne01*pcs.ne00 + int(wg.x)*pcs.ne00;

    // split the flat index into destination coordinates (i0, i1, i2, i3)
    const int plane  = pcs.ne1*pcs.ne0;
    const int volume = pcs.ne2*pcs.ne1*pcs.ne0;

    const int i3   = n / volume;
    const int rem3 = n - i3*volume;
    const int i2   = rem3 / plane;
    const int rem2 = rem3 - i2*plane;
    const int i1   = rem2 / pcs.ne0;
    const int i0   = rem2 - i1*pcs.ne0;

    const uint dst_base = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_

    // byte offset of the source row; the per-element term is added in the loop
    const uint src_row_bytes = wg.z*pcs.nb03 + wg.y*pcs.nb02 + wg.x*pcs.nb01;

    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        const uint src = uint((src_row_bytes + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
        out_[dst_base + i00] = OUT_TYPE(in_[src]);
    }
}

View file

@ -0,0 +1,52 @@
#version 450
#include "common.comp"
#define IN_TYPE float16_t
#define IN_TYPE_SIZE 2
#define OUT_TYPE float
#define OUT_TYPE_SIZE 4
layout(local_size_x = 1024) in;
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
layout (push_constant) uniform parameter {
uint inOff;
uint outOff;
int ne00;
int ne01;
int ne02;
uint nb00;
uint nb01;
uint nb02;
uint nb03;
int ne0;
int ne1;
int ne2;
uint nb0;
uint nb1;
uint nb2;
uint nb3;
} pcs;
void main() {
    // workgroup (x, y, z) addresses source row (i01, i02, i03)
    const uvec3 wg = gl_WorkGroupID;

    // flat element index of the first element of this source row
    const int n = int(wg.z)*pcs.ne02*pcs.ne01*pcs.ne00 + int(wg.y)*pcs.ne01*pcs.ne00 + int(wg.x)*pcs.ne00;

    // split the flat index into destination coordinates (i0, i1, i2, i3)
    const int plane  = pcs.ne1*pcs.ne0;
    const int volume = pcs.ne2*pcs.ne1*pcs.ne0;

    const int i3   = n / volume;
    const int rem3 = n - i3*volume;
    const int i2   = rem3 / plane;
    const int rem2 = rem3 - i2*plane;
    const int i1   = rem2 / pcs.ne0;
    const int i0   = rem2 - i1*pcs.ne0;

    const uint dst_base = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_

    // byte offset of the source row; the per-element term is added in the loop
    const uint src_row_bytes = wg.z*pcs.nb03 + wg.y*pcs.nb02 + wg.x*pcs.nb01;

    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        const uint src = uint((src_row_bytes + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
        out_[dst_base + i00] = OUT_TYPE(in_[src]);
    }
}

View file

@ -0,0 +1,52 @@
#version 450
#include "common.comp"
#define IN_TYPE float
#define IN_TYPE_SIZE 4
#define OUT_TYPE float16_t
#define OUT_TYPE_SIZE 2
layout(local_size_x = 1024) in;
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
layout (push_constant) uniform parameter {
uint inOff;
uint outOff;
int ne00;
int ne01;
int ne02;
uint nb00;
uint nb01;
uint nb02;
uint nb03;
int ne0;
int ne1;
int ne2;
uint nb0;
uint nb1;
uint nb2;
uint nb3;
} pcs;
void main() {
    // workgroup (x, y, z) addresses source row (i01, i02, i03)
    const uvec3 wg = gl_WorkGroupID;

    // flat element index of the first element of this source row
    const int n = int(wg.z)*pcs.ne02*pcs.ne01*pcs.ne00 + int(wg.y)*pcs.ne01*pcs.ne00 + int(wg.x)*pcs.ne00;

    // split the flat index into destination coordinates (i0, i1, i2, i3)
    const int plane  = pcs.ne1*pcs.ne0;
    const int volume = pcs.ne2*pcs.ne1*pcs.ne0;

    const int i3   = n / volume;
    const int rem3 = n - i3*volume;
    const int i2   = rem3 / plane;
    const int rem2 = rem3 - i2*plane;
    const int i1   = rem2 / pcs.ne0;
    const int i0   = rem2 - i1*pcs.ne0;

    const uint dst_base = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_

    // byte offset of the source row; the per-element term is added in the loop
    const uint src_row_bytes = wg.z*pcs.nb03 + wg.y*pcs.nb02 + wg.x*pcs.nb01;

    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        const uint src = uint((src_row_bytes + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
        out_[dst_base + i00] = OUT_TYPE(in_[src]);
    }
}

View file

@ -0,0 +1,52 @@
#version 450
#include "common.comp"
#define IN_TYPE float
#define IN_TYPE_SIZE 4
#define OUT_TYPE float
#define OUT_TYPE_SIZE 4
layout(local_size_x = 1024) in;
layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
layout (push_constant) uniform parameter {
uint inOff;
uint outOff;
int ne00;
int ne01;
int ne02;
uint nb00;
uint nb01;
uint nb02;
uint nb03;
int ne0;
int ne1;
int ne2;
uint nb0;
uint nb1;
uint nb2;
uint nb3;
} pcs;
void main() {
    // workgroup (x, y, z) addresses source row (i01, i02, i03)
    const uvec3 wg = gl_WorkGroupID;

    // flat element index of the first element of this source row
    const int n = int(wg.z)*pcs.ne02*pcs.ne01*pcs.ne00 + int(wg.y)*pcs.ne01*pcs.ne00 + int(wg.x)*pcs.ne00;

    // split the flat index into destination coordinates (i0, i1, i2, i3)
    const int plane  = pcs.ne1*pcs.ne0;
    const int volume = pcs.ne2*pcs.ne1*pcs.ne0;

    const int i3   = n / volume;
    const int rem3 = n - i3*volume;
    const int i2   = rem3 / plane;
    const int rem2 = rem3 - i2*plane;
    const int i1   = rem2 / pcs.ne0;
    const int i0   = rem2 - i1*pcs.ne0;

    const uint dst_base = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_

    // byte offset of the source row; the per-element term is added in the loop
    const uint src_row_bytes = wg.z*pcs.nb03 + wg.y*pcs.nb02 + wg.x*pcs.nb01;

    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        const uint src = uint((src_row_bytes + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
        out_[dst_base + i00] = OUT_TYPE(in_[src]);
    }
}

View file

@ -0,0 +1,30 @@
#version 450
#include "common.comp"
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
uint n_past;
int ne00;
int ne01;
} pcs;
void main() {
    // one workgroup per element: (x, y, z) = (column, row, plane)
    const uvec3 pos  = gl_WorkGroupID;
    const uint  flat = pos.z*pcs.ne01*pcs.ne00 + pos.y*pcs.ne00 + pos.x;

    // columns beyond n_past + row are masked out with -infinity (0xFF800000);
    // every other element is copied through unchanged
    const bool masked = pos.x > pcs.n_past + pos.y;
    out_[flat + pcs.outOff] = masked ? uintBitsToFloat(0xFF800000) : in_[flat + pcs.inOff];
}

View file

@ -0,0 +1,22 @@
#version 450
#include "common.comp"
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
} pcs;
void main() {
    // each workgroup processes 8 consecutive elements
    const uint first = gl_WorkGroupID.x * 8;

    for (uint k = 0; k < 8; ++k) {
        const uint  elem = first + k;
        const float v    = in_[elem + pcs.inOff];

        // tanh approximation of GELU; the tanh argument is clamped to [-15, 15]
        const float t = clamp(SQRT_2_OVER_PI*v*(1.0 + GELU_COEF_A*v*v), -15.0, 15.0);
        out_[elem + pcs.outOff] = 0.5*v*(1.0 + tanh(t));
    }
}

View file

@ -0,0 +1,17 @@
// Shared body of the quantized get-rows shaders. The including file must
// define NL, SIZE_OF_BLOCK, BYTES_FOR_TYPE, the buffers/push constants and
// dequantize_block(uint index, uint il) before including this file.
//
// Workgroup i copies row inB[i] of inA into row i of out_, dequantizing 16
// values (one mat4) per loop iteration.
void main() {
    const uint i = gl_WorkGroupID.x;
    const int r = inB[i + pcs.inBOff];

    // first output element of destination row i
    const uint rowBase = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff;

    for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) {
        const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK;
        const mat4 result = dequantize_block(inIndex, ind%NL);

        // Derive the output position from `ind` rather than from a running
        // per-invocation counter: a counter only matches the data when the
        // workgroup has a single invocation, whereas this stays correct for
        // any local size (and is identical for local_size_x = 1).
        const uint chunkBase = rowBase + ind * 16;
        for (uint j = 0; j < 4; ++j) {
            for (uint k = 0; k < 4; ++k) {
                out_[chunkBase + j * 4 + k] = result[j][k];
            }
        }
    }
}

View file

@ -0,0 +1,31 @@
#version 450
#include "common.comp"
layout(local_size_x = 1) in;
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
layout (push_constant) uniform parameter {
uint inAOff;
uint inBOff;
uint outOff;
int ne00;
int nb01;
int nb1;
} pcs;
// Copy k half-precision values starting at inA[x] to out_[y..], widening each
// one to float.
void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
    for (int n = 0; n < k; ++n) {
        const float16_t h = inA[x + n];
        out_[y + n] = h;
    }
}
void main() {
    // workgroup `row_out` fetches the source row whose index is stored in inB
    const uint row_out = gl_WorkGroupID.x;
    const int  row_in  = inB[row_out + pcs.inBOff];

    // byte strides -> element offsets (2 bytes per float16, 4 bytes per float)
    const uint src = row_in*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff;
    const uint dst = row_out*pcs.nb1/4 + pcs.outOff;

    dequantize_row_f16(src, dst, pcs.ne00);
}

View file

@ -0,0 +1,38 @@
#version 450
#include "common.comp"
#define NL 2
#define BYTES_FOR_TYPE 4 /*bytes for float*/
#define SIZE_OF_BLOCK sizeof_block_q4_0
layout(local_size_x = 1) in;
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
layout (push_constant) uniform parameter {
uint inAOff;
uint inBOff;
uint outOff;
int ne00;
int nb01;
int nb1;
} pcs;
// Assemble a block_q4_0 from the raw bytes at inA[index..]:
// bytes 0..1 hold the fp16 scale, followed by QK4_0/2 bytes of packed quants.
block_q4_0 get_unaligned_block_q4_0(uint index) {
    const uint qs_base = index + 2;

    block_q4_0 blk;
    blk.d = u8BufToFloat16(inA, index);
    [[unroll]] for (uint n = 0; n < QK4_0 / 2; ++n) {
        blk.qs[n] = inA[qs_base + n];
    }
    return blk;
}
// Hook used by op_getrows.comp: load the q4_0 block at byte offset `index`
// and dequantize its slice `il`.
mat4 dequantize_block(uint index, uint il) {
    return dequantize_q4_0(get_unaligned_block_q4_0(index), il);
}
#include "op_getrows.comp"

View file

@ -0,0 +1,39 @@
#version 450
#include "common.comp"
#define NL 2
#define BYTES_FOR_TYPE 4 /*bytes for float*/
#define SIZE_OF_BLOCK sizeof_block_q4_1
layout(local_size_x = 1) in;
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
layout (push_constant) uniform parameter {
uint inAOff;
uint inBOff;
uint outOff;
int ne00;
int nb01;
int nb1;
} pcs;
// Assemble a block_q4_1 from the raw bytes at inA[index..]:
// bytes 0..1 hold fp16 d, bytes 2..3 fp16 m, then QK4_1/2 bytes of packed quants.
block_q4_1 get_unaligned_block_q4_1(uint index) {
    const uint qs_base = index + 4;

    block_q4_1 blk;
    blk.d = u8BufToFloat16(inA, index);
    blk.m = u8BufToFloat16(inA, index+2);
    [[unroll]] for (uint n = 0; n < QK4_1 / 2; ++n) {
        blk.qs[n] = inA[qs_base + n];
    }
    return blk;
}
// Hook used by op_getrows.comp: load the q4_1 block at byte offset `index`
// and dequantize its slice `il`.
mat4 dequantize_block(uint index, uint il) {
    return dequantize_q4_1(get_unaligned_block_q4_1(index), il);
}
#include "op_getrows.comp"

View file

@ -0,0 +1,44 @@
#version 450
#include "common.comp"
#define NL 16
#define BYTES_FOR_TYPE 4 /*bytes for float*/
#define SIZE_OF_BLOCK sizeof_block_q6_k
layout(local_size_x = 1) in;
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { int inB[]; };
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
layout (push_constant) uniform parameter {
uint inAOff;
uint inBOff;
uint outOff;
int ne00;
int nb01;
int nb1;
} pcs;
// Assemble a block_q6_k from the raw bytes at inA[index..]. The fields are
// stored back to back: ql (QK_K/2 bytes), qh (QK_K/4 bytes), scales (QK_K/16
// signed bytes) and finally the fp16 super-block scale d.
block_q6_k get_unaligned_block_q6_k(uint index) {
    const uint qh_base = index + QK_K/2;
    const uint sc_base = index + QK_K/2 + QK_K/4;
    const uint d_base  = index + QK_K/2 + QK_K/4 + QK_K/16;

    block_q6_k blk;
    [[unroll]] for (uint n = 0; n < QK_K / 2; ++n) {
        blk.ql[n] = inA[index + n];
    }
    [[unroll]] for (uint n = 0; n < QK_K / 4; ++n) {
        blk.qh[n] = inA[qh_base + n];
    }
    [[unroll]] for (uint n = 0; n < QK_K / 16; ++n) {
        blk.scales[n] = int8_t(inA[sc_base + n]);
    }
    blk.d = u8BufToFloat16(inA, d_base);
    return blk;
}
// Hook used by op_getrows.comp: load the q6_k block at byte offset `index`
// and dequantize its slice `il`.
mat4 dequantize_block(uint index, uint il) {
    return dequantize_q6_k(get_unaligned_block_q6_k(index), il);
}
#include "op_getrows.comp"

View file

@ -0,0 +1,52 @@
#version 450
#include "common.comp"
layout(local_size_x = 1024) in;
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inAOff;
uint inBOff;
uint outOff;
int ne00;
int nb00;
int nb01;
int nb02;
int nb03;
int ne10;
int ne11;
int ne12;
int ne13;
int nb10;
int nb11;
int nb12;
int nb13;
int ne0;
int nb0;
int nb1;
int nb2;
int nb3;
} pcs;
// Element-wise multiply of two tensors, one workgroup per row of src0/dst.
void main() {
    // (x, y, z) = (i01, i02, i03)
    const uvec3 wg = gl_WorkGroupID;

    // src1 coordinates wrap around its extents, which broadcasts src1 across
    // dims 1, 2 and 3
    const uint b3 = wg.z % pcs.ne13;
    const uint b2 = wg.y % pcs.ne12;
    const uint b1 = wg.x % pcs.ne11;

    // byte strides -> float element offsets, with the buffer offsets folded in
    const uint a_row = pcs.inAOff + uint((wg.z*pcs.nb03 + wg.y*pcs.nb02 + wg.x*pcs.nb01) / 4);
    const uint b_row = pcs.inBOff + uint((b3*pcs.nb13 + b2*pcs.nb12 + b1*pcs.nb11) / 4);
    const uint d_row = pcs.outOff + uint((wg.z*pcs.nb3 + wg.y*pcs.nb2 + wg.x*pcs.nb1) / 4);

    // the invocations of the workgroup stride over the row
    for (uint col = gl_LocalInvocationID.x; col < pcs.ne0; col += gl_WorkGroupSize.x) {
        const uint b_col = col % pcs.ne10;
        out_[d_row + col] = inA[a_row + col] * inB[b_row + b_col];
    }
}

View file

@ -0,0 +1,67 @@
#version 450
#include "common.comp"
#extension GL_KHR_shader_subgroup_arithmetic : require
layout(local_size_x_id = 0) in;
layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
layout (push_constant) uniform parameter {
uint inAOff;
uint inBOff;
uint outOff;
int ne00;
int ne01;
int ne02;
uint nb00;
uint nb01;
uint nb02;
int ne10;
int ne11;
int ne12;
uint nb10;
uint nb11;
uint nb12;
int ne0;
int ne1;
uint r2;
uint r3;
} pcs;
#define N_F16_F32 4
// f16 x f32 matrix-vector product.
// Workgroup (x, y, z) computes, for src0 row r0 = x and batch im = z, the dot
// products with up to N_F16_F32 consecutive src1 rows starting at y*N_F16_F32.
// Each dot product is split across the invocations of a subgroup and combined
// with subgroupAdd().
void main() {
const uint r0 = gl_WorkGroupID.x;
const uint rb = gl_WorkGroupID.y*N_F16_F32;
const uint im = gl_WorkGroupID.z;
// split the batch index into its dim-2 / dim-3 parts
const uint i12 = im%pcs.ne12;
const uint i13 = im/pcs.ne12;
// byte offset of the src0 row; dividing by r2/r3 maps several src1 batches
// onto the same src0 batch
const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb02*pcs.ne02;
const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
for (uint row = 0; row < N_F16_F32; ++row) {
uint r1 = rb + row;
// r1 depends only on workgroup-level values, so every invocation takes this
// exit together and the subgroup operations below see the same participants
if (r1 >= pcs.ne11) {
break;
}
const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // Based from inB
float sumf = 0;
// each invocation accumulates a strided share of the ne00 products
for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
sumf += float(inA[x+i]) * float(inB[y+i]);
}
const float all_sum = subgroupAdd(sumf);
// a single elected invocation writes the reduced value
if (subgroupElect()) {
out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum;
}
}
}

View file

@ -0,0 +1,51 @@
#version 450
#include "common.comp"
#extension GL_KHR_shader_subgroup_arithmetic : require
#extension GL_EXT_debug_printf : enable
// device subgroup size
layout (local_size_x_id = 0) in;
layout(binding = 0) readonly buffer tensorInA { float inA[]; };
layout(binding = 1) readonly buffer tensorInB { float inB[]; };
layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
layout(push_constant) uniform parameter {
uint inAOff;
uint inBOff;
uint outOff;
int ne00;
int ne01;
int ne02;
int ne11;
int ne12;
uint nb01;
uint nb02;
uint nb11;
uint nb12;
uint nb1;
uint nb2;
}
pcs;
// f32 x f32 matrix product: workgroup (x, y, z) produces one output element,
// the dot product of inA row x and inB row y in batch z. The products are
// spread over the invocations of a subgroup and reduced with subgroupAdd().
void main() {
    const uvec3 wg = gl_WorkGroupID;

    // when one operand has fewer batches than the other, several batches of
    // the larger operand share one batch of the smaller one
    uint a_batch = wg.z;
    if (pcs.ne12 > pcs.ne02) {
        a_batch = wg.z / (pcs.ne12 / pcs.ne02);
    }
    uint b_batch = wg.z;
    if (pcs.ne02 > pcs.ne12) {
        b_batch = wg.z / (pcs.ne02 / pcs.ne12);
    }

    const uint a_row = (wg.x*pcs.nb01 + a_batch*pcs.nb02) / 4 + pcs.inAOff; // Based from inA
    const uint b_row = (wg.y*pcs.nb11 + b_batch*pcs.nb12) / 4 + pcs.inBOff; // based from inB

    float partial = 0.0f;
    for (uint k = gl_SubgroupInvocationID.x; k < pcs.ne00; k += gl_SubgroupSize) {
        partial += float(inA[a_row + k]) * float(inB[b_row + k]);
    }

    const float total = subgroupAdd(partial);
    if (subgroupElect()) {
        const uint dst = wg.z*(pcs.nb2/4) + wg.y*(pcs.nb1/4) + wg.x + pcs.outOff;
        out_[dst] = total;
    }
}

View file

@ -0,0 +1,33 @@
#version 450
#include "common.comp"
#define BLOCKS_IN_QUANT QK4_0
#define SIZE_OF_BLOCK sizeof_block_q4_0
#define N_ROWS 4
#include "op_mul_mv_q_n_pre.comp"
// The q4_0 version of this function
// Partial dot product of one q4_0 block with a slice of inB.
//   block_index - index of the block in inA, in units of SIZE_OF_BLOCK bytes
//                 (pcs.inAOff is added here)
//   yb          - index into inB of the first activation for this slice
//   il          - byte offset into the block's quant bytes; the caller in
//                 op_mul_mv_q_n.comp passes 0 or BLOCKS_IN_QUANT/4
// Returns d * sum((q - 8) * y) over the 16 values covered by this call.
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
vec2 acc = vec2(0.0, 0.0);
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
// the fp16 scale occupies the first two bytes of the block
float d = float(u8BufToFloat16(inA, index));
float sumy = 0.0f;
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
// two quant bytes = four nibbles, read as one little-endian 16-bit word
const uint16_t b = u8BufToU16(inA, index + 2 + il + i);
// low nibbles pair with y[i], y[i+1]; high nibbles with the values
// BLOCKS_IN_QUANT/2 further on
const float yl0 = inB[yb + i];
const float yl1 = inB[yb + i + 1];
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
sumy += yl0 + yl1 + yl8 + yl9;
// nibbles are masked in place; the divisions (256, 16, 4096) undo their bit position
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
}
// the "- 8" offset of every quant is applied once through sumy
return d * (sumy * -8.f + acc[0] + acc[1]);
}
#include "op_mul_mv_q_n.comp"

View file

@ -0,0 +1,35 @@
#version 450
#include "common.comp"
#define BLOCKS_IN_QUANT QK4_1
#define SIZE_OF_BLOCK sizeof_block_q4_1
#define N_ROWS 4
#include "op_mul_mv_q_n_pre.comp"
// The q4_1 version of this function
// Partial dot product of one q4_1 block with a slice of inB.
//   block_index - index of the block in inA, in units of SIZE_OF_BLOCK bytes
//                 (pcs.inAOff is added here)
//   yb          - index into inB of the first activation for this slice
//   il          - byte offset into the block's quant bytes; the caller in
//                 op_mul_mv_q_n.comp passes 0 or BLOCKS_IN_QUANT/4
// Returns sum((q * d + m) * y) over the 16 values covered by this call.
float block_q_n_dot_y(uint block_index, uint yb, uint il) {
vec2 acc = vec2(0.0, 0.0);
const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff;
// block header: fp16 d in bytes 0..1, fp16 m in bytes 2..3
float d = float(u8BufToFloat16(inA, index));
float m = float(u8BufToFloat16(inA, index+2));
float sumy = 0.0f;
for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) {
// two quant bytes = four nibbles, read as one little-endian 16-bit word
const uint16_t b = u8BufToU16(inA, index + 4 + il + i);
// low nibbles pair with y[i], y[i+1]; high nibbles with the values
// BLOCKS_IN_QUANT/2 further on
const float yl0 = inB[yb + i];
const float yl1 = inB[yb + i + 1];
const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2];
const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1];
sumy += yl0 + yl1 + yl8 + yl9;
// nibbles are masked in place; the divisions (256, 16, 4096) undo their bit position
acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00);
acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000);
}
// the constant offset m contributes m * sum(y)
return d * (acc[0] + acc[1]) + sumy * m;
}
#include "op_mul_mv_q_n.comp"

View file

@ -0,0 +1,94 @@
#version 450
#include "common.comp"
#define SIZE_OF_BLOCK sizeof_block_q6_k
layout(local_size_x_id = 0) in;
layout(local_size_y_id = 1) in;
layout(local_size_z = 1) in;
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
layout (push_constant) uniform parameter {
uint inAOff;
uint inBOff;
uint outOff;
int ne00;
int ne10;
int ne0;
int ne1;
int ne01;
int gqa;
} pcs;
// q6_k x f32 matrix-vector product.
// Each subgroup computes one output element: the dot product of src0 row `row`
// (stored as q6_k super-blocks) with the src1 vector selected by (r1, r2).
// NOTE(review): `row` is not compared against pcs.ne01 before the final write;
// presumably the dispatch size guarantees it is in range — confirm at the call site.
void main() {
// masks for the four 2-bit fields packed in each qh byte
const uint8_t kmask1 = uint8_t(0x03);
const uint8_t kmask2 = uint8_t(0x0C);
const uint8_t kmask3 = uint8_t(0x30);
const uint8_t kmask4 = uint8_t(0xC0);
// number of super-blocks per row
const uint nb = pcs.ne00/QK_K;
const uint r0 = gl_WorkGroupID.x;
const uint r1 = gl_WorkGroupID.y;
const uint r2 = gl_WorkGroupID.z;
// one row per subgroup
const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
// r2/gqa maps several src1 batches onto one src0 batch
const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
const uint x = row * nb + offset0; // Based from inA without base offset
const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
float sumf = 0;
// bits of invocation ID for gl_SubgroupSize=32:
// x x x x x
// 4 3 2 1 0
// ( tid ) ix
// ip ( il )
const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes
const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0
const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1
const uint ip = tid/8; // first or second half of block (0 or 1)
const uint il = tid%8; // each half has 8 parts, one per scale
const uint n = 4; // 4 scales at a time (and 4 sums)
const uint l0 = n*il; // offset into half-block, 0..28
const uint is = 8*ip + l0/16; // 0, 1, 8, 9
// offsets of this invocation's slice inside src1, ql[] and qh[]
const uint y_offset = 128*ip + l0;
const uint q_offset_l = 64*ip + l0;
const uint q_offset_h = 32*ip + l0;
for (uint i = ix; i < nb; i += block_stride) {
// byte offset of super-block i; within a block ql starts at 0, qh at QK_K/2,
// scales at QK_K/2 + QK_K/4 and d at QK_K/2 + QK_K/4 + QK_K/16
const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff;
const uint qlIndex = q_offset_l;
const uint q2Index = qlIndex + QK_K/8;
const uint qhIndex = q_offset_h;
const uint y = yy + i * QK_K + y_offset;
float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f};
for (uint l = 0; l < n; ++l) {
const uint8_t currentQ1 = inA[baseIndex + qlIndex + l];
const uint8_t currentQ2 = inA[baseIndex + q2Index + l];
const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l];
// rebuild each 6-bit quant (4 bits from ql, 2 bits from qh) and remove the +32 bias
sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32);
sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32);
sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32);
sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32);
}
// super-block scale d times the four per-group int8 scales
float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16);
sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is]));
}
// combine the partial sums of the subgroup; one elected invocation writes the result
const float tot = subgroupAdd(sumf);
if (subgroupElect()) {
out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
}
}

View file

@ -0,0 +1,73 @@
#version 450
#include "common.comp"
#include "op_mul_mv_q_n_pre.comp"
#define SIZE_OF_D 2
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
#define NB_Q8_0 8
// q8_0 x f32 matrix-vector product.
// Each subgroup produces N_DST consecutive output rows. Within a subgroup,
// four invocations share one q8_0 block (NB_Q8_0 quants each) and the
// per-invocation partial sums are combined with subgroupAdd().
void main() {
    // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
    if (gl_SubgroupInvocationID > 31)
        return;

    const int nr  = N_DST;
    const int nsg = N_SIMDGROUP;
    const int nw  = N_SIMDWIDTH;

    // number of q8_0 blocks per row
    const int nb = pcs.ne00/QK8_0;
    const uint r0 = gl_WorkGroupID.x;
    const uint r1 = gl_WorkGroupID.y;
    const uint im = gl_WorkGroupID.z;

    const uint first_row = (r0 * nsg + gl_SubgroupID) * nr;

    // split the batch index; dividing by r2/r3 maps several src1 batches onto
    // the same src0 batch
    const uint i12 = im%pcs.ne12;
    const uint i13 = im/pcs.ne12;

    const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);

    const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA
    const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB

    float yl[NB_Q8_0];
    float sumf[N_DST]={0.f, 0.f, 0.f, 0.f};

    const uint ix = gl_SubgroupInvocationID.x/4; // block handled by this invocation
    const uint il = gl_SubgroupInvocationID.x%4; // quarter of that block

    uint yb = y + ix * QK8_0 + NB_Q8_0*il;

    // each thread in a SIMD group deals with NB_Q8_0 quants at a time
    for (uint ib = ix; ib < nb; ib += nw/4) {
        // cache this invocation's activations; they are reused for every row
        for (int i = 0; i < NB_Q8_0; ++i) {
            yl[i] = inB[yb + i];
        }

        for (int row = 0; row < nr; row++) {
            const uint block_offset = (ib+row*nb) * sizeof_block_q8_0;
            float sumq = 0.f;
            for (int iq = 0; iq < NB_Q8_0; ++iq) {
                // quants follow the SIZE_OF_D-byte fp16 scale at the start of the block
                const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]);
                sumq += qs_iq * yl[iq];
            }
            const float16_t d = u8BufToFloat16(inA, x + block_offset);
            sumf[row] += sumq*d;
        }

        yb += NB_Q8_0 * nw;
    }

    for (int row = 0; row < nr; ++row) {
        const float tot = subgroupAdd(sumf[row]);
        if (subgroupElect() && first_row + row < pcs.ne01) {
            // pcs.outOff must be applied like in every other kernel of this
            // backend; without it the result is written to the wrong element
            // whenever the destination tensor has a non-zero offset.
            out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
        }
    }
}

View file

@ -0,0 +1,48 @@
// Shared body of the q4_0 / q4_1 matrix-vector shaders. The including file
// defines BLOCKS_IN_QUANT, SIZE_OF_BLOCK, N_ROWS and block_q_n_dot_y().
// Each subgroup produces N_ROWS consecutive output rows; two invocations share
// one quant block and the partial sums are combined with subgroupAdd().
void main() {
// NB: hack to make compatible with AMD GPUs that have a subgroup size of 64
if (gl_SubgroupInvocationID > 31)
return;
// number of quant blocks per row
const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT);
const uint r0 = gl_WorkGroupID.x;
const uint r1 = gl_WorkGroupID.y;
const uint im = gl_WorkGroupID.z;
const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS;
// split the batch index; dividing by r2/r3 maps several src1 batches onto the
// same src0 batch
const uint i12 = im%pcs.ne12;
const uint i13 = im/pcs.ne12;
const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
const uint x = offset0; // Based from inA without base offset
const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
// NOTE(review): the initializer has four entries, so N_ROWS is expected to be 4
float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
// ix: block handled by this invocation; il: which half of the block's quant bytes
const uint ix = gl_SubgroupInvocationID/2;
const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2);
uint yb = y + ix * BLOCKS_IN_QUANT + il;
//debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n",
// gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize,
// gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z);
// 32 active invocations / 2 per block = 16 blocks per iteration
for (uint ib = ix; ib < nb; ib += 16) {
for (int row = 0; row < N_ROWS; row++) {
const uint block_index = x + ib + row * nb;
sumf[row] += block_q_n_dot_y(block_index, yb, il);
}
yb += BLOCKS_IN_QUANT * 16;
}
for (int row = 0; row < N_ROWS; ++row) {
const float tot = subgroupAdd(sumf[row]);
// rows past ne01 are computed but never written
if (first_row + row < pcs.ne01 && subgroupElect()) {
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot;
}
}
}

View file

@ -0,0 +1,22 @@
// Common preamble for the quantized matrix-vector shaders: workgroup layout,
// buffer bindings and push constants.
// local_size_x comes from specialization constant 0; y and z are fixed at 1.
layout(local_size_x_id = 0) in;
layout(local_size_y = 1) in;
layout(local_size_z = 1) in;
// inA: quantized src0 addressed as raw bytes; inB: f32 src1; out_: f32 result
layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
layout (push_constant) uniform parameter {
// offsets added to indices into inA (bytes), inB and out_ (floats)
uint inAOff;
uint inBOff;
uint outOff;
// tensor extents; the kernels use ne01 as the row bound for writes and
// ne00 / block size as the number of blocks per row
int ne00;
int ne01;
int ne02;
int ne10;
int ne12;
int ne0;
int ne1;
// divisors applied to the src1 batch indices (i12/r2, i13/r3) to select the
// src0 batch
uint r2;
uint r3;
} pcs;

View file

@ -0,0 +1,84 @@
#version 450
#include "common.comp"
layout(local_size_x = 256) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
uint ne00;
uint nb01;
float eps;
} pcs;
shared float sum[gl_WorkGroupSize.x];
// Layer normalization of one row per workgroup:
//   out = (in - mean(in)) / sqrt(variance(in) + eps)
// The row has pcs.ne00 elements; invocations stride over it by the workgroup
// size and combine their partial sums through the shared array `sum`.
void main() {
    // pcs.nb01 is divided by 4 to turn it into an index into the float array in_.
    const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_

    // MEAN
    // parallel sum
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += in_[x+i00];
    }

    // reduce
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }

    // broadcast
    if (gl_LocalInvocationID.x == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();
    const float mean = sum[0];

    // Every invocation must have loaded the mean before any invocation resets
    // its slot of `sum` for the variance pass. Without this barrier,
    // invocation 0 may overwrite sum[0] with 0.0 while other invocations have
    // not read it yet.
    barrier();

    // recenter
    const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        out_[y+i00] = in_[x+i00] - mean;
    }

    // VARIANCE
    // parallel sum
    // Each invocation only reads back the out_ elements it wrote itself in
    // the recenter loop (same start index and stride).
    sum[gl_LocalInvocationID.x] = 0.0;
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00];
    }

    // reduce
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) {
        if (gl_LocalInvocationID.x < i) {
            sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i];
        }
        barrier();
        memoryBarrierShared();
    }

    // broadcast
    if (gl_LocalInvocationID.x == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();
    const float variance = sum[0];

    // `sum` is not written again after this point, so no further barrier is needed.
    const float scale = 1.0f/sqrt(variance + pcs.eps);
    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) {
        out_[y+i00] *= scale;
    }
}

View file

@ -0,0 +1,21 @@
#version 450
#include "common.comp"
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
} pcs;
// ReLU: every workgroup clamps four consecutive floats to be non-negative.
void main() {
    const uint first = gl_WorkGroupID.x * 4;
    for (uint k = 0; k < 4; ++k) {
        const uint idx = first + k;
        const float value = in_[idx + pcs.inOff];
        out_[idx + pcs.outOff] = max(0.0, value);
    }
}

View file

@ -0,0 +1,53 @@
#version 450
#include "common.comp"
layout(local_size_x = 512) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
uint ne00;
uint nb01;
float eps;
} pcs;
shared float sum[gl_WorkGroupSize.x];
// RMS normalization of one row per workgroup:
//   out = in / sqrt(mean(in * in) + eps)
void main() {
    const uint lid = gl_LocalInvocationID.x;
    // Start of this workgroup's row in in_ (pcs.nb01 is divided by 4 to index floats)
    // and in out_ (rows of pcs.ne00 floats).
    const uint src = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff;
    const uint dst = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff;

    // Each invocation sums the squares of its strided share of the row.
    sum[lid] = 0.0;
    for (uint col = lid; col < pcs.ne00; col += gl_WorkGroupSize.x) {
        sum[lid] += in_[src+col] * in_[src+col];
    }

    // Tree reduction through shared memory; the total ends up in sum[0].
    barrier();
    memoryBarrierShared();
    [[unroll]] for (uint stride = gl_WorkGroupSize.x/2; stride > 0; stride /= 2) {
        if (lid < stride) {
            sum[lid] += sum[lid + stride];
        }
        barrier();
        memoryBarrierShared();
    }

    // Invocation 0 turns the total into a mean, which everyone then reads.
    if (lid == 0) {
        sum[0] /= float(pcs.ne00);
    }
    barrier();
    memoryBarrierShared();

    const float scale = 1.0f/sqrt(sum[0] + pcs.eps);
    for (uint col = lid; col < pcs.ne00; col += gl_WorkGroupSize.x) {
        out_[dst+col] = in_[src+col] * scale;
    }
}

View file

@ -0,0 +1,73 @@
#version 450
#include "rope_common.comp"
layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; };
// Rotary position embedding for float16 data. One workgroup processes one row,
// selected by gl_WorkGroupID (x -> i1, y -> i2, z -> i3). The position is read
// from inB at index i2. The nb* push constants are byte strides and are
// divided by 2 to index the float16 arrays.
void main() {
    const uint i3 = gl_WorkGroupID.z;
    const uint i2 = gl_WorkGroupID.y;
    const uint i1 = gl_WorkGroupID.x;

    // Bit value 2 of mode selects the "neox" pairing of rotated elements.
    const bool is_neox = (pcs.mode & 2) != 0;

    float corr_dims[2];
    rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);

    // The angle is multiplied by theta_scale after every rotated pair.
    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);

    const int p = inB[pcs.inBOff + i2];
    float theta = float(p);

    if (!is_neox) {
        // Rotate adjacent pairs (i0, i0+1) across the whole row.
        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
            float cos_theta, sin_theta;
            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
            theta *= theta_scale;

            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_

            const float x0 = float(inA[src]);
            const float x1 = float(inA[src+1]);

            out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
            out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
        }
    } else {
        // Rotate pairs (ic/2, ic/2 + n_dims/2) within the first n_dims elements.
        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
            // rope_yarn receives ic itself as the rotation index.
            const uint cur_rot = ic;
            float cos_theta, sin_theta;
            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
            theta *= theta_scale;

            const uint i0 = ic/2;
            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_

            const float x0 = float(inA[src]);
            const float x1 = float(inA[src+pcs.n_dims/2]);

            out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
            out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
        }

        // Elements from n_dims up to ne0 are copied through unrotated, two at a time.
        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
            const uint i0 = ic;
            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_

            out_[dst_data + 0] = inA[src + 0];
            out_[dst_data + 1] = inA[src + 1];
        }
    }
}

View file

@ -0,0 +1,73 @@
#version 450
#include "rope_common.comp"
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
// Rotary position embedding for float32 data. One workgroup processes one row,
// selected by gl_WorkGroupID (x -> i1, y -> i2, z -> i3). The position is read
// from inB at index i2. The nb* push constants are byte strides and are
// divided by 4 to index the float arrays.
void main() {
    const uint i3 = gl_WorkGroupID.z;
    const uint i2 = gl_WorkGroupID.y;
    const uint i1 = gl_WorkGroupID.x;

    // Bit value 2 of mode selects the "neox" pairing of rotated elements.
    const bool is_neox = (pcs.mode & 2) != 0;

    float corr_dims[2];
    rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);

    // The angle is multiplied by theta_scale after every rotated pair.
    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);

    const int p = inB[pcs.inBOff + i2];
    float theta = float(p);

    if (!is_neox) {
        // Rotate adjacent pairs (i0, i0+1) across the whole row.
        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
            float cos_theta, sin_theta;
            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
            theta *= theta_scale;

            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_

            const float x0 = inA[src];
            const float x1 = inA[src+1];

            out_[dst_data] = x0*cos_theta - x1*sin_theta;
            out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
        }
    } else {
        // Rotate pairs (ic/2, ic/2 + n_dims/2) within the first n_dims elements.
        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
            // rope_yarn receives ic itself as the rotation index.
            const uint cur_rot = ic;
            float cos_theta, sin_theta;
            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
            theta *= theta_scale;

            const uint i0 = ic/2;
            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_

            const float x0 = inA[src];
            const float x1 = inA[src+pcs.n_dims/2];

            out_[dst_data] = x0*cos_theta - x1*sin_theta;
            out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
        }

        // Elements from n_dims up to ne0 are copied through unrotated, two at a time.
        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
            const uint i0 = ic;
            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_

            out_[dst_data + 0] = inA[src + 0];
            out_[dst_data + 1] = inA[src + 1];
        }
    }
}

View file

@ -0,0 +1,19 @@
#version 450
#include "common.comp"
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
float scale;
} pcs;
// Scales a single float per workgroup by pcs.scale.
void main() {
    const uint idx = gl_WorkGroupID.x;
    const float value = in_[idx + pcs.inOff];
    out_[idx + pcs.outOff] = value * pcs.scale;
}

View file

@ -0,0 +1,23 @@
#version 450
#include "common.comp"
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
float scale;
} pcs;
// Scales eight consecutive floats per workgroup by pcs.scale.
void main() {
    const uint first = gl_WorkGroupID.x * 8;
    for (uint k = 0; k < 8; ++k) {
        const uint idx = first + k;
        const float value = in_[idx + pcs.inOff];
        out_[idx + pcs.outOff] = value * pcs.scale;
    }
}

View file

@ -0,0 +1,22 @@
#version 450
#include "common.comp"
layout(local_size_x = 1) in;
layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
layout(push_constant) uniform PushConstants {
uint inOff;
uint outOff;
} pcs;
// SiLU, y / (1 + exp(-y)), applied to four consecutive floats per workgroup.
void main() {
    const uint first = gl_WorkGroupID.x * 4;
    for (uint k = 0; k < 4; ++k) {
        const uint idx = first + k;
        const float y = in_[idx + pcs.inOff];
        out_[idx + pcs.outOff] = y / (1.0 + exp(-y));
    }
}

View file

@ -0,0 +1,56 @@
// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4)
#version 450
#include "common.comp"
// Workgroup size in x comes from specialization constant 0.
layout(local_size_x_id = 0) in;
layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
// Not `writeonly`: main() stores the exponentials here and then reads them
// back to divide by the row sum, so the buffer needs read access as well.
layout(binding = 2) buffer restrict tensorOut { float out_[]; };
// Push constants: buffer offsets, the ne0x extents used to locate a row,
// the scale applied to inA, and a flag (non-zero) that enables adding inB.
layout(push_constant) uniform PushConstants {
    uint inAOff;
    uint inBOff;
    uint outOff;
    int ne00;
    int ne01;
    int ne02;
    float scale;
    int mask;
} pcs;
// Softmax over one row of pcs.ne00 elements per workgroup. Each element is
// inA * scale, plus the matching inB element when pcs.mask is non-zero.
// Only the first 32 lanes of a subgroup take part.
void main() {
    if (gl_SubgroupInvocationID > 31)
        return;

    const uint lane = gl_SubgroupInvocationID;
    const uint i01 = gl_WorkGroupID.x;
    const uint i02 = gl_WorkGroupID.y;
    const uint i03 = gl_WorkGroupID.z;

    // Element offset of this row; inA and out_ share it, while the inB row depends on i01 only.
    const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00;
    const uint psrc0 = extra_off + pcs.inAOff; // Based from inA
    const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
    const uint pdst = extra_off + pcs.outOff; // Based from out_

    // Pass 1: row maximum, starting from the bit pattern 0xFF800000 (negative infinity).
    float localMax = uintBitsToFloat(0xFF800000);
    for (uint col = lane; col < pcs.ne00; col += 32) {
        localMax = max(localMax, inA[psrc0 + col]*pcs.scale + (pcs.mask!=0 ? inB[pmask + col] : 0.0f));
    }
    float max_ = subgroupMax(localMax);

    // Pass 2: store exp(value - max) and accumulate the row sum.
    float localSum = 0.0f;
    for (uint col = lane; col < pcs.ne00; col += 32) {
        const float exp_psrc0 = exp(inA[psrc0 + col]*pcs.scale + (pcs.mask!=0 ? inB[pmask + col] : 0.0f) - max_);
        localSum += exp_psrc0;
        out_[pdst + col] = exp_psrc0;
    }
    const float sum = subgroupAdd(localSum);

    // Pass 3: each lane reads back the values it stored and divides by the row sum.
    for (uint col = lane; col < pcs.ne00; col += 32) {
        out_[pdst + col] /= sum;
    }
}

View file

@ -0,0 +1,67 @@
#include "common.comp"
// TODO: use a local size of 32 or more (Metal uses 1024)
layout(local_size_x = 1) in;
// Push constants for the rope shaders.
// NOTE(review): presumably shared by the f16 and f32 rope shaders through #include — confirm.
layout (push_constant) uniform parameter {
// Offsets into the inA, inB and out_ buffers.
uint inAOff;
uint inBOff;
uint outOff;
// n_dims, n_orig_ctx, freq_base, beta_fast and beta_slow are the inputs of rope_yarn_corr_dims() below.
int n_dims;
int mode;
int n_orig_ctx;
float freq_base;
// freq_scale, ext_factor and attn_factor are passed through to rope_yarn() by its callers.
float freq_scale;
float ext_factor;
float attn_factor;
float beta_fast;
float beta_slow;
// NOTE(review): nb00..nb03 and nb0..nb3 look like source and destination strides — confirm.
uint nb00;
uint nb01;
uint nb02;
uint nb03;
int ne0;
uint nb0;
uint nb1;
uint nb2;
uint nb3;
} pcs;
// Linear ramp over i0/2: 1 at or below `low`, 0 at or above `high`.
// The denominator is kept at 0.001 or more so it can never be zero.
float rope_yarn_ramp(const float low, const float high, const float i0) {
    const float span = max(0.001f, high - low);
    const float t = (i0 / 2 - low) / span;
    const float clipped = min(1.0f, max(0.0f, t));
    return 1.0f - clipped;
}
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
//
// Writes cos/sin of the rotation angle, both multiplied by the magnitude scale.
// With ext_factor == 0 the angle is freq_scale * theta_extrap and mscale is used
// unchanged. Otherwise the angle blends the interpolated and extrapolated
// angles using the ramp over corr_dims, and mscale is adjusted.
void rope_yarn(
    float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale,
    out float cos_theta, out float sin_theta
) {
    // Get n-d rotational scaling corrected for extrapolation
    const float theta_interp = freq_scale * theta_extrap;
    float angle = theta_interp;
    if (ext_factor != 0.0f) {
        const float blend = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
        angle = theta_interp * (1 - blend) + theta_extrap * blend;
        // Get n-d magnitude scaling corrected for interpolation
        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
    }
    cos_theta = cos(angle) * mscale;
    sin_theta = sin(angle) * mscale;
}
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
    const float numerator = n_dims * log(n_orig_ctx / (n_rot * TWOPI_F));
    const float denominator = 2 * log(base);
    return numerator / denominator;
}
// Computes the start (dims[0]) and end (dims[1]) correction dimensions.
// The start is floored and kept at 0 or above; the end is ceiled and kept at
// n_dims - 1 or below.
void rope_yarn_corr_dims(
    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, out float dims[2]
) {
    const float start = floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base));
    const float end = ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base));
    dims[0] = max(0.0f, start);
    dims[1] = min(n_dims - 1.0f, end);
}

View file

@ -15,6 +15,8 @@
# include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
# include "ggml-sycl.h"
#elif defined(GGML_USE_KOMPUTE)
# include "ggml-kompute.h"
#endif
#ifdef GGML_USE_METAL
@ -1158,10 +1160,10 @@ struct llama_mlock {
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
"Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) const {
@ -1313,6 +1315,11 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type();
#elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(gpu);
if (buft == nullptr) {
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
}
#endif
if (buft == nullptr) {
@ -4107,7 +4114,7 @@ static bool llm_load_tensors(
}
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
try {
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
@ -4128,6 +4135,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
return 0;
}
#ifdef GGML_USE_KOMPUTE
// When GPU layers were requested, keep them only if the architecture is LLAMA
// or FALCON and the file type is F32, F16, Q4_0 or Q4_1. Otherwise log a warning
// and set n_gpu_layers to 0, which is why params is taken by non-const reference.
if (params.n_gpu_layers > 0 && (
!(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
|| !(
model.ftype == LLAMA_FTYPE_ALL_F32 ||
model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
)
)) {
// TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
params.n_gpu_layers = 0;
}
#endif
if (!llm_load_tensors(
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
params.progress_callback, params.progress_callback_user_data
@ -10259,6 +10282,16 @@ struct llama_context * llama_new_context_with_model(
}
ctx->backends.push_back(backend);
}
#elif defined(GGML_USE_KOMPUTE)
if (model->n_gpu_layers > 0) {
auto * backend = ggml_backend_kompute_init(model->main_gpu);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
#endif
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {

View file

@ -49,7 +49,8 @@
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 4
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif

View file

@ -370,12 +370,15 @@ struct test_case {
printf(" %s(%s): ", op_desc(out).c_str(), vars().c_str());
fflush(stdout);
// check if backends support op
// check if the backends support the ops
bool supported = true;
for (ggml_backend_t backend : {backend1, backend2}) {
if (!ggml_backend_supports_op(backend, out)) {
printf("not supported [%s] ", ggml_backend_name(backend));
supported = false;
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (!ggml_backend_supports_op(backend, t)) {
printf("not supported [%s] ", ggml_backend_name(backend));
supported = false;
break;
}
}
}
if (!supported) {
@ -626,6 +629,13 @@ struct test_unary : public test_case {
ggml_tensor * out = ggml_unary(ctx, in, op);
return out;
}
void initialize_tensors(ggml_context * ctx) override {
    // test extended range of values to check for NaNs in GELU
    ggml_tensor * cur = ggml_get_first_tensor(ctx);
    while (cur != NULL) {
        init_tensor_uniform(cur, -150.f, 150.f);
        cur = ggml_get_next_tensor(ctx, cur);
    }
}
};
// GGML_OP_GET_ROWS
@ -1066,18 +1076,24 @@ struct test_diag_mask_inf : public test_case {
struct test_soft_max : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
const float scale;
const bool mask;
std::string vars() override {
return VARS_TO_STR2(type, ne);
return VARS_TO_STR4(type, ne, scale, mask);
}
test_soft_max(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 10, 10, 10})
: type(type), ne(ne) {}
std::array<int64_t, 4> ne = {10, 10, 10, 10},
float scale = 1.0f,
bool mask = false)
: type(type), ne(ne), scale(scale), mask(mask) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * out = ggml_soft_max(ctx, a);
ggml_tensor * b = nullptr;
if (mask) { b = ggml_new_tensor_2d(ctx, type, ne[0], ne[1]); }
ggml_tensor * out = ggml_soft_max_ext(ctx, a, b, scale);
return out;
}
};
@ -1474,6 +1490,393 @@ struct test_moe : public test_case {
}
};
// Normalization variants understood by test_llm::llm_build_norm.
enum llm_norm_type {
LLM_NORM, // built with ggml_norm
LLM_NORM_RMS, // built with ggml_rms_norm
};
// Hyperparameters for the LLM test graphs. The non-static members are filled
// by aggregate initialization in declaration order; the static constexpr
// members are fixed for every test.
struct llama_hparams {
uint32_t n_vocab;
uint32_t n_embd;
uint32_t n_head;
uint32_t n_head_kv;
static constexpr uint32_t n_layer = 1;
uint32_t n_rot;
uint32_t n_embd_head; // dimension of values (d_v)
uint32_t n_ff;
float f_norm_eps;
float f_norm_rms_eps;
// cparams
static constexpr uint32_t n_ctx = 512; // user-specified context size
static constexpr uint32_t n_orig_ctx = n_ctx;
// batch
int32_t n_tokens;
// llm_build_context
static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx)
static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache
uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads
return n_embd_head * n_head_kv;
}
};
// LLM base class
// Holds the hyperparameters and the graph-building helpers shared by the
// model-shaped tests. The constructor is protected, so only derived tests
// can create instances.
struct test_llm : public test_case {
llama_hparams hp;
protected:
test_llm(llama_hparams hp)
: hp(std::move(hp)) {
}
public:
// Normalizes cur (ggml_norm or ggml_rms_norm, chosen by type), multiplies by
// the weight mw and, when mb is non-null, adds the bias mb.
struct ggml_tensor * llm_build_norm(
struct ggml_context * ctx,
struct ggml_tensor * cur,
struct ggml_tensor * mw,
struct ggml_tensor * mb,
llm_norm_type type) {
switch (type) {
case LLM_NORM: cur = ggml_norm (ctx, cur, hp.f_norm_eps); break;
case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); break;
}
cur = ggml_mul(ctx, cur, mw);
if (mb) {
cur = ggml_add(ctx, cur, mb);
}
return cur;
}
// Creates copy nodes that write k_cur and the transposed v_cur into views of
// the cache tensors k_l and v_l, positioned at hp.kv_head.
// NOTE(review): the ggml_cpy results are neither returned nor stored here, so
// whether these copies become part of the evaluated graph depends on how the
// caller builds it — confirm.
void llm_build_kv_store(
struct ggml_context * ctx,
struct ggml_tensor * k_l,
struct ggml_tensor * v_l,
struct ggml_tensor * k_cur,
struct ggml_tensor * v_cur) {
// compute the transposed [n_tokens, n_embd] V matrix
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, hp.n_embd_gqa(), hp.n_tokens));
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens*hp.n_embd_gqa(),
(ggml_row_size(k_l->type, hp.n_embd_gqa()))*hp.kv_head);
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(),
( hp.n_ctx)*ggml_element_size(v_l),
(hp.kv_head)*ggml_element_size(v_l));
// important: storing RoPE-ed version of K in the KV cache!
ggml_cpy(ctx, k_cur, k_cache_view);
ggml_cpy(ctx, v_cur_t, v_cache_view);
}
// Builds the attention output: softmax(K*Q scaled by kq_scale, plus kq_mask)
// applied to V, heads merged back into rows of n_embd_head*n_head values, then
// projected through a newly created Q4_0 weight tensor.
struct ggml_tensor * llm_build_kqv(
struct ggml_context * ctx,
struct ggml_tensor * k_l,
struct ggml_tensor * v_l,
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
float kq_scale) {
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
// view the first hp.n_kv cache entries of K, one slice per kv head
struct ggml_tensor * k =
ggml_view_3d(ctx, k_l,
hp.n_embd_head, hp.n_kv, hp.n_head_kv,
ggml_row_size(k_l->type, hp.n_embd_gqa()),
ggml_row_size(k_l->type, hp.n_embd_head),
0);
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
// split cached v into n_head heads
struct ggml_tensor * v =
ggml_view_3d(ctx, v_l,
hp.n_kv, hp.n_embd_head, hp.n_head_kv,
ggml_element_size(v_l)*hp.n_ctx,
ggml_element_size(v_l)*hp.n_ctx*hp.n_embd_head,
0);
struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head*hp.n_head, hp.n_tokens);
// output projection weight, created here so it is part of the test context
struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
cur = ggml_mul_mat(ctx, wo, cur);
return cur;
}
// Fills every tensor in the context: I32 tensors receive hp.n_tokens random
// values in [0, hp.n_ctx), all other tensors use init_tensor_uniform defaults.
void initialize_tensors(ggml_context * ctx) override {
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->type == GGML_TYPE_I32) {
// pos
std::vector<int> data(hp.n_tokens);
for (int i = 0; i < hp.n_tokens; i++) {
data[i] = rand() % hp.n_ctx;
}
ggml_backend_tensor_set(t, data.data(), 0, hp.n_tokens * sizeof(int));
} else {
init_tensor_uniform(t);
}
}
}
};
// Llama
// Graph test shaped like a LLaMA layer stack (hp.n_layer layers): RMS norm,
// RoPE self-attention with separate Q/K/V weights, a gated SiLU feed-forward
// block, then a final RMS norm and output projection. Weights are Q4_0.
struct test_llama : public test_llm {
// RoPE parameters passed to ggml_rope_custom
static constexpr float freq_base = 10000.0f;
static constexpr float freq_scale = 1.0f;
static constexpr float ext_factor = 0.0f;
static constexpr float attn_factor = 1.0f;
static constexpr float beta_fast = 32.0f;
static constexpr float beta_slow = 1.0f;
// The whole graph is reported under a single name instead of the final op.
std::string op_desc(ggml_tensor * t) override {
GGML_UNUSED(t);
return "LLAMA";
}
std::string vars() override {
auto n_tokens = hp.n_tokens;
return VARS_TO_STR1(n_tokens);
}
// Error threshold used for this test.
double max_nmse_err() override {
return 2e-3;
}
test_llama(int n_tokens = 1)
: test_llm({
/*n_vocab =*/ 32000,
/*n_embd =*/ 3200,
/*n_head =*/ 32,
/*n_head_kv =*/ 32,
/*n_rot =*/ 100,
/*n_embd_head =*/ 100,
/*n_ff =*/ 8640,
/*f_norm_eps =*/ 0.f,
/*f_norm_rms_eps =*/ 1e-5f,
/*n_tokens =*/ n_tokens,
}) {
}
ggml_tensor * build_graph(ggml_context * ctx) override {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
// KV cache tensors: 1638400 = n_ctx (512) * n_embd_gqa() (100 * 32) elements
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
for (uint32_t il = 0; il < hp.n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS);
// self-attention
{
ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd);
ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa());
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_mul_mat(ctx, wq, cur);
struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);
Qcur = ggml_rope_custom(
ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos,
hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_custom(
ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos,
hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
}
// residual connection around the attention block
struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA);
// feed-forward network
ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS);
ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
// down(silu(gate(x)) * up(x)), followed by the residual add
struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur);
cur = ggml_mul_mat(ctx, ffn_gate, cur);
cur = ggml_silu(ctx, cur);
cur = ggml_mul(ctx, cur, tmp);
cur = ggml_mul_mat(ctx, ffn_down, cur);
cur = ggml_add(ctx, cur, ffn_inp);
// input for next layer
inpL = cur;
}
cur = inpL;
ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS);
// lm_head
ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_vocab);
cur = ggml_mul_mat(ctx, output, cur);
return cur;
}
};
// Falcon
// Graph test shaped like a Falcon layer stack (hp.n_layer layers): layer norm
// with bias, a fused QKV projection, neox-mode RoPE attention, and a GELU
// feed-forward block fed from the same normalized input as attention. The two
// branch outputs and the layer input are summed. Layer weights are Q4_0 and
// the output projection is Q8_0.
struct test_falcon : public test_llm {
// RoPE parameters passed to ggml_rope_custom
static constexpr float freq_base = 10000.0f;
static constexpr float freq_scale = 1.0f;
static constexpr float ext_factor = 0.0f;
static constexpr float attn_factor = 1.0f;
static constexpr float beta_fast = 32.0f;
static constexpr float beta_slow = 1.0f;
// The whole graph is reported under a single name instead of the final op.
std::string op_desc(ggml_tensor * t) override {
GGML_UNUSED(t);
return "FALCON";
}
std::string vars() override {
auto n_tokens = hp.n_tokens;
return VARS_TO_STR1(n_tokens);
}
// Error threshold used for this test.
double max_nmse_err() override {
return 2e-3;
}
test_falcon(int n_tokens = 1)
: test_llm({
/*n_vocab =*/ 32000,
/*n_embd =*/ 3200,
/*n_head =*/ 50,
/*n_head_kv =*/ 1,
/*n_rot =*/ 64,
/*n_embd_head =*/ 64,
/*n_ff =*/ 8640,
/*f_norm_eps =*/ 1e-5f,
/*f_norm_rms_eps =*/ 0.f,
/*n_tokens =*/ n_tokens,
}) {
}
ggml_tensor * build_graph(ggml_context * ctx) override {
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
// KV cache tensors; with these hparams n_ctx * n_embd_gqa() is 512 * 64 = 32768,
// so 1638400 elements is larger than that product
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
for (uint32_t il = 0; il < hp.n_layer; ++il) {
// norm
ggml_tensor * attn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
ggml_tensor * attn_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM);
// self-attention
{
cur = attn_norm;
ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2*hp.n_embd_gqa());
cur = ggml_mul_mat(ctx, wqkv, cur);
// slice the fused result into Q (n_embd wide), then K and V (n_embd_gqa wide each)
struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd, hp.n_tokens, cur->nb[1], 0*sizeof(float)*(hp.n_embd)));
struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd)));
struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd + hp.n_embd_gqa())));
Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens);
Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);
// using mode = 2 for neox mode
Qcur = ggml_rope_custom(
ctx, Qcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_custom(
ctx, Kcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur);
cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head)));
}
// attention output, kept for the final sum (no residual has been added yet)
struct ggml_tensor * ffn_inp = cur;
// feed forward
{
ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff);
ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd);
// the feed-forward branch starts from attn_norm, not from the attention output
cur = attn_norm;
cur = ggml_mul_mat(ctx, ffn_up, cur);
cur = ggml_gelu(ctx, cur);
cur = ggml_mul_mat(ctx, ffn_down, cur);
}
// feed-forward output + attention output + layer input
cur = ggml_add(ctx, cur, ffn_inp);
cur = ggml_add(ctx, cur, inpL);
// input for next layer
inpL = cur;
}
cur = inpL;
ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
ggml_tensor * output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd);
cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM);
// lm_head
ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, hp.n_embd, hp.n_vocab);
cur = ggml_mul_mat(ctx, output, cur);
return cur;
}
};
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
std::vector<std::unique_ptr<test_case>> test_cases;
std::default_random_engine rng(0);
@ -1626,6 +2029,9 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
exponent <<= 1;
}
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, 0.1f));
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, 0.1f, true));
for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512)); // llama 7B
test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512)); // llama 13B
@ -1662,6 +2068,14 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
//test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
#endif
// these tests are disabled to save execution time, but they can be handy for debugging
#if 0
test_cases.emplace_back(new test_llama(1));
test_cases.emplace_back(new test_llama(2));
test_cases.emplace_back(new test_falcon(1));
test_cases.emplace_back(new test_falcon(2));
#endif
// run tests
if (mode == MODE_TEST) {
ggml_backend_t backend_cpu = ggml_backend_cpu_init();

View file

@ -1,3 +1,7 @@
/* NOTE(review): presumably a compile-only check that these headers build as C — confirm. */
#include "llama.h"
#ifdef GGML_USE_KOMPUTE
#include "ggml-kompute.h"
#endif
/* Empty body: the program performs no work at run time. */
int main(void) {}