Merge branch 'ggerganov:master' into hk

This commit is contained in:
Henry Kroll III 2024-08-30 14:36:27 -08:00 committed by GitHub
commit cf59a8d50f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
115 changed files with 13092 additions and 8058 deletions

View file

@ -1,18 +1,16 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
@ -24,13 +22,12 @@ WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
RUN make -j$(nproc)
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc) && \
cp build/bin/* .
ENTRYPOINT ["/app/.devops/tools.sh"]

View file

@ -0,0 +1,44 @@
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
FROM cosdt/cann:$ASCEND_VERSION AS build
WORKDIR /app
COPY . .
RUN yum install -y gcc g++ cmake make
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
# find libascend_hal.so, because the drive hasn`t been mounted.
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
RUN echo "Building with static libs" && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --config Release --target llama-cli
# TODO: use image with NNRT
FROM cosdt/cann:$ASCEND_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENV LC_ALL=C.utf8
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
ENTRYPOINT ["/llama-cli" ]

View file

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
@ -8,28 +8,30 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git
apt-get install -y build-essential git cmake
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
RUN make -j$(nproc) llama-cli
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-cli -j$(nproc)
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/llama-cli /llama-cli
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-cli /llama-cli
ENTRYPOINT [ "/llama-cli" ]

View file

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
@ -8,31 +8,34 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default
RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev
apt-get install -y build-essential git cmake libcurl4-openssl-dev
WORKDIR /app
COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV GGML_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
RUN make -j$(nproc) llama-server
# Use the default CUDA archs if not specified
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
fi && \
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
cmake --build build --config Release --target llama-server -j$(nproc)
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl
COPY --from=build /app/llama-server /llama-server
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
COPY --from=build /app/build/src/libllama.so /libllama.so
COPY --from=build /app/build/bin/llama-server /llama-server
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

View file

@ -26,6 +26,8 @@ RUN apt-get update && \
COPY --from=build /app/build/bin/llama-server /llama-server
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

View file

@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
ENV GGML_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
# Enable cURL
ENV LLAMA_CURL=1

View file

@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \
rm -rf /app
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

View file

@ -21,6 +21,8 @@ RUN apt-get update && \
COPY --from=build /app/llama-server /llama-server
ENV LC_ALL=C.utf8
# Must be set to 0.0.0.0 so it can listen to requests from host machine
ENV LLAMA_ARG_HOST=0.0.0.0
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

2
.ecrc
View file

@ -1,5 +1,5 @@
{
"Exclude": ["^\\.gitmodules$"],
"Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
"Disable": {
"IndentSize": true
}

View file

@ -1,3 +1,6 @@
# TODO: there have been some issues with the workflow, so disabling for now
# https://github.com/ggerganov/llama.cpp/issues/7893
#
# Benchmark
name: Benchmark

View file

@ -96,21 +96,12 @@ jobs:
env:
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
- name: Build and push Docker image (versioned)
- name: Build and push Docker image (tagged + versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v4
uses: docker/build-push-action@v6
with:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
- name: Build and push Docker image (tagged)
uses: docker/build-push-action@v4
with:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
file: ${{ matrix.config.dockerfile }}

3
.gitignore vendored
View file

@ -129,3 +129,6 @@ poetry.toml
# Scripts
!/scripts/install-oneapi.bat
# Test models for lora adapters
/lora-tests

View file

@ -28,6 +28,7 @@
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
{
"name": "arm64-windows-msvc", "hidden": true,
@ -60,6 +61,8 @@
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
]
}

View file

@ -105,6 +105,8 @@ Typically finetunes of the base models below are supported as well.
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
@ -424,6 +426,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
| [CUDA](./docs/build.md#cuda) | Nvidia GPU |
| [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
| [Vulkan](./docs/build.md#vulkan) | GPU |
| [CANN](./docs/build.md#cann) | Ascend NPU |
## Tools

View file

@ -13,6 +13,9 @@
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>"
@ -40,7 +43,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
@ -52,6 +55,10 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi
if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi
## helpers
# download a file if it does not exist or if it is outdated
@ -107,7 +114,7 @@ function gg_run_ctest_debug {
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@ -138,7 +145,7 @@ function gg_run_ctest_release {
gg_check_build_requirements
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@ -266,7 +273,6 @@ function gg_sum_ctest_with_model_release {
}
# open_llama_7b_v2
# requires: GG_BUILD_CUDA
function gg_run_open_llama_7b_v2 {
cd ${SRC}
@ -290,8 +296,8 @@ function gg_run_open_llama_7b_v2 {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -425,7 +431,7 @@ function gg_run_pythia_1_4b {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -535,7 +541,6 @@ function gg_sum_pythia_1_4b {
}
# pythia_2_8b
# requires: GG_BUILD_CUDA
function gg_run_pythia_2_8b {
cd ${SRC}
@ -556,8 +561,8 @@ function gg_run_pythia_2_8b {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -692,7 +697,7 @@ function gg_run_embd_bge_small {
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
@ -761,7 +766,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
fi
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
test $ret -eq 0 && gg_run pythia_1_4b
else
test $ret -eq 0 && gg_run pythia_2_8b

View file

@ -77,6 +77,41 @@
using json = nlohmann::ordered_json;
//
// Environment variable utils
//
template<typename T>
static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
get_env(std::string name, T & target) {
char * value = std::getenv(name.c_str());
target = value ? std::string(value) : target;
}
template<typename T>
static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
get_env(std::string name, T & target) {
char * value = std::getenv(name.c_str());
target = value ? std::stoi(value) : target;
}
template<typename T>
static typename std::enable_if<std::is_floating_point<T>::value, void>::type
get_env(std::string name, T & target) {
char * value = std::getenv(name.c_str());
target = value ? std::stof(value) : target;
}
template<typename T>
static typename std::enable_if<std::is_same<T, bool>::value, void>::type
get_env(std::string name, T & target) {
char * value = std::getenv(name.c_str());
if (value) {
std::string val(value);
target = val == "1" || val == "true";
}
}
//
// CPU utils
//
@ -110,8 +145,34 @@ int32_t cpu_get_num_physical_cores() {
if (result == 0) {
return num_physical_cores;
}
#elif defined(_WIN32)
//TODO: Implement
#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
// TODO: windows + arm64 + mingw64
unsigned int n_threads_win = std::thread::hardware_concurrency();
unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
DWORD buffer_size = 0;
if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
return default_threads;
}
}
std::vector<char> buffer(buffer_size);
if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
return default_threads;
}
int32_t num_physical_cores = 0;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
while (buffer_size > 0) {
if (info->Relationship == RelationProcessorCore) {
num_physical_cores += info->Processor.GroupCount;
}
buffer_size -= info->Size;
info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
}
return num_physical_cores > 0 ? num_physical_cores : default_threads;
#endif
unsigned int n_threads = std::thread::hardware_concurrency();
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
@ -190,16 +251,61 @@ int32_t cpu_get_num_math() {
return cpu_get_num_physical_cores();
}
// Helper for setting process priority
#if defined(_WIN32)
bool set_process_priority(enum ggml_sched_priority prio) {
if (prio == GGML_SCHED_PRIO_NORMAL) {
return true;
}
DWORD p = NORMAL_PRIORITY_CLASS;
switch (prio) {
case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
}
if (!SetPriorityClass(GetCurrentProcess(), p)) {
fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
return false;
}
return true;
}
#else // MacOS and POSIX
#include <sys/types.h>
#include <sys/resource.h>
bool set_process_priority(enum ggml_sched_priority prio) {
if (prio == GGML_SCHED_PRIO_NORMAL) {
return true;
}
int p = 0;
switch (prio) {
case GGML_SCHED_PRIO_NORMAL: p = 0; break;
case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
case GGML_SCHED_PRIO_HIGH: p = -10; break;
case GGML_SCHED_PRIO_REALTIME: p = -20; break;
}
if (!setpriority(PRIO_PROCESS, 0, p)) {
fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
return false;
}
return true;
}
#endif
//
// CLI argument parsing
//
void gpt_params_handle_hf_token(gpt_params & params) {
if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
params.hf_token = std::getenv("HF_TOKEN");
}
}
void gpt_params_handle_model_default(gpt_params & params) {
if (!params.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
@ -222,6 +328,30 @@ void gpt_params_handle_model_default(gpt_params & params) {
}
}
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
int32_t n_set = 0;
if (cpuparams.n_threads < 0) {
// Assuming everything about cpuparams is invalid
if (role_model != nullptr) {
cpuparams = *role_model;
} else {
cpuparams.n_threads = cpu_get_num_math();
}
}
for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
if (cpuparams.cpumask[i]) {
n_set++;
}
}
if (n_set && n_set < cpuparams.n_threads) {
// Not enough set bits, may experience performance issues.
fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
}
}
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
std::string arg;
@ -241,13 +371,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
}
postprocess_cpu_params(params.cpuparams, nullptr);
postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
gpt_params_handle_model_default(params);
gpt_params_handle_hf_token(params);
if (params.hf_token.empty()) {
get_env("HF_TOKEN", params.hf_token);
}
if (params.escape) {
string_process_escapes(params.prompt);
@ -267,6 +404,32 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
return true;
}
void gpt_params_parse_from_env(gpt_params & params) {
// we only care about server-related params for now
get_env("LLAMA_ARG_MODEL", params.model);
get_env("LLAMA_ARG_MODEL_URL", params.model_url);
get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
get_env("LLAMA_ARG_HF_FILE", params.hf_file);
get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads);
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
get_env("LLAMA_ARG_BATCH", params.n_batch);
get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching);
get_env("LLAMA_ARG_HOST", params.hostname);
get_env("LLAMA_ARG_PORT", params.port);
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
const auto params_org = params; // the example can modify the default params
@ -285,6 +448,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return true;
}
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
size_t dash_loc = range.find('-');
if (dash_loc == std::string::npos) {
fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
return false;
}
size_t start_i;
size_t end_i;
if (dash_loc == 0) {
start_i = 0;
} else {
start_i = std::stoull(range.substr(0, dash_loc));
if (start_i >= GGML_MAX_N_THREADS) {
fprintf(stderr, "Start index out of bounds!\n");
return false;
}
}
if (dash_loc == range.length() - 1) {
end_i = GGML_MAX_N_THREADS - 1;
} else {
end_i = std::stoull(range.substr(dash_loc + 1));
if (end_i >= GGML_MAX_N_THREADS) {
fprintf(stderr, "End index out of bounds!\n");
return false;
}
}
for (size_t i = start_i; i <= end_i; i++) {
boolmask[i] = true;
}
return true;
}
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
// Discard potential 0x prefix
size_t start_i = 0;
if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
start_i = 2;
}
size_t num_digits = mask.length() - start_i;
if (num_digits > 128) num_digits = 128;
size_t end_i = num_digits + start_i;
for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
char c = mask.at(i);
int8_t id = c;
if ((c >= '0' && c <= '9')) {
id -= '0';
} else if (c >= 'a' && c <= 'f') {
id -= 'a' - 10;
} else if (c >= 'A' && c <= 'F') {
id -= 'A' - 10;
} else {
fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
return false;
}
boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
}
return true;
}
#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
@ -301,36 +537,142 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "-t" || arg == "--threads") {
CHECK_ARG
params.n_threads = std::stoi(argv[i]);
if (params.n_threads <= 0) {
params.n_threads = std::thread::hardware_concurrency();
params.cpuparams.n_threads = std::stoi(argv[i]);
if (params.cpuparams.n_threads <= 0) {
params.cpuparams.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-C" || arg == "--cpu-mask") {
CHECK_ARG
std::string mask = argv[i];
params.cpuparams.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
return true;
}
if (arg == "-Cr" || arg == "--cpu-range") {
CHECK_ARG
std::string range = argv[i];
params.cpuparams.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
return true;
}
if (arg == "--prio") {
CHECK_ARG
params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict") {
CHECK_ARG
params.cpuparams.strict_cpu = std::stoul(argv[i]);
return true;
}
if (arg == "--poll") {
CHECK_ARG
params.cpuparams.poll = std::stoul(argv[i]);
return true;
}
if (arg == "-tb" || arg == "--threads-batch") {
CHECK_ARG
params.n_threads_batch = std::stoi(argv[i]);
if (params.n_threads_batch <= 0) {
params.n_threads_batch = std::thread::hardware_concurrency();
params.cpuparams_batch.n_threads = std::stoi(argv[i]);
if (params.cpuparams_batch.n_threads <= 0) {
params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-Cb" || arg == "--cpu-mask-batch") {
CHECK_ARG
std::string mask = argv[i];
params.cpuparams_batch.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
return true;
}
if (arg == "-Crb" || arg == "--cpu-range_batch") {
CHECK_ARG
std::string range = argv[i];
params.cpuparams_batch.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
return true;
}
if (arg == "--prio-batch") {
CHECK_ARG
params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict-batch") {
params.cpuparams_batch.strict_cpu = true;
return true;
}
if (arg == "--poll-batch") {
CHECK_ARG
params.cpuparams_batch.poll = std::stoul(argv[i]);
return true;
}
if (arg == "-td" || arg == "--threads-draft") {
CHECK_ARG
params.n_threads_draft = std::stoi(argv[i]);
if (params.n_threads_draft <= 0) {
params.n_threads_draft = std::thread::hardware_concurrency();
params.draft_cpuparams.n_threads = std::stoi(argv[i]);
if (params.draft_cpuparams.n_threads <= 0) {
params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-Cd" || arg == "--cpu-mask-draft") {
CHECK_ARG
std::string mask = argv[i];
params.draft_cpuparams.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
return true;
}
if (arg == "-Crd" || arg == "--cpu-range-draft") {
CHECK_ARG
std::string range = argv[i];
params.draft_cpuparams.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
return true;
}
if (arg == "--prio-draft") {
CHECK_ARG
params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict-draft") {
params.draft_cpuparams.strict_cpu = true;
return true;
}
if (arg == "--poll-draft") {
CHECK_ARG
params.draft_cpuparams.poll = std::stoul(argv[i]);
return true;
}
if (arg == "-tbd" || arg == "--threads-batch-draft") {
CHECK_ARG
params.n_threads_batch_draft = std::stoi(argv[i]);
if (params.n_threads_batch_draft <= 0) {
params.n_threads_batch_draft = std::thread::hardware_concurrency();
params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
if (params.draft_cpuparams_batch.n_threads <= 0) {
params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
CHECK_ARG
std::string range = argv[i];
params.draft_cpuparams_batch.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
return true;
}
if (arg == "--prio-batch-draft") {
CHECK_ARG
params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict-batch-draft") {
params.draft_cpuparams_batch.strict_cpu = true;
return true;
}
if (arg == "--poll-batch-draft") {
CHECK_ARG
params.draft_cpuparams_batch.poll = std::stoul(argv[i]);
return true;
}
if (arg == "-p" || arg == "--prompt") {
CHECK_ARG
params.prompt = argv[i];
@ -830,7 +1172,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
CHECK_ARG
params.n_gpu_layers_draft = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
@ -1420,11 +1762,40 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
"number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
#ifndef GGML_USE_OPENMP
// these options are available only with the internal threadpool
options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"});
options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"});
options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"});
options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"});
options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"});
options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"});
options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"});
options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"});
options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>",
"Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
options.push_back({ "speculative", " --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"});
options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"});
#endif // GGML_USE_OPENMP
options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
@ -1698,7 +2069,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
printf("usage: %s [options]\n", argv[0]);
@ -1730,11 +2100,17 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
std::string gpt_params_get_system_info(const gpt_params & params) {
std::ostringstream os;
os << "system_info: n_threads = " << params.n_threads;
if (params.n_threads_batch != -1) {
os << " (n_threads_batch = " << params.n_threads_batch << ")";
os << "system_info: n_threads = " << params.cpuparams.n_threads;
if (params.cpuparams_batch.n_threads != -1) {
os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
}
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
// TODO: windows + arm64 + mingw64
DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
#else
os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
#endif
return os.str();
}
@ -1786,13 +2162,19 @@ std::string string_get_sortable_timestamp() {
void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return; // Avoid infinite loop if 'search' is an empty string
return;
}
std::string builder;
builder.reserve(s.length());
size_t pos = 0;
while ((pos = s.find(search, pos)) != std::string::npos) {
s.replace(pos, search.length(), replace);
pos += replace.length();
size_t last_pos = 0;
while ((pos = s.find(search, last_pos)) != std::string::npos) {
builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
}
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
}
void string_process_escapes(std::string & input) {
@ -2244,8 +2626,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_seq_max = params.n_parallel;
cparams.n_batch = params.n_batch;
cparams.n_ubatch = params.n_ubatch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
cparams.n_threads = params.cpuparams.n_threads;
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
cparams.seed = params.seed;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
@ -2271,6 +2654,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
return cparams;
}
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
struct ggml_threadpool_params tpp;
ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
if (params.mask_valid) {
std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
}
tpp.prio = params.priority;
tpp.poll = params.poll;
tpp.strict_cpu = params.strict_cpu;
return tpp;
}
#ifdef LLAMA_USE_CURL
static bool starts_with(const std::string & str, const std::string & prefix) {
@ -2709,12 +3108,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
return text;
}
bool llama_should_add_bos_token(const llama_model * model) {
const int add_bos = llama_add_bos_token(model);
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
}
//
// Chat template utils
//
@ -3266,7 +3659,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);

View file

@ -67,13 +67,18 @@ enum dimre_method {
DIMRE_METHOD_MEAN,
};
struct cpu_params {
int n_threads = -1;
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
bool mask_valid = false; // Default: any CPU
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
bool strict_cpu = false; // Use strict CPU placement
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@ -100,6 +105,11 @@ struct gpt_params {
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
struct cpu_params draft_cpuparams;
struct cpu_params draft_cpuparams_batch;
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
@ -205,7 +215,7 @@ struct gpt_params {
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
std::string hostname = "127.0.0.1";
std::string public_path = "";
@ -268,7 +278,7 @@ struct gpt_params {
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
};
void gpt_params_handle_hf_token(gpt_params & params);
void gpt_params_parse_from_env(gpt_params & params);
void gpt_params_handle_model_default(gpt_params & params);
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
@ -278,6 +288,11 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_params_get_system_info(const gpt_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio);
//
// String utils
//
@ -329,7 +344,8 @@ struct llama_init_result {
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
@ -381,10 +397,6 @@ std::string llama_detokenize(
const std::vector<llama_token> & tokens,
bool special = true);
// Uses the value from the model metadata if possible, otherwise
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);
//
// Chat template utils
//

File diff suppressed because it is too large Load diff

View file

@ -63,6 +63,7 @@ class Model:
model_name: str | None
metadata_override: Path | None
dir_model_card: Path
is_lora: bool
# subclasses should define this!
model_arch: gguf.MODEL_ARCH
@ -70,7 +71,7 @@ class Model:
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
use_temp_file: bool = False, eager: bool = False,
metadata_override: Path | None = None, model_name: str | None = None,
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False):
if type(self) is Model:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
@ -92,6 +93,7 @@ class Model:
self.metadata_override = metadata_override
self.model_name = model_name
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
self.is_lora = is_lora # true if model is used inside convert_lora_to_gguf.py
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
if self.ftype == gguf.LlamaFileType.GUESSED:
@ -295,6 +297,7 @@ class Model:
gguf.MODEL_TENSOR.FFN_GATE_INP,
gguf.MODEL_TENSOR.POS_EMBD,
gguf.MODEL_TENSOR.TOKEN_TYPES,
gguf.MODEL_TENSOR.SSM_CONV1D,
)
)
or not name.endswith(".weight")
@ -590,6 +593,15 @@ class Model:
if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
# ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
res = "smollm"
if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7":
# ref: https://huggingface.co/bigscience/bloom
res = "bloom"
if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
# ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
res = "gpt3-finnish"
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
res = "exaone"
if res is None:
logger.warning("\n")
@ -893,7 +905,7 @@ class GPTNeoXModel(Model):
return tensors
@Model.register("BloomForCausalLM")
@Model.register("BloomForCausalLM", "BloomModel")
class BloomModel(Model):
model_arch = gguf.MODEL_ARCH.BLOOM
@ -1560,7 +1572,7 @@ class LlamaModel(Model):
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
if rope_scaling.get("rope_type", '').lower() == "llama3":
base = self.hparams.get("rope_theta", 10000.0)
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
factor = rope_scaling.get("factor", 8.0)
@ -1583,6 +1595,7 @@ class LlamaModel(Model):
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
if not self.is_lora:
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
super().prepare_tensors()
@ -2130,6 +2143,7 @@ class Phi3MiniModel(Model):
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
if not self.is_lora:
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
@ -2702,7 +2716,7 @@ class StarCoder2Model(Model):
model_arch = gguf.MODEL_ARCH.STARCODER2
@Model.register("MambaForCausalLM", "MambaLMHeadModel")
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
class MambaModel(Model):
model_arch = gguf.MODEL_ARCH.MAMBA
@ -2733,7 +2747,10 @@ class MambaModel(Model):
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
use_dt_b_c_norm = False
# For falconmamba we do apply RMS norm on B / DT and C layers
if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
use_dt_b_c_norm = True
# Fail early for models which don't have a block expansion factor of 2
assert d_inner == 2 * d_model
@ -2741,12 +2758,13 @@ class MambaModel(Model):
self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
self.gguf_writer.add_block_count(self.hparams["n_layer"])
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_ssm_conv_kernel(d_conv)
self.gguf_writer.add_ssm_inner_size(d_inner)
self.gguf_writer.add_ssm_state_size(d_state)
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
self.gguf_writer.add_file_type(self.ftype)
_tok_embd = None
@ -2773,23 +2791,6 @@ class MambaModel(Model):
return [(new_name, data_torch)]
def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
if bid is not None and new_name in (
self.format_tensor_name(
n, bid, ".weight" if name.endswith(".weight") else ""
)
for n in [
gguf.MODEL_TENSOR.SSM_CONV1D,
gguf.MODEL_TENSOR.SSM_X,
gguf.MODEL_TENSOR.SSM_DT,
gguf.MODEL_TENSOR.SSM_A,
gguf.MODEL_TENSOR.SSM_D,
]
):
return gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
@Model.register("CohereForCausalLM")
class CommandR2Model(Model):
@ -3734,8 +3735,121 @@ class ChatGLMModel(Model):
name = name.removeprefix("transformer.")
return [(self.map_tensor_name(name), data_torch)]
###### CONVERSION LOGIC ######
@Model.register("NemotronForCausalLM")
class NemotronModel(Model):
model_arch = gguf.MODEL_ARCH.NEMOTRON
def set_vocab(self):
self._set_vocab_sentencepiece()
self.gguf_writer.add_pad_token_id(0)
self.gguf_writer.add_unk_token_id(1)
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
f_norm_eps = self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon", "norm_eps"])
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
# * Partial RoPE
rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"])
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
# * RopeScaling for Nemotron
if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
else:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side
# model.layers.{l}.input_layernorm.weight
# model.layers.{l}.post_attention_layernorm.weight
# model.norm.weight
if name.endswith("norm.weight"):
data_torch = data_torch + 1
return [(self.map_tensor_name(name), data_torch)]
@Model.register("ExaoneForCausalLM")
class ExaoneModel(Model):
model_arch = gguf.MODEL_ARCH.EXAONE
def set_gguf_parameters(self):
hparams = self.hparams
assert (hparams["activation_function"] == "silu")
max_position_embeddings = hparams["max_position_embeddings"]
embed_dim = hparams["hidden_size"]
num_heads = hparams["num_attention_heads"]
num_kv_heads = hparams.get("num_key_value_heads", num_heads)
layer_norm_eps = hparams["layer_norm_epsilon"]
intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
num_layers = hparams["num_layers"]
# ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
# attention_dropout_rate = hparams["attention_dropout"]
# ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
# embed_dropout_rate = hparams["embed_dropout"]
self.gguf_writer.add_embedding_length(embed_dim)
self.gguf_writer.add_head_count(num_heads)
self.gguf_writer.add_head_count_kv(num_kv_heads)
self.gguf_writer.add_context_length(max_position_embeddings)
self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
self.gguf_writer.add_feed_forward_length(intermediate_size)
self.gguf_writer.add_block_count(num_layers)
self.gguf_writer.add_file_type(self.ftype)
if (rope_theta := self.hparams.get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base(rope_theta)
rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
rotary_factor = rotary_factor if rotary_factor is not None else 1.0
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
if hparams.get("rope_scaling") is not None and "factor" in hparams["rope_scaling"]:
if hparams["rope_scaling"].get("type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
def prepare_tensors(self):
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
if rope_scaling.get("rope_type", '').lower() == "llama3":
base = self.hparams.get("rope_theta", 10000.0)
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
factor = rope_scaling.get("factor", 8.0)
low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
low_freq_wavelen = old_context_len / low_freq_factor
high_freq_wavelen = old_context_len / high_freq_factor
assert low_freq_wavelen != high_freq_wavelen
rope_factors = []
for freq in freqs:
wavelen = 2 * math.pi / freq
if wavelen < high_freq_wavelen:
rope_factors.append(1)
elif wavelen > low_freq_wavelen:
rope_factors.append(factor)
else:
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
if not self.is_lora:
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
super().prepare_tensors()
###### CONVERSION LOGIC ######
# tree of lazy tensors
class LazyTorchTensor(gguf.LazyBase):

View file

@ -94,6 +94,9 @@ models = [
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
]

View file

@ -116,7 +116,7 @@ class Tensor:
assert quant is not None, 'Unknown tensor type'
(blksize, tysize) = quant
offset += 12
self.dtype= dtype
self.dtype= gguf.GGMLQuantizationType(dtype)
self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
offset += 4 * n_dims
self.name = bytes(data[offset:offset + name_len])

View file

@ -386,6 +386,7 @@ if __name__ == '__main__':
dry_run=args.dry_run,
dir_lora_model=dir_lora,
lora_alpha=alpha,
is_lora=True,
)
logger.info("Exporting model...")

259
docs/backend/CANN.md Normal file
View file

@ -0,0 +1,259 @@
# llama.cpp for CANN
- [Background](#background)
- [News](#news)
- [OS](#os)
- [Hardware](#hardware)
- [Model Supports](#model-supports)
- [DataType Supports](#datatype-supports)
- [Docker](#docker)
- [Linux](#linux)
- [TODO](#todo)
## Background
**Ascend NPU** is a range of AI processors using Neural Processing Unit. It will efficiently handle matrix-matrix multiplication, dot-product and scalars.
**CANN** (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for AI scenarios, providing support for multiple AI frameworks on the top and serving AI processors and programming at the bottom. It plays a crucial role in bridging the gap between upper and lower layers, and is a key platform for improving the computing efficiency of Ascend AI processors. Meanwhile, it offers a highly efficient and easy-to-use programming interface for diverse application scenarios, allowing users to rapidly build AI applications and services based on the Ascend platform.
**Llama.cpp + CANN**
The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly.
## News
- 2024.8
- Support `Q4_0` and `Q8_0` data type for Ascend NPU.
- 2024.7
- Create CANN backend for Ascend NPU.
## OS
| OS | Status | Verified |
|:-------:|:-------:|:----------------------------------------------:|
| Linux | Support | Ubuntu 22.04, OpenEuler22.03 |
## Hardware
### Ascend NPU
**Verified devices**
| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
*Notes:*
- If you have trouble with Ascend NPU device, please create a issue with **[CANN]** prefix/tag.
- If you run successfully with your Ascend NPU device, please help update the upper table.
## Model Supports
| Model Name | FP16 | Q8_0 | Q4_0 |
|:----------------------------|:-----:|:----:|:----:|
| AquilaChat2-7B | √ | √ | √ |
| Baichuan-7b | √ | √ | √ |
| Baichuan2-7B-Chat | √ | √ | √ |
| bitnet_b1_58-large | √ | √ | √ |
| bloom-560m | √ | x | √ |
| bloomz-alpaca-560m | √ | x | √ |
| c4ai-command-r-35B-v01 | x | x | x |
| chatglm3-6B | x | x | x |
| chinese-alpaca-2-1.3b | √ | √ | √ |
| CodeShell-7B | √ | √ | √ |
| deepseek-ai_deepseek-coder-1.3B-base | x | x | x |
| deepseek-ai_DeepSeek-V2-Lite | x | x | x |
| deepseek-coder-6.7B-instruct | x | x | x |
| DeepSeek-V2-Lite-64x1.5B | x | x | x |
| falcon-7b-instruct | √ | √ | √ |
| flan-t5-large | √ | √ | √ |
| gemma-2-9b-it | √ | √ | √ |
| glm-4-9B | x | x | x |
| gpt2 | √ | √ | √ |
| Gpt2-163M | √ | √ | √ |
| granite-3B-code-instruct | √ | √ | √ |
| GritLM-7B | √ | √ | √ |
| internlm2_5-7b-chat | √ | √ | √ |
| koala-7B-HF | √ | √ | √ |
| Llama-2-7b-chat-hf | √ | √ | √ |
| Llama-3-Smaug-8B | √ | √ | √ |
| Llama2-Chinese-7b-Chat | √ | √ | √ |
| Llama3-8B | √ | √ | √ |
| Llama3-8b-chinese | √ | √ | √ |
| mamba-130m-hf | √ | √ | √ |
| Mistral-7B-Instruct-v0.2 | √ | √ | √ |
| Mixtral-8x7B-Instruct-v0.1 | x | √ | √ |
| mpt-7B | √ | √ | √ |
| OLMo-1B-hf | √ | √ | √ |
| OpenELM-3B-Instruct | √ | √ | √ |
| Orion-14b-base | √ | √ | √ |
| phi1 | x | x | x |
| phi2 | x | x | x |
| Phi-3-mini-4k-instruct | √ | √ | √ |
| plamo-13b | √ | √ | √ |
| pythia-70M | x | x | x |
| Qwen-7B | √ | √ | √ |
| Qwen2-1.5B-Instruct | √ | x | √ |
| Refact-1_6B-fim | √ | √ | √ |
| SmolLM-135M | √ | √ | √ |
| stablelm-zephyr | x | x | x |
| stablelm-2-zephyr-1_6b | x | x | x |
| starcoderbase-1b | √ | √ | √ |
| starcoder2-3b | √ | √ | √ |
| vigogne-7b-chat | √ | √ | √ |
| xverse-7b-chat | √ | √ | √ |
| Yi-6b-Chat | √ | √ | √ |
## DataType Supports
| DataType | Status |
|:----------------------:|:-------:|
| FP16 | Support |
| Q8_0 | Support |
| Q4_0 | Support |
## Docker
### Build Images
You can get a image with llama.cpp in one command.
```sh
docker build -t llama-cpp-cann -f .devops/llama-cli-cann.Dockerfile .
```
### Run container
```sh
# Find all cards.
npu-smi info
# Select the cards that you want to use, make sure these cards are not used by someone.
# Following using cards of device0.
docker run --name llamacpp --device /dev/davinci0 --device /dev/davinci_manager --device /dev/devmm_svm --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info -v /PATH_TO_YOUR_MODELS/:/app/models -it llama-cpp-cann -m /app/models/MODEL_PATH -ngl 32 -p "Building a website can be done in 10 simple steps:"
```
*Notes:*
- You may need to install Ascend Driver and firmware on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
## Linux
### I. Setup Environment
1. **Install Ascend Driver and firmware**
```sh
# create driver running user.
sudo groupadd -g HwHiAiUser
sudo useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
sudo usermod -aG HwHiAiUser $USER
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
# and install driver.
sudo sh Ascend-hdk-910b-npu-driver_x.x.x_linux-{arch}.run --full --install-for-all
```
Once installed, run `npu-smi info` to check whether driver is installed successfully.
```sh
+-------------------------------------------------------------------------------------------+
| npu-smi 24.1.rc2 Version: 24.1.rc2 |
+----------------------+---------------+----------------------------------------------------+
| NPU Name | Health | Power(W) Temp(C) Hugepages-Usage(page)|
| Chip | Bus-Id | AICore(%) Memory-Usage(MB) HBM-Usage(MB) |
+======================+===============+====================================================+
| 2 xxx | OK | 64.4 51 15 / 15 |
| 0 | 0000:01:00.0 | 0 1873 / 15077 0 / 32768 |
+======================+===============+====================================================+
| 5 xxx | OK | 64.0 52 15 / 15 |
| 0 | 0000:81:00.0 | 0 1874 / 15077 0 / 32768 |
+======================+===============+====================================================+
| No running processes found in NPU 2 |
+======================+===============+====================================================+
| No running processes found in NPU 5 |
+======================+===============+====================================================+
```
2. **Install Ascend Firmware**
```sh
# download driver from https://www.hiascend.com/hardware/firmware-drivers/community according to your system
# and install driver.
sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
```
If the following messaage appers, firmware is installed successfully.
```sh
Firmware package installed successfully!
```
3. **Install CANN toolkit and kernels**
CANN toolkit and kernels can be obtained from the official [CANN Toolkit](https://www.hiascend.com/zh/developer/download/community/result?module=cann) page.
Please download the corresponding version that satified your system. The minimum version required is 8.0.RC2.alpha002 and here is the install command.
```sh
pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
sh Ascend-cann-toolkit_8.0.RC2.alpha002_linux-aarch64.run --install
sh Ascend-cann-kernels-910b_8.0.RC2.alpha002_linux.run --install
```
Set Ascend Variables:
```sh
echo "source ~/Ascend/ascend-toolkit/set_env.sh" >> ~/.bashrc
source ~/.bashrc
```
Upon a successful installation, CANN is enabled for the available ascend devices.
### II. Build llama.cpp
```sh
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
cmake --build build --config release
```
### III. Run the inference
1. **Retrieve and prepare model**
You can refer to the general [*Prepare and Quantize*](../../README.md#prepare-and-quantize) guide for model prepration.
**Notes**:
- CANN backend only supports FP16/Q4_0/Q8_0 models currently.
2. **Launch inference**
There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically choose the devices with the same backend.
| Device selection | Parameter |
|:----------------:|:--------------------------------------:|
| Single device | --split-mode none --main-gpu DEVICE_ID |
| Multiple devices | --split-mode layer (default) |
Examples:
- Use device 0:
```sh
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
```
- Use multiple devices:
```sh
./build/bin/llama-cli -m path_to_model -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
```
### **GitHub contribution**:
Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay.
## TODO
- Support more models and data types.

View file

@ -20,7 +20,7 @@
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
@ -28,10 +28,6 @@
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
## Recommended Release
The SYCL backend would be broken by some PRs due to no online CI.
@ -45,6 +41,10 @@ The following release is verified with good quality:
## News
- 2024.8
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
- 2024.5
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
- Arch Linux is verified successfully.
@ -196,7 +196,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li
Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.
- **Adding support to Nvidia GPUs**
@ -255,8 +255,6 @@ or
# Export relevant ENV variables
source /opt/intel/oneapi/setvars.sh
# Build LLAMA with MKL BLAS acceleration for intel GPU
# Option 1: Use FP32 (recommended for better performance in most cases)
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
@ -338,12 +336,12 @@ Choose one of following methods to run.
- Use device 0:
```sh
./examples/sycl/run_llama2.sh 0
./examples/sycl/run-llama2.sh 0
```
- Use multiple devices:
```sh
./examples/sycl/run_llama2.sh
./examples/sycl/run-llama2.sh
```
2. Command line

View file

@ -352,6 +352,31 @@ cmake --build build --config Release
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
```
### CANN
This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU.
For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/).
Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
Go to `llama.cpp` directory and build using CMake.
```bash
cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
cmake --build build --config release
```
You can test with:
`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
```bash
llm_load_tensors: CANN buffer size = 13313.00 MiB
llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
```
For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
### Android
To read documentation for how to build on Android, [click here](./android.md)

View file

@ -66,8 +66,8 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment
The defaults are:
- `CUDA_VERSION` set to `11.7.1`
- `CUDA_DOCKER_ARCH` set to `all`
- `CUDA_VERSION` set to `12.6.0`
- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures
The resulting images, are essentially the same as the non-CUDA images:

View file

@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
#endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);

View file

@ -21,7 +21,7 @@
#endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
struct benchmark_params_struct {
int32_t n_threads = 1;
int n_threads = 1;
int32_t n_iterations = 10;
};

View file

@ -271,7 +271,7 @@ struct tokenized_prompt {
size_t max_seq_len;
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
@ -486,7 +486,7 @@ int main(int argc, char ** argv) {
if (use_pca) {
// run PCA
PCA::pca_params pca_params;
pca_params.n_threads = params.n_threads;
pca_params.n_threads = params.cpuparams.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);

View file

@ -127,7 +127,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
}
static bool run(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

View file

@ -410,7 +410,7 @@ int main(int argc, char ** argv) {
g_verbose = (params.verbosity == 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());

View file

@ -433,8 +433,8 @@ static void process_logits(
}
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
const int n_ctx = llama_n_ctx(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();

View file

@ -203,8 +203,8 @@ int main(int argc, char ** argv) {
LOG_TEE("\n");
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
}
const bool add_bos = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1);
const bool add_bos = llama_add_bos_token(model);
GGML_ASSERT(!llama_add_eos_token(model));
LOG("add_bos: %d\n", add_bos);
std::vector<llama_token> embd_inp;

View file

@ -16,6 +16,7 @@
#include <sstream>
#include <string>
#include <vector>
#include <thread>
#include "ggml.h"
#include "llama.h"
@ -225,6 +226,9 @@ struct cmd_params {
std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
std::vector<std::string> cpu_mask;
std::vector<bool> cpu_strict;
std::vector<int> poll;
std::vector<int> n_gpu_layers;
std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode;
@ -236,6 +240,8 @@ struct cmd_params {
std::vector<bool> embeddings;
ggml_numa_strategy numa;
int reps;
ggml_sched_priority prio;
int delay;
bool verbose;
output_formats output_format;
output_formats output_format_stderr;
@ -251,6 +257,9 @@ static const cmd_params cmd_params_defaults = {
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()},
/* cpu_mask */ {"0x0"},
/* cpu_strict */ {false},
/* poll */ {50},
/* n_gpu_layers */ {99},
/* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
@ -262,6 +271,8 @@ static const cmd_params cmd_params_defaults = {
/* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
/* prio */ GGML_SCHED_PRIO_NORMAL,
/* delay */ 0,
/* verbose */ false,
/* output_format */ MARKDOWN,
/* output_format_stderr */ NONE,
@ -281,6 +292,9 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
@ -292,6 +306,8 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
@ -338,6 +354,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps;
params.numa = cmd_params_defaults.numa;
params.prio = cmd_params_defaults.prio;
params.delay = cmd_params_defaults.delay;
for (int i = 1; i < argc; i++) {
arg = argv[i];
@ -433,6 +451,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = string_split<int>(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-C" || arg == "--cpu-mask") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
} else if (arg == "--cpu-strict") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
} else if (arg == "--poll") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.poll.insert(params.poll.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
@ -541,6 +580,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
break;
}
params.reps = std::stoi(argv[i]);
} else if (arg == "--prio") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
} else if (arg == "--delay") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.delay = std::stoi(argv[i]);
} else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
invalid_param = true;
@ -585,6 +636,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
return params;
}
@ -598,6 +652,9 @@ struct cmd_params_instance {
ggml_type type_k;
ggml_type type_v;
int n_threads;
std::string cpu_mask;
bool cpu_strict;
int poll;
int n_gpu_layers;
std::string rpc_servers;
llama_split_mode split_mode;
@ -667,7 +724,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & tv : params.type_v)
for (const auto & nkvo : params.no_kv_offload)
for (const auto & fa : params.flash_attn)
for (const auto & nt : params.n_threads) {
for (const auto & nt : params.n_threads)
for (const auto & cm : params.cpu_mask)
for (const auto & cs : params.cpu_strict)
for (const auto & pl : params.poll) {
for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) {
continue;
@ -681,6 +741,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
@ -707,6 +770,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
@ -733,6 +799,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
@ -769,6 +838,9 @@ struct test {
int n_batch;
int n_ubatch;
int n_threads;
std::string cpu_mask;
bool cpu_strict;
int poll;
bool has_rpc;
ggml_type type_k;
ggml_type type_v;
@ -795,6 +867,9 @@ struct test {
n_batch = inst.n_batch;
n_ubatch = inst.n_ubatch;
n_threads = inst.n_threads;
cpu_mask = inst.cpu_mask;
cpu_strict = inst.cpu_strict;
poll = inst.poll;
has_rpc = !inst.rpc_servers.empty();
type_k = inst.type_k;
type_v = inst.type_v;
@ -872,13 +947,14 @@ struct test {
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch",
"n_threads", "type_k", "type_v",
"n_threads", "cpu_mask", "cpu_strict", "poll",
"type_k", "type_v",
"n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn",
"tensor_split", "use_mmap", "embeddings",
"n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns",
"avg_ts", "stddev_ts"
"avg_ts", "stddev_ts",
};
return fields;
}
@ -887,7 +963,7 @@ struct test {
static field_type get_field_type(const std::string & field) {
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
field == "n_threads" ||
field == "n_threads" || field == "poll" ||
field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" ||
@ -896,6 +972,7 @@ struct test {
}
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "cpu_strict" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL;
}
@ -928,7 +1005,8 @@ struct test {
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch),
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@ -1067,7 +1145,7 @@ struct markdown_printer : public printer {
return -30;
}
if (field == "t/s") {
return 16;
return 20;
}
if (field == "size" || field == "params") {
return 10;
@ -1149,6 +1227,15 @@ struct markdown_printer : public printer {
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
fields.emplace_back("n_threads");
}
if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
fields.emplace_back("cpu_mask");
}
if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
fields.emplace_back("cpu_strict");
}
if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
fields.emplace_back("poll");
}
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.emplace_back("n_batch");
}
@ -1383,6 +1470,8 @@ int main(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);
set_process_priority(params.prio);
// initialize printer
std::unique_ptr<printer> p = create_printer(params.output_format);
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
@ -1428,6 +1517,28 @@ int main(int argc, char ** argv) {
llama_kv_cache_clear(ctx);
// cool off before the test
if (params.delay) {
std::this_thread::sleep_for(std::chrono::seconds(params.delay));
}
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
exit(1);
}
tpp.strict_cpu = t.cpu_strict;
tpp.poll = t.poll;
tpp.prio = params.prio;
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
}
llama_attach_threadpool(ctx, threadpool, NULL);
// warmup run
if (t.n_prompt > 0) {
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
@ -1466,6 +1577,8 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);
llama_free(ctx);
ggml_threadpool_free(threadpool);
}
llama_free_model(lmodel);

View file

@ -71,8 +71,8 @@ actor LlamaContext {
var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 2048
ctx_params.n_threads = UInt32(n_threads)
ctx_params.n_threads_batch = UInt32(n_threads)
ctx_params.n_threads = Int32(n_threads)
ctx_params.n_threads_batch = Int32(n_threads)
let context = llama_new_context_with_model(model, ctx_params)
guard let context else {

View file

@ -15,9 +15,9 @@ cd llama.cpp
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
```bash
python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
python ./convert-hf-to-gguf.py ../MiniCPM-Llama3-V-2_5/model
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
# quantize int4 version
./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M

View file

@ -0,0 +1,107 @@
## MiniCPM-V 2.6
### Prepare models and code
Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder.
Clone llama.cpp:
```bash
git clone git@github.com:OpenBMB/llama.cpp.git
cd llama.cpp
git checkout minicpmv-main
```
### Usage of MiniCPM-V 2.6
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
# quantize int4 version
./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build for Linux or Mac
```bash
make
make llama-minicpmv-cli
```
Inference on Linux or Mac
```
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
### Video
Install FFmpeg
```
brew install ffmpeg
brew install pkg-config
```
### Android
#### Build on Android device using Termux
We found that build on Android device would bring better runtime performance, so we recommend to build on device.
[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
Install tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```
Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
```
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```
Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```

View file

@ -20,6 +20,10 @@
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@ -81,6 +85,7 @@ static std::string format(const char * fmt, ...) {
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length"
@ -211,13 +216,19 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return; // Avoid infinite loop if 'search' is an empty string
return;
}
std::string builder;
builder.reserve(s.length());
size_t pos = 0;
while ((pos = s.find(search, pos)) != std::string::npos) {
s.replace(pos, search.length(), replace);
pos += replace.length();
size_t last_pos = 0;
while ((pos = s.find(search, last_pos)) != std::string::npos) {
builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
}
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
}
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
@ -526,6 +537,7 @@ struct clip_ctx {
bool has_vision_encoder = false;
bool has_llava_projector = false;
bool has_minicpmv_projector = false;
int minicpmv_version = 2;
struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP;
@ -641,7 +653,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
if (ctx->has_minicpmv_projector) {
int pos_w = image_size_width/patch_size;
int pos_h = image_size_height/patch_size;
if (ctx->minicpmv_version == 2) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
}
else if (ctx->minicpmv_version == 3) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
ggml_set_name(pos_embed, "pos_embed");
ggml_set_input(pos_embed);
}
@ -768,8 +785,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
}
else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@ -949,10 +966,20 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}
{ // attention
const int hidden_size = 4096;
int hidden_size = 4096;
const int d_head = 128;
const int n_head = hidden_size/d_head;
const int num_query = 96;
int n_head = hidden_size/d_head;
int num_query = 96;
if (ctx->minicpmv_version == 2) {
hidden_size = 4096;
n_head = hidden_size/d_head;
num_query = 96;
}
else if (ctx->minicpmv_version == 3) {
hidden_size = 3584;
n_head = hidden_size/d_head;
num_query = 64;
}
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
@ -1091,7 +1118,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}
clip_ctx * new_clip = new clip_ctx;
clip_ctx * new_clip = new clip_ctx{};
// update projector type
{
@ -1125,6 +1152,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
#endif
#ifdef GGML_USE_VULKAN
new_clip->backend = ggml_backend_vk_init(0);
LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
#endif
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();
@ -1149,6 +1180,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
}
idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
if (idx != -1) {
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
}
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
GGML_ASSERT(new_clip->has_vision_encoder);
@ -1587,7 +1623,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
}
}
inline float clip(float x, float lower, float upper) {
inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}
@ -1791,10 +1827,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
return refine_size;
}
inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
std::vector<int> candidate_split_grids_nums;
for (int i : {multiple - 1, multiple, multiple + 1}) {
@ -1910,10 +1942,12 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
if (clip_is_minicpmv(ctx)) {
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
if(clip_is_minicpmv(ctx)){
int max_slice_nums = 9;
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
res_imgs->size = 0;
for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t i = 0; i < imgs.size(); ++i){
res_imgs->size += imgs[i].size();
}
res_imgs->data = new clip_image_f32[res_imgs->size];
@ -2146,8 +2180,13 @@ int clip_n_patches(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
n_patches /= 4;
} else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
if (ctx->minicpmv_version == 2) {
n_patches = 96;
}
else if (ctx->minicpmv_version == 3) {
n_patches = 64;
}
}
return n_patches;
}
@ -2282,6 +2321,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
if(ctx->load_image_size==nullptr){
ctx->load_image_size= clip_image_size_init();
}
const int pos_w = ctx->load_image_size->width/patch_size;
const int pos_h = ctx->load_image_size->height/patch_size;
{
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@ -2316,8 +2360,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions));
for (int i = 0; i < num_positions; i++) {
positions_data[i] = std::floor(70.0*i/num_positions);
int bucket_coords_h[70];
int bucket_coords_w[70];
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
}
for (int i = 0; i < pos_w; i++){
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
}
for (int i = 0, id = 0; i < pos_h; i++){
for (int j = 0; j < pos_w; j++){
positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
}
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
@ -2328,12 +2382,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
if(ctx->load_image_size==nullptr){
ctx->load_image_size= clip_image_size_init();
}
int pos_w = ctx->load_image_size->width/patch_size;
int pos_h = ctx->load_image_size->height/patch_size;
int embed_dim = 4096;
if (ctx->minicpmv_version == 2) {
embed_dim = 4096;
}
else if (ctx->minicpmv_version == 3) {
embed_dim = 3584;
}
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
@ -2346,7 +2401,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
free(pos_embed_data);
}
} else {
}
else{
{
if (ctx->has_class_embedding) {
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
@ -2548,13 +2604,21 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->vision_model.mm_3_b->ne[0];
}
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
if (ctx->minicpmv_version == 2) {
return 4096;
}
else if (ctx->minicpmv_version == 3) {
return 3584;
}
}
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}
bool clip_is_minicpmv(const struct clip_ctx * ctx) {
return ctx->has_minicpmv_projector;
int clip_is_minicpmv(const struct clip_ctx * ctx) {
if (ctx->has_minicpmv_projector) {
return ctx->minicpmv_version;
}
return 0;
}

View file

@ -85,7 +85,7 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
CLIP_API bool clip_is_minicpmv(const struct clip_ctx * ctx);
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
#ifdef __cplusplus
}

View file

@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
if (!params->image.empty()) {
LOG_TEE("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
LOG_TEE("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;

View file

@ -256,7 +256,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
if (!encoded) {
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;

View file

@ -134,7 +134,13 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
std::string system_prompt;
int idx = 0;
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (has_minicpmv_projector == 2) {
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
}
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
@ -174,7 +180,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
return NULL;
@ -210,10 +216,24 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri
static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
std::string user_prompt = prompt;
if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) {
if (has_minicpmv_projector == 2) {
user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
}
else if (has_minicpmv_projector == 3) {
user_prompt = "<|im_start|>user\n" + prompt;
}
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
if (has_minicpmv_projector == 2) {
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 3) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
// generate the response
LOG_TEE("\n");

View file

@ -1,9 +1,416 @@
import argparse
# coding=utf-8
# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Siglip model. """
# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes
import os
import math
import warnings
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn.init import _calculate_fan_in_and_fan_out
from transformers.activations import ACT2FN
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import (
logging,
)
from transformers.utils import logging
logger = logging.get_logger(__name__)
class SiglipVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
[google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
Number of channels in the input images.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 16):
The size (resolution) of each patch.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
Example:
```python
>>> from transformers import SiglipVisionConfig, SiglipVisionModel
>>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
>>> configuration = SiglipVisionConfig()
>>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
>>> model = SiglipVisionModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "siglip_vision_model"
def __init__(
self,
hidden_size=768,
intermediate_size=3072,
num_hidden_layers=12,
num_attention_heads=12,
num_channels=3,
image_size=224,
patch_size=16,
hidden_act="gelu_pytorch_tanh",
layer_norm_eps=1e-6,
attention_dropout=0.0,
**kwargs,
):
super().__init__(**kwargs)
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.attention_dropout = attention_dropout
self.layer_norm_eps = layer_norm_eps
self.hidden_act = hidden_act
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/siglip-base-patch16-224",
# See all SigLIP models at https://huggingface.co/models?filter=siglip
]
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
def _trunc_normal_(tensor, mean, std, a, b):
# Cut & paste from PyTorch official master until it's in a few official releases - RW
# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
def norm_cdf(x):
# Computes standard normal cumulative distribution function
return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
if (mean < a - 2 * std) or (mean > b + 2 * std):
warnings.warn(
"mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
"The distribution of values may be incorrect.",
stacklevel=2,
)
# Values are generated by using a truncated uniform distribution and
# then using the inverse CDF for the normal distribution.
# Get upper and lower cdf values
l = norm_cdf((a - mean) / std)
u = norm_cdf((b - mean) / std)
# Uniformly fill tensor with values from [l, u], then translate to
# [2l-1, 2u-1].
tensor.uniform_(2 * l - 1, 2 * u - 1)
# Use inverse cdf transform for normal distribution to get truncated
# standard normal
if tensor.dtype in [torch.float16, torch.bfloat16]:
# The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
og_dtype = tensor.dtype
tensor = tensor.to(torch.float32)
tensor.erfinv_()
tensor = tensor.to(og_dtype)
else:
tensor.erfinv_()
# Transform to proper mean, std
tensor.mul_(std * math.sqrt(2.0))
tensor.add_(mean)
# Clamp to ensure it's in the proper range
if tensor.dtype == torch.float16:
# The `clamp_` op is not (yet?) defined in float16+cpu
tensor = tensor.to(torch.float32)
tensor.clamp_(min=a, max=b)
tensor = tensor.to(torch.float16)
else:
tensor.clamp_(min=a, max=b)
def trunc_normal_tf_(
tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
):
"""Fills the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the
normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
with values outside :math:`[a, b]` redrawn until they are within
the bounds. The method used for generating the random values works
best when :math:`a \\leq \text{mean} \\leq b`.
NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
and the result is subsquently scaled and shifted by the mean and std args.
Args:
tensor: an n-dimensional `torch.Tensor`
mean: the mean of the normal distribution
std: the standard deviation of the normal distribution
a: the minimum cutoff value
b: the maximum cutoff value
"""
with torch.no_grad():
_trunc_normal_(tensor, 0, 1.0, a, b)
tensor.mul_(std).add_(mean)
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
denom = fan_in
if mode == "fan_in":
denom = fan_in
elif mode == "fan_out":
denom = fan_out
elif mode == "fan_avg":
denom = (fan_in + fan_out) / 2
variance = scale / denom
if distribution == "truncated_normal":
# constant is stddev of standard normal truncated to (-2, 2)
trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
elif distribution == "normal":
with torch.no_grad():
tensor.normal_(std=math.sqrt(variance))
elif distribution == "uniform":
bound = math.sqrt(3 * variance)
with torch.no_grad():
tensor.uniform_(-bound, bound)
else:
raise ValueError(f"invalid distribution {distribution}")
def lecun_normal_(tensor):
variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
def default_flax_embed_init(tensor):
variance_scaling_(tensor, mode="fan_in", distribution="normal")
class SiglipVisionEmbeddings(nn.Module):
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.image_size = config.image_size
self.patch_size = config.patch_size
self.patch_embedding = nn.Conv2d(
in_channels=config.num_channels,
out_channels=self.embed_dim,
kernel_size=self.patch_size,
stride=self.patch_size,
padding="valid",
)
self.num_patches_per_side = self.image_size // self.patch_size
self.num_patches = self.num_patches_per_side**2
self.num_positions = self.num_patches
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
class SiglipAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
# Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
def __init__(self, config):
super().__init__()
self.config = config
self.embed_dim = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.embed_dim // self.num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
f" {self.num_heads})."
)
self.scale = self.head_dim**-0.5
self.dropout = config.attention_dropout
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.activation_fn = ACT2FN[config.hidden_act]
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.embed_dim = config.hidden_size
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self.self_attn = (
SiglipAttention(config)
)
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
self.mlp = SiglipMLP(config)
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
class SiglipPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = SiglipVisionConfig
base_model_prefix = "siglip"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, SiglipVisionEmbeddings):
width = self.config.hidden_size
nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
elif isinstance(module, nn.Embedding):
default_flax_embed_init(module.weight)
elif isinstance(module, SiglipAttention):
nn.init.normal_(module.q_proj.weight)
nn.init.normal_(module.k_proj.weight)
nn.init.normal_(module.v_proj.weight)
nn.init.normal_(module.out_proj.weight)
nn.init.zeros_(module.q_proj.bias)
nn.init.zeros_(module.k_proj.bias)
nn.init.zeros_(module.v_proj.bias)
nn.init.zeros_(module.out_proj.bias)
elif isinstance(module, SiglipMLP):
nn.init.normal_(module.fc1.weight)
nn.init.normal_(module.fc2.weight)
nn.init.normal_(module.fc1.bias, std=1e-6)
nn.init.normal_(module.fc2.bias, std=1e-6)
elif isinstance(module, (nn.Linear, nn.Conv2d)):
lecun_normal_(module.weight)
if module.bias is not None:
nn.init.zeros_(module.bias)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
SIGLIP_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SIGLIP_VISION_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
"""
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`SiglipEncoderLayer`].
Args:
config: SiglipConfig
"""
def __init__(self, config: SiglipVisionConfig):
super().__init__()
self.config = config
self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
class SiglipVisionTransformer(SiglipPreTrainedModel):
config_class = SiglipVisionConfig
main_input_name = "pixel_values"
_supports_flash_attn_2 = True
def __init__(self, config: SiglipVisionConfig):
super().__init__(config)
self.config = config
embed_dim = config.hidden_size
self.embeddings = SiglipVisionEmbeddings(config)
self.encoder = SiglipEncoder(config)
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self) -> nn.Module:
return self.embeddings.patch_embedding
import argparse
import json
import re
import torch
import numpy as np
from gguf import *
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
@ -94,6 +501,7 @@ default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2)
# with proper
args = ap.parse_args()
@ -135,6 +543,15 @@ if args.use_f32:
# model = CLIPModel.from_pretrained(dir_model)
# processor = CLIPProcessor.from_pretrained(dir_model)
minicpmv_version = args.minicpmv_version
emb_dim = 4096
if minicpmv_version == 1:
emb_dim = 2304
elif minicpmv_version == 2:
emb_dim = 4096
elif minicpmv_version == 3:
emb_dim = 3584
default_vision_config = {
"hidden_size": 1152,
"image_size": 980,
@ -144,8 +561,12 @@ default_vision_config = {
"num_hidden_layers": 27,
"patch_size": 14,
}
vision_config = Idefics2VisionConfig(**default_vision_config)
model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3:
vision_config = SiglipVisionConfig(**default_vision_config)
model = SiglipVisionTransformer(vision_config)
processor = None
# if model.attn_pool is not None:
@ -158,6 +579,7 @@ fname_middle = None
has_text_encoder = True
has_vision_encoder = True
has_minicpmv_projector = False
if args.text_only:
fname_middle = "text-"
has_vision_encoder = False
@ -165,6 +587,7 @@ elif args.minicpmv_projector is not None:
fname_middle = "mmproj-"
has_text_encoder = False
has_minicpmv_projector = True
minicpmv_version = 3
elif args.vision_only:
fname_middle = "vision-"
has_text_encoder = False
@ -189,6 +612,7 @@ elif has_minicpmv_projector:
fout.add_description("image encoder for MiniCPM-V")
# add projector type
fout.add_string("clip.projector_type", "resampler")
fout.add_int32("clip.minicpmv_version", minicpmv_version)
else:
fout.add_description("two-tower CLIP model")
@ -274,11 +698,11 @@ def _replace_name_resampler(s, v):
if re.match("resampler.pos_embed", s):
return {
s: v,
re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))),
re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
}
if re.match("resampler.proj", s):
return {
re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))),
re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(emb_dim, (70, 70))),
re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
}
if re.match("resampler.attn.in_proj_.*", s):

View file

@ -4,7 +4,7 @@ import torch
from transformers import AutoModel, AutoTokenizer
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", help="Path to MiniCPM-V-2.5 model")
ap.add_argument("-m", "--model", help="Path to MiniCPM-V model")
args = ap.parse_args()
# find the model part that includes the the multimodal projector weights
@ -29,7 +29,6 @@ if len(clip_tensors) > 0:
f.write("{}\n")
config = model.llm.config
config._name_or_path = "openbmb/MiniCPM-Llama3-V-2.5"
config.auto_map = {
"AutoConfig": "configuration_minicpm.MiniCPMConfig",
"AutoModel": "modeling_minicpm.MiniCPMModel",
@ -40,7 +39,6 @@ config.auto_map = {
model.llm.save_pretrained(f"{args.model}/model")
tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
tok.save_pretrained(f"{args.model}/model")
# os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM_l3/modeling_minicpm.py")
print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")

View file

@ -221,6 +221,40 @@ int main(int argc, char ** argv) {
return 1;
}
LOG("%s: llama threadpool init = n_threads = %d\n",
__func__,
(int) params.cpuparams.n_threads
);
struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp =
ggml_threadpool_params_from_cpu_params(params.cpuparams);
set_process_priority(params.cpuparams.priority);
struct ggml_threadpool * threadpool_batch = NULL;
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
threadpool_batch = ggml_threadpool_new(&tpp_batch);
if (!threadpool_batch) {
LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
exit(1);
}
// Start the non-batch threadpool in the paused state
tpp.paused = true;
}
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
}
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
if (ctx_guidance) {
llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch);
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
@ -267,9 +301,9 @@ int main(int argc, char ** argv) {
}
}
const bool add_bos = llama_should_add_bos_token(model);
const bool add_bos = llama_add_bos_token(model);
if (!llama_model_has_encoder(model)) {
GGML_ASSERT(llama_add_eos_token(model) != 1);
GGML_ASSERT(!llama_add_eos_token(model));
}
LOG("add_bos: %d\n", add_bos);
@ -989,6 +1023,9 @@ int main(int argc, char ** argv) {
llama_sampling_free(ctx_sampling);
llama_backend_free();
ggml_threadpool_free(threadpool);
ggml_threadpool_free(threadpool_batch);
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS

View file

@ -340,8 +340,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@ -480,8 +480,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
@ -1733,8 +1733,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
const int n_batch = params.n_batch;
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);

View file

@ -34,7 +34,7 @@ Run the quantized model:
```bash
# start inference on a gguf model
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
./llama-cli -m ./models/mymodel/ggml-model-Q4_K_M.gguf -cnv -p "You are a helpful assistant"
```
When running the larger models, make sure you have enough disk space to store all the intermediate files.

View file

@ -104,7 +104,7 @@ static void usage(const char * executable) {
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
printf(" --keep-split: will generate quatized model in the same shards as input");
printf(" --keep-split: will generate quantized model in the same shards as input\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n");

View file

@ -253,6 +253,8 @@ int main(int argc, char ** argv) {
chunks[i].tokens.clear();
}
struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
// start loop, receive query and return top k similar chunks based on cosine similarity
std::string query;
while (true) {
@ -260,7 +262,6 @@ int main(int argc, char ** argv) {
std::getline(std::cin, query);
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
batch_add_seq(query_batch, query_tokens, 0);
std::vector<float> query_emb(n_embd, 0);
@ -293,6 +294,7 @@ int main(int argc, char ** argv) {
}
// clean up
llama_batch_free(query_batch);
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);

View file

@ -247,6 +247,51 @@ logging:
--log-append Don't truncate the old log file.
```
Available environment variables (if specified, these variables will override parameters specified in arguments):
- `LLAMA_CACHE`: cache directory, used by `--hf-repo`
- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo`
- `LLAMA_ARG_MODEL`: equivalent to `-m`
- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu`
- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a`
- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo`
- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file`
- `LLAMA_ARG_THREADS`: equivalent to `-t`
- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c`
- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np`
- `LLAMA_ARG_BATCH`: equivalent to `-b`
- `LLAMA_ARG_UBATCH`: equivalent to `-ub`
- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl`
- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http`
- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template`
- `LLAMA_ARG_N_PREDICT`: equivalent to `-n`
- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`)
- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default.
- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`)
- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`)
- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default.
- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt`
- `LLAMA_ARG_HOST`: equivalent to `--host`
- `LLAMA_ARG_PORT`: equivalent to `--port`
Example usage of docker compose with environment variables:
```yml
services:
llamacpp-server:
image: ghcr.io/ggerganov/llama.cpp:server
ports:
- 8080:8080
volumes:
- ./models:/models
environment:
# alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model
LLAMA_ARG_MODEL: /models/my_model.gguf
LLAMA_ARG_CTX_SIZE: 4096
LLAMA_ARG_N_PARALLEL: 2
LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0
LLAMA_ARG_PORT: 8080
```
## Build
@ -368,15 +413,16 @@ node index.js
## API Endpoints
### GET `/health`: Returns the current state of the server
### GET `/health`: Returns heath check result
- 503 -> `{"status": "loading model"}` if the model is still being loaded.
- 500 -> `{"status": "error"}` if the model failed to load.
- 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
- 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
- 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available.
**Response format**
If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.
- HTTP status code 503
- Body: `{"error": {"code": 503, "message": "Loading model", "type": "unavailable_error"}}`
- Explanation: the model is still being loaded.
- HTTP status code 200
- Body: `{"status": "ok" }`
- Explanation: the model is successfully loaded and the server is ready.
### POST `/completion`: Given a `prompt`, it returns the predicted completion.
@ -639,10 +685,16 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
}'
```
### GET `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
### GET `/slots`: Returns the current slots processing state
This endpoint can be disabled with `--no-slots`
If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there is no available slots.
**Response format**
Example:
```json
[
{
@ -702,7 +754,13 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
]
```
### GET `/metrics`: Prometheus compatible metrics exporter endpoint if `--metrics` is enabled:
Possible values for `slot[i].state` are:
- `0`: SLOT_STATE_IDLE
- `1`: SLOT_STATE_PROCESSING
### GET `/metrics`: Prometheus compatible metrics exporter
This endpoint is only accessible if `--metrics` is set.
Available metrics:
- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.
@ -767,6 +825,10 @@ Available metrics:
### GET `/lora-adapters`: Get list of all LoRA adapters
This endpoint returns the loaded LoRA adapters. You can add adapters using `--lora` when starting the server, for example: `--lora my_adapter_1.gguf --lora my_adapter_2.gguf ...`
By default, all adapters will be loaded with scale set to 1. To initialize all adapters scale to 0, add `--lora-init-without-apply`
If an adapter is disabled, the scale will be set to 0.
**Response format**

File diff suppressed because one or more lines are too long

View file

@ -15,6 +15,8 @@
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
// mime type for sending response
#define MIMETYPE_JSON "application/json; charset=utf-8"
// auto generated files (update with ./deps.sh)
#include "colorthemes.css.hpp"
@ -67,7 +69,6 @@ enum slot_command {
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
SERVER_STATE_ERROR // An error occurred, load_model failed
};
enum server_task_type {
@ -693,8 +694,8 @@ struct server_context {
n_ctx = llama_n_ctx(ctx);
add_bos_token = llama_should_add_bos_token(model);
has_eos_token = llama_add_eos_token(model) != 1;
add_bos_token = llama_add_bos_token(model);
has_eos_token = !llama_add_eos_token(model);
return true;
}
@ -1322,7 +1323,7 @@ struct server_context {
return json {
{"n_ctx", slot.n_ctx},
{"n_predict", slot.n_predict},
{"n_predict", slot.n_predict}, // Server configured n_predict
{"model", params.model_alias},
{"seed", slot.sparams.seed},
{"temperature", slot.sparams.temp},
@ -1344,7 +1345,7 @@ struct server_context {
{"mirostat_eta", slot.sparams.mirostat_eta},
{"penalize_nl", slot.sparams.penalize_nl},
{"stop", slot.params.antiprompt},
{"n_predict", slot.params.n_predict}, // TODO: fix duplicate key n_predict
{"max_tokens", slot.params.n_predict}, // User configured n_predict
{"n_keep", slot.params.n_keep},
{"n_discard", slot.params.n_discard},
{"ignore_eos", ignore_eos},
@ -1852,6 +1853,8 @@ struct server_context {
llama_lora_adapters_apply(ctx, lora_adapters);
server_task_result result;
result.id = task.id;
result.stop = true;
result.error = false;
result.data = json{{ "success", true }};
queue_results.send(result);
} break;
@ -2036,7 +2039,7 @@ struct server_context {
slot.t_start_generation = 0;
if (slot.infill) {
const bool add_bos = llama_should_add_bos_token(model);
const bool add_bos = llama_add_bos_token(model);
bool suff_rm_leading_spc = true;
if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
@ -2504,6 +2507,9 @@ int main(int argc, char ** argv) {
return 1;
}
// parse arguments from environment variables
gpt_params_parse_from_env(params);
// TODO: not great to use extern vars
server_log_json = params.log_json;
server_verbose = params.verbosity > 0;
@ -2528,8 +2534,8 @@ int main(int argc, char ** argv) {
});
LOG_INFO("system info", {
{"n_threads", params.n_threads},
{"n_threads_batch", params.n_threads_batch},
{"n_threads", params.cpuparams.n_threads},
{"n_threads_batch", params.cpuparams_batch.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});
@ -2554,19 +2560,19 @@ int main(int argc, char ** argv) {
svr->set_default_headers({{"Server", "llama.cpp"}});
// CORS preflight
svr->Options(R"(.*)", [](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
svr->Options(R"(.*)", [](const httplib::Request &, httplib::Response & res) {
// Access-Control-Allow-Origin is already set by middleware
res.set_header("Access-Control-Allow-Credentials", "true");
res.set_header("Access-Control-Allow-Methods", "POST");
res.set_header("Access-Control-Allow-Headers", "*");
return res.set_content("", "application/json; charset=utf-8");
return res.set_content("", "text/html"); // blank response, no data
});
svr->set_logger(log_server_request);
auto res_error = [](httplib::Response & res, json error_data) {
json final_response {{"error", error_data}};
res.set_content(final_response.dump(), "application/json; charset=utf-8");
res.set_content(final_response.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
res.status = json_value(error_data, "code", 500);
};
@ -2596,11 +2602,6 @@ int main(int argc, char ** argv) {
svr->set_read_timeout (params.timeout_read);
svr->set_write_timeout(params.timeout_write);
if (!svr->bind_to_port(params.hostname, params.port)) {
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port);
return 1;
}
std::unordered_map<std::string, std::string> log_data;
log_data["hostname"] = params.hostname;
@ -2616,35 +2617,6 @@ int main(int argc, char ** argv) {
// Necessary similarity of prompt for slot selection
ctx_server.slot_prompt_similarity = params.slot_prompt_similarity;
// load the model
if (!ctx_server.load_model(params)) {
state.store(SERVER_STATE_ERROR);
return 1;
} else {
ctx_server.init();
state.store(SERVER_STATE_READY);
}
LOG_INFO("model loaded", {});
const auto model_meta = ctx_server.model_meta();
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty()) {
if (!ctx_server.validate_model_chat_template()) {
LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
params.chat_template = "chatml";
}
}
// print sample chat example to make it clear which template is used
{
LOG_INFO("chat template", {
{"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
{"built_in", params.chat_template.empty()},
});
}
//
// Middlewares
//
@ -2688,8 +2660,6 @@ int main(int argc, char ** argv) {
}
// API key is invalid or not provided
// TODO: make another middleware for CORS related logic
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION));
LOG_WARNING("Unauthorized: Invalid API Key", {});
@ -2697,8 +2667,21 @@ int main(int argc, char ** argv) {
return false;
};
auto middleware_server_state = [&res_error, &state](const httplib::Request &, httplib::Response & res) {
server_state current_state = state.load();
if (current_state == SERVER_STATE_LOADING_MODEL) {
res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
return false;
}
return true;
};
// register server middlewares
svr->set_pre_routing_handler([&middleware_validate_api_key](const httplib::Request & req, httplib::Response & res) {
svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
if (!middleware_server_state(req, res)) {
return httplib::Server::HandlerResponse::Handled;
}
if (!middleware_validate_api_key(req, res)) {
return httplib::Server::HandlerResponse::Handled;
}
@ -2709,62 +2692,15 @@ int main(int argc, char ** argv) {
// Route handlers (or controllers)
//
const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) {
server_state current_state = state.load();
switch (current_state) {
case SERVER_STATE_READY:
{
// request slots data using task queue
server_task task;
task.id = ctx_server.queue_tasks.get_new_id();
task.type = SERVER_TASK_TYPE_METRICS;
task.id_target = -1;
ctx_server.queue_results.add_waiting_task_id(task.id);
ctx_server.queue_tasks.post(task);
// get the result
server_task_result result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id);
const int n_idle_slots = result.data.at("idle");
const int n_processing_slots = result.data.at("processing");
json health = {
{"status", "ok"},
{"slots_idle", n_idle_slots},
{"slots_processing", n_processing_slots}
};
res.status = 200; // HTTP OK
if (params.endpoint_slots && req.has_param("include_slots")) {
health["slots"] = result.data.at("slots");
}
if (n_idle_slots == 0) {
health["status"] = "no slot available";
if (req.has_param("fail_on_no_slot")) {
res.status = 503; // HTTP Service Unavailable
}
}
const auto handle_health = [&](const httplib::Request &, httplib::Response & res) {
// error and loading states are handled by middleware
json health = {{"status", "ok"}};
res.set_content(health.dump(), "application/json");
break;
}
case SERVER_STATE_LOADING_MODEL:
{
res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
} break;
case SERVER_STATE_ERROR:
{
res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER));
} break;
}
};
const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) {
if (!params.endpoint_slots) {
res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED));
res_error(res, format_error_response("This server does not support slots endpoint. Start it without `--no-slots`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
@ -2782,13 +2718,22 @@ int main(int argc, char ** argv) {
server_task_result result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id);
res.set_content(result.data.at("slots").dump(), "application/json");
// optionally return "fail_on_no_slot" error
const int n_idle_slots = result.data.at("idle");
if (req.has_param("fail_on_no_slot")) {
if (n_idle_slots == 0) {
res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE));
return;
}
}
res.set_content(result.data.at("slots").dump(), MIMETYPE_JSON);
res.status = 200; // HTTP OK
};
const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) {
if (!params.endpoint_metrics) {
res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED));
res_error(res, format_error_response("This server does not support metrics endpoint. Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
@ -2913,7 +2858,7 @@ int main(int argc, char ** argv) {
if (result.error) {
res_error(res, result.data);
} else {
res.set_content(result.data.dump(), "application/json");
res.set_content(result.data.dump(), MIMETYPE_JSON);
}
};
@ -2943,7 +2888,7 @@ int main(int argc, char ** argv) {
if (result.error) {
res_error(res, result.data);
} else {
res.set_content(result.data.dump(), "application/json");
res.set_content(result.data.dump(), MIMETYPE_JSON);
}
};
@ -2963,13 +2908,11 @@ int main(int argc, char ** argv) {
if (result.error) {
res_error(res, result.data);
} else {
res.set_content(result.data.dump(), "application/json");
res.set_content(result.data.dump(), MIMETYPE_JSON);
}
};
const auto handle_slots_action = [&res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
std::string id_slot_str = req.path_params.at("id_slot");
int id_slot;
@ -2993,7 +2936,7 @@ int main(int argc, char ** argv) {
}
};
const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
const auto handle_props = [&ctx_server](const httplib::Request &, httplib::Response & res) {
std::string template_key = "tokenizer.chat_template", curr_tmpl;
int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
if (tlen > 0) {
@ -3002,7 +2945,6 @@ int main(int argc, char ** argv) {
curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
}
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = {
{ "system_prompt", ctx_server.system_prompt.c_str() },
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
@ -3010,7 +2952,7 @@ int main(int argc, char ** argv) {
{ "chat_template", curr_tmpl.c_str() }
};
res.set_content(data.dump(), "application/json; charset=utf-8");
res.set_content(data.dump(), MIMETYPE_JSON);
};
const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
@ -3019,8 +2961,6 @@ int main(int argc, char ** argv) {
return;
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = json::parse(req.body);
const int id_task = ctx_server.queue_tasks.get_new_id();
@ -3031,7 +2971,7 @@ int main(int argc, char ** argv) {
if (!json_value(data, "stream", false)) {
server_task_result result = ctx_server.queue_results.recv(id_task);
if (!result.error && result.stop) {
res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
} else {
res_error(res, result.data);
}
@ -3094,9 +3034,7 @@ int main(int argc, char ** argv) {
}
};
const auto handle_models = [&params, &model_meta](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const auto handle_models = [&params, &ctx_server](const httplib::Request &, httplib::Response & res) {
json models = {
{"object", "list"},
{"data", {
@ -3105,12 +3043,12 @@ int main(int argc, char ** argv) {
{"object", "model"},
{"created", std::time(0)},
{"owned_by", "llamacpp"},
{"meta", model_meta}
{"meta", ctx_server.model_meta()}
},
}}
};
res.set_content(models.dump(), "application/json; charset=utf-8");
res.set_content(models.dump(), MIMETYPE_JSON);
};
const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) {
@ -3118,8 +3056,6 @@ int main(int argc, char ** argv) {
res_error(res, format_error_response("This server does not support chat completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
const int id_task = ctx_server.queue_tasks.get_new_id();
@ -3134,7 +3070,7 @@ int main(int argc, char ** argv) {
if (!result.error && result.stop) {
json result_oai = format_final_response_oaicompat(data, result.data, completion_id);
res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
} else {
res_error(res, result.data);
}
@ -3196,8 +3132,6 @@ int main(int argc, char ** argv) {
return;
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = json::parse(req.body);
const int id_task = ctx_server.queue_tasks.get_new_id();
@ -3208,7 +3142,7 @@ int main(int argc, char ** argv) {
if (!json_value(data, "stream", false)) {
server_task_result result = ctx_server.queue_results.recv(id_task);
if (!result.error && result.stop) {
res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
} else {
res_error(res, result.data);
}
@ -3256,7 +3190,6 @@ int main(int argc, char ** argv) {
};
const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const json body = json::parse(req.body);
std::vector<llama_token> tokens;
@ -3265,11 +3198,10 @@ int main(int argc, char ** argv) {
tokens = ctx_server.tokenize(body.at("content"), add_special);
}
const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json; charset=utf-8");
return res.set_content(data.dump(), MIMETYPE_JSON);
};
const auto handle_detokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const json body = json::parse(req.body);
std::string content;
@ -3279,12 +3211,10 @@ int main(int argc, char ** argv) {
}
const json data = format_detokenized_response(content);
return res.set_content(data.dump(), "application/json; charset=utf-8");
return res.set_content(data.dump(), MIMETYPE_JSON);
};
const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const json body = json::parse(req.body);
bool is_openai = false;
@ -3330,11 +3260,10 @@ int main(int argc, char ** argv) {
json root = is_openai
? format_embeddings_response_oaicompat(body, responses)
: responses[0];
return res.set_content(root.dump(), "application/json; charset=utf-8");
return res.set_content(root.dump(), MIMETYPE_JSON);
};
const auto handle_lora_adapters_list = [&](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
json result = json::array();
for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) {
auto & la = ctx_server.lora_adapters[i];
@ -3344,13 +3273,11 @@ int main(int argc, char ** argv) {
{"scale", la.scale},
});
}
res.set_content(result.dump(), "application/json");
res.set_content(result.dump(), MIMETYPE_JSON);
res.status = 200; // HTTP OK
};
const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
const std::vector<json> body = json::parse(req.body);
int max_idx = ctx_server.lora_adapters.size();
@ -3378,7 +3305,7 @@ int main(int argc, char ** argv) {
server_task_result result = ctx_server.queue_results.recv(id_task);
ctx_server.queue_results.remove_waiting_task_id(id_task);
res.set_content(result.data.dump(), "application/json");
res.set_content(result.data.dump(), MIMETYPE_JSON);
res.status = 200; // HTTP OK
};
@ -3454,17 +3381,55 @@ int main(int argc, char ** argv) {
log_data["n_threads_http"] = std::to_string(params.n_threads_http);
svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
LOG_INFO("HTTP server listening", log_data);
// clean up function, to be called before exit
auto clean_up = [&svr]() {
svr->stop();
llama_backend_free();
};
// run the HTTP server in a thread - see comment below
std::thread t([&]() {
if (!svr->listen_after_bind()) {
state.store(SERVER_STATE_ERROR);
// bind HTTP listen port, run the HTTP server in a thread
if (!svr->bind_to_port(params.hostname, params.port)) {
LOG_ERROR("couldn't bind HTTP server socket", {
{"hostname", params.hostname},
{"port", params.port},
});
clean_up();
LOG_ERROR("exiting due to HTTP server error", {});
return 1;
}
std::thread t([&]() { svr->listen_after_bind(); });
svr->wait_until_ready();
return 0;
LOG_INFO("HTTP server is listening", log_data);
// load the model
LOG_INFO("loading model", log_data);
if (!ctx_server.load_model(params)) {
clean_up();
t.join();
LOG_ERROR("exiting due to model loading error", {});
return 1;
} else {
ctx_server.init();
state.store(SERVER_STATE_READY);
LOG_INFO("model loaded", {});
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty()) {
if (!ctx_server.validate_model_chat_template()) {
LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
params.chat_template = "chatml";
}
}
// print sample chat example to make it clear which template is used
{
LOG_INFO("chat template", {
{"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
{"built_in", params.chat_template.empty()},
});
}
ctx_server.queue_tasks.on_new_task(std::bind(
&server_context::process_single_task, &ctx_server, std::placeholders::_1));
@ -3483,6 +3448,8 @@ int main(int argc, char ** argv) {
shutdown_handler = [&](int) {
ctx_server.queue_tasks.terminate();
};
ctx_server.queue_tasks.start_loop();
}
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
@ -3498,12 +3465,8 @@ int main(int argc, char ** argv) {
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
ctx_server.queue_tasks.start_loop();
svr->stop();
clean_up();
t.join();
llama_backend_free();
return 0;
}

View file

@ -205,27 +205,20 @@ def step_start_server(context):
async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
match expecting_status:
case 'healthy':
await wait_for_health_status(context, context.base_url, 200, 'ok',
await wait_for_slots_status(context, context.base_url, 200,
timeout=30)
case 'ready' | 'idle':
await wait_for_health_status(context, context.base_url, 200, 'ok',
await wait_for_slots_status(context, context.base_url, 200,
timeout=30,
params={'fail_on_no_slot': 0, 'include_slots': 0},
params={'fail_on_no_slot': 1},
slots_idle=context.n_slots,
slots_processing=0,
expected_slots=[{'id': slot_id, 'state': 0}
for slot_id in
range(context.n_slots if context.n_slots else 1)])
slots_processing=0)
case 'busy':
await wait_for_health_status(context, context.base_url, 503,
'no slot available',
params={'fail_on_no_slot': 0, 'include_slots': 0},
await wait_for_slots_status(context, context.base_url, 503,
params={'fail_on_no_slot': 1},
slots_idle=0,
slots_processing=context.n_slots,
expected_slots=[{'id': slot_id, 'state': 1}
for slot_id in
range(context.n_slots if context.n_slots else 1)])
slots_processing=context.n_slots)
case _:
assert False, "unknown status"
@ -1187,17 +1180,15 @@ async def gather_tasks_results(context):
return n_completions
async def wait_for_health_status(context,
async def wait_for_slots_status(context,
base_url,
expected_http_status_code,
expected_health_status,
timeout=3,
params=None,
slots_idle=None,
slots_processing=None,
expected_slots=None):
slots_processing=None):
if context.debug:
print(f"Starting checking for health for expected_health_status={expected_health_status}")
print(f"Starting checking for health for expected_http_status_code={expected_http_status_code}")
interval = 0.5
counter = 0
if 'GITHUB_ACTIONS' in os.environ:
@ -1205,25 +1196,18 @@ async def wait_for_health_status(context,
async with aiohttp.ClientSession() as session:
while True:
async with await session.get(f'{base_url}/health', params=params) as health_response:
status_code = health_response.status
health = await health_response.json()
async with await session.get(f'{base_url}/slots', params=params) as slots_response:
status_code = slots_response.status
slots = await slots_response.json()
if context.debug:
print(f"HEALTH - response for expected health status='{expected_health_status}' on "
f"'{base_url}/health'?{params} is {health}\n")
if (status_code == expected_http_status_code
and health['status'] == expected_health_status
and (slots_idle is None or health['slots_idle'] == slots_idle)
and (slots_processing is None or health['slots_processing'] == slots_processing)):
if expected_slots is not None:
assert_slots_status(health['slots'], expected_slots)
print(f"slots responses {slots}\n")
if status_code == 503 and status_code == expected_http_status_code:
return
if (status_code == expected_http_status_code
and health['status'] == expected_health_status
and (slots_idle is None or health['slots_idle'] == slots_idle)
and (slots_processing is None or health['slots_processing'] == slots_processing)):
if expected_slots is not None:
assert_slots_status(health['slots'], expected_slots)
if status_code == 200 and status_code == expected_http_status_code:
n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
if ((slots_idle is None or slots_idle == n_slots_idle)
and (slots_processing is None or slots_processing == n_slots_processing)):
return
await asyncio.sleep(interval)
@ -1238,7 +1222,7 @@ async def wait_for_health_status(context,
if n_completions > 0:
return
assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}'
assert False, f'slots check timeout exceeded {counter}s>={timeout}'
def assert_embeddings(embeddings):

View file

@ -73,10 +73,11 @@ int main(int argc, char ** argv) {
// load the draft model
params.model = params.model_draft;
params.n_gpu_layers = params.n_gpu_layers_draft;
if (params.n_threads_draft > 0) {
params.n_threads = params.n_threads_draft;
if (params.draft_cpuparams.n_threads > 0) {
params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
}
params.n_threads_batch = params.n_threads_batch_draft;
params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
model_dft = llama_init_dft.model;
ctx_dft = llama_init_dft.context;

View file

@ -362,7 +362,7 @@ int main(int raw_argc, char ** raw_argv) {
prompt = stdin_buffer.str();
}
const bool model_wants_add_bos = llama_should_add_bos_token(model);
const bool model_wants_add_bos = llama_add_bos_token(model);
const bool add_bos = model_wants_add_bos && !no_bos;
const bool parse_special = !no_parse_special;

6
flake.lock generated
View file

@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1723175592,
"narHash": "sha256-M0xJ3FbDUc4fRZ84dPGx5VvgFsOzds77KiBMW/mMTnI=",
"lastModified": 1724224976,
"narHash": "sha256-Z/ELQhrSd7bMzTO8r7NZgi9g5emh+aRKoCdaAv5fiO0=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "5e0ca22929f3342b19569b21b2f3462f053e497b",
"rev": "c374d94f1536013ca8e92341b540eba4c22f9c62",
"type": "github"
},
"original": {

View file

@ -63,6 +63,7 @@ extern "C" {
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// "offset" refers to the offset of the tensor data for setting/getting data
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
@ -102,6 +103,7 @@ extern "C" {
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer

View file

@ -220,7 +220,7 @@
#include <stdio.h>
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 1
#define GGML_FILE_VERSION 2
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
@ -231,6 +231,8 @@
#define GGML_MAX_SRC 10
#ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 64
#define GGML_MAX_N_THREADS 512
#endif
#define GGML_MAX_OP_PARAMS 64
#define GGML_DEFAULT_N_THREADS 4
@ -453,6 +455,8 @@ extern "C" {
GGML_OP_SQR,
GGML_OP_SQRT,
GGML_OP_LOG,
GGML_OP_SIN,
GGML_OP_COS,
GGML_OP_SUM,
GGML_OP_SUM_ROWS,
GGML_OP_MEAN,
@ -490,9 +494,11 @@ extern "C" {
GGML_OP_CLAMP,
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_ARANGE,
@ -624,6 +630,29 @@ extern "C" {
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);
// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
@ -631,6 +660,7 @@ extern "C" {
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
@ -969,6 +999,22 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sin(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sin_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_cos(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_cos_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
// return scalar
GGML_API struct ggml_tensor * ggml_sum(
struct ggml_context * ctx,
@ -1566,34 +1612,49 @@ extern "C" {
float min,
float max);
// im2col
// converts data into a format that effectively results in a convolution when combined with matrix multiplication
GGML_API struct ggml_tensor * ggml_im2col(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1, // dilation dimension 1
bool is_2D,
enum ggml_type dst_type);
GGML_API struct ggml_tensor * ggml_im2col_back(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // gradient of im2col output
int64_t * ne, // shape of im2col input
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1, // dilation dimension 1
bool is_2D);
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1);
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride
int p0, // padding
int d0); // dilation
@ -1602,29 +1663,29 @@ extern "C" {
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s,
int d);
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s, // stride
int d); // dilation
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int p0,
int d0);
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride
int p0, // padding
int d0); // dilation
GGML_API struct ggml_tensor * ggml_conv_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1);
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1); // dilation dimension 1
// kernel size is a->ne[0] x a->ne[1]
@ -1686,6 +1747,18 @@ extern "C" {
float p0,
float p1);
GGML_API struct ggml_tensor * ggml_pool_2d_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * af, // "a"/input used in forward pass
enum ggml_op_pool op,
int k0,
int k1,
int s0,
int s1,
float p0,
float p1);
// nearest interpolate
// multiplies ne0 and ne1 by scale factor
// used in stable-diffusion
@ -1760,7 +1833,8 @@ extern "C" {
struct ggml_tensor * v,
struct ggml_tensor * mask,
float scale,
float max_bias);
float max_bias,
float logit_softcap);
GGML_API void ggml_flash_attn_ext_set_prec(
struct ggml_tensor * a,
@ -1777,10 +1851,8 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_ssm_conv(
struct ggml_context * ctx,
struct ggml_tensor * s,
struct ggml_tensor * x,
struct ggml_tensor * c,
struct ggml_tensor * sq);
struct ggml_tensor * sx,
struct ggml_tensor * c);
GGML_API struct ggml_tensor * ggml_ssm_scan(
struct ggml_context * ctx,
@ -1789,8 +1861,7 @@ extern "C" {
struct ggml_tensor * dt,
struct ggml_tensor * A,
struct ggml_tensor * B,
struct ggml_tensor * C,
struct ggml_tensor * sq);
struct ggml_tensor * C);
// partition into non-overlapping windows with padding if needed
// example:
@ -2012,10 +2083,23 @@ extern "C" {
GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

View file

@ -549,6 +549,13 @@ if (GGML_SYCL)
file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
find_package(DNNL)
message("-- DNNL found:" ${DNNL_FOUND})
if (GGML_SYCL_TARGET STREQUAL "INTEL")
add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
else()
add_compile_definitions(GGML_SYCL_DNNL=0)
endif()
if (WIN32)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
@ -561,6 +568,9 @@ if (GGML_SYCL)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
endif()
endif()
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
list(APPEND GGML_EXTRA_LIBS DNNL::dnnl)
endif()
endif()
if (GGML_RPC)
@ -1237,7 +1247,7 @@ endif()
# Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
add_compile_definitions(_GNU_SOURCE)
endif()

View file

@ -337,33 +337,18 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
}
size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
}
else {
assert(false);
return 0;
}
}
size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
}
else {
assert(false);
return 0;
}
}
size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
UNUSED(quant_weights);
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
}
else {
assert(false);
return 0;
}
}
void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {

View file

@ -723,6 +723,8 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
struct ggml_backend_cpu_context {
int n_threads;
ggml_threadpool_t threadpool;
void * work_data;
size_t work_size;
@ -759,7 +761,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
if (cpu_plan->cplan.work_size > 0) {
@ -796,7 +798,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_ctx->work_size < cplan.work_size) {
free(cpu_ctx->work_data);
@ -873,6 +875,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
}
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->threadpool = NULL;
ctx->work_data = NULL;
ctx->work_size = 0;
ctx->abort_callback = NULL;
@ -903,6 +906,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
ctx->n_threads = n_threads;
}
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
if (ctx->threadpool && ctx->threadpool != threadpool) {
// already had a different threadpool, pause/suspend it before switching
ggml_threadpool_pause(ctx->threadpool);
}
ctx->threadpool = threadpool;
}
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
@ -1018,10 +1033,6 @@ static bool ggml_is_view_op(enum ggml_op op) {
#define GGML_SCHED_MAX_BACKENDS 16
#endif
#ifndef GGML_SCHED_MAX_SPLITS
#define GGML_SCHED_MAX_SPLITS 2048
#endif
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
#endif
@ -1125,7 +1136,8 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
}
#if 0
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
#define GET_CAUSE(node) causes[hash_id(node)]
#else
@ -1549,7 +1561,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
GGML_ASSERT(sched->splits != NULL);
}
GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
split = &sched->splits[i_split];
split->backend_id = node_backend_id;
split->i_start = i;
@ -1865,13 +1876,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
sched->context_buffer = malloc(sched->context_buffer_size);
const int initial_splits_capacity = 16;

View file

@ -9,8 +9,10 @@
#include "ggml-cuda/binbcast.cuh"
#include "ggml-cuda/clamp.cuh"
#include "ggml-cuda/concat.cuh"
#include "ggml-cuda/conv-transpose-1d.cuh"
#include "ggml-cuda/convert.cuh"
#include "ggml-cuda/cpy.cuh"
#include "ggml-cuda/cross-entropy-loss.cuh"
#include "ggml-cuda/diagmask.cuh"
#include "ggml-cuda/dmmv.cuh"
#include "ggml-cuda/fattn.cuh"
@ -29,7 +31,6 @@
#include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh"
#include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/conv-transpose-1d.cuh"
#include <algorithm>
#include <array>
@ -2181,6 +2182,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ADD:
ggml_cuda_op_add(ctx, dst);
break;
case GGML_OP_SUB:
ggml_cuda_op_sub(ctx, dst);
break;
case GGML_OP_ACC:
ggml_cuda_op_acc(ctx, dst);
break;
@ -2267,6 +2271,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_SQRT:
ggml_cuda_op_sqrt(ctx, dst);
break;
case GGML_OP_SIN:
ggml_cuda_op_sin(ctx, dst);
break;
case GGML_OP_COS:
ggml_cuda_op_cos(ctx, dst);
break;
case GGML_OP_CLAMP:
ggml_cuda_op_clamp(ctx, dst);
break;
@ -2303,6 +2313,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_FLASH_ATTN_EXT:
ggml_cuda_flash_attn_ext(ctx, dst);
break;
case GGML_OP_CROSS_ENTROPY_LOSS:
ggml_cuda_cross_entropy_loss(ctx, dst);
break;
default:
return false;
}
@ -2610,6 +2623,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) {
assert(node->src[j]->buffer);
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
}
}
@ -2853,12 +2867,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_RMS_NORM:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF:
@ -2890,6 +2907,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
}
return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
case GGML_OP_CROSS_ENTROPY_LOSS:
return true;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
default:
return false;

View file

@ -9,6 +9,10 @@ static __device__ __forceinline__ float op_add(const float a, const float b) {
return a + b;
}
static __device__ __forceinline__ float op_sub(const float a, const float b) {
return a - b;
}
static __device__ __forceinline__ float op_mul(const float a, const float b) {
return a * b;
}
@ -271,6 +275,10 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
}
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
}
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
}

View file

@ -2,5 +2,6 @@
void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -0,0 +1,106 @@
#include "common.cuh"
#include "cross-entropy-loss.cuh"
#include "sumrows.cuh"
#include <cmath>
#include <cstdint>
static __global__ void cross_entropy_loss_f32(const float * logits, const float * labels, float * dst, const int nclasses, const int k) {
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
const int i0 = blockDim.x*blockIdx.x + warp_id*WARP_SIZE;
const int ne_tmp = WARP_SIZE*nclasses;
extern __shared__ float tmp_all[];
float * tmp_logits = tmp_all + (2*warp_id + 0)*ne_tmp;
float * tmp_labels = tmp_all + (2*warp_id + 1)*ne_tmp;
// Each warp first loads ne_tmp logits/labels into shared memory:
for (int i = lane_id; i < ne_tmp; i += WARP_SIZE) {
const int ig = i0*nclasses + i; // ig == i global
tmp_logits[i] = ig < k*nclasses ? logits[ig] : 0.0f;
tmp_labels[i] = ig < k*nclasses ? labels[ig] : 0.0f;
}
// Each thread in the warp then calculates the cross entropy loss for a single row.
// TODO: pad in order to avoid shared memory bank conflicts.
// Find maximum for softmax:
float max = -INFINITY;
for (int i = 0; i < nclasses; ++i) {
max = fmaxf(max, tmp_logits[lane_id*nclasses + i]);
}
// Calculate log(softmax(logits)) which is just logits - max:
float sum = 0.0f;
for (int i = 0; i < nclasses; ++i) {
float val = tmp_logits[lane_id*nclasses + i] - max;
sum += expf(val);
tmp_logits[lane_id*nclasses + i] = val;
}
sum = logf(sum);
// log(exp(logits - max) / sum) = (logits - max) - log(sum)
float loss = 0.0f;
for (int i = 0; i < nclasses; ++i) {
loss += (tmp_logits[lane_id*nclasses + i] - sum) * tmp_labels[lane_id*nclasses + i];
}
loss = -warp_reduce_sum(loss) / (float)k;
__syncthreads();
if (lane_id == 0) {
tmp_all[warp_id] = loss;
}
__syncthreads();
if (warp_id != 0) {
return;
}
loss = lane_id < CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE/WARP_SIZE ? tmp_all[lane_id] : 0.0f;
loss = warp_reduce_sum(loss);
if (lane_id != 0) {
return;
}
dst[blockIdx.x] = loss;
}
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(ggml_is_contiguous(dst));
const int64_t ne00 = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
ggml_cuda_pool & pool = ctx.pool();
cudaStream_t stream = ctx.stream();
const dim3 blocks_dim(CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
const dim3 blocks_num((nrows + CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE - 1) / CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
const int shmem = 2*CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE*ne00*sizeof(float);
ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
// Combine results from individual blocks:
sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
}

View file

@ -0,0 +1,5 @@
#include "common.cuh"
#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -22,6 +22,7 @@ typedef void (* fattn_kernel_t)(
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int ne00,
const int ne01,
const int ne02,
@ -659,9 +660,15 @@ void launch_fattn(
float scale = 1.0f;
float max_bias = 0.0f;
float logit_softcap = 0.0f;
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
memcpy(&logit_softcap, (float *) KQV->op_params + 2, sizeof(float));
if (logit_softcap != 0.0f) {
scale /= logit_softcap;
}
const uint32_t n_head = Q->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
@ -675,7 +682,7 @@ void launch_fattn(
V_data,
mask ? ((const char *) mask->data) : nullptr,
(parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
scale, max_bias, m0, m1, n_head_log2,
scale, max_bias, m0, m1, n_head_log2, logit_softcap,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,

View file

@ -4,7 +4,7 @@
#define FATTN_KQ_STRIDE_TILE_F16 64
template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@ -20,6 +20,7 @@ static __global__ void flash_attn_tile_ext_f16(
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int ne00,
const int ne01,
const int ne02,
@ -44,6 +45,12 @@ static __global__ void flash_attn_tile_ext_f16(
const int ne2,
const int ne3) {
#ifdef FP16_AVAILABLE
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
return;
}
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
@ -154,7 +161,13 @@ static __global__ void flash_attn_tile_ext_f16(
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y;
half sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
half sum;
if (use_logit_softcap) {
const float2 tmp = __half22float2(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
sum = logit_softcap * tanhf(tmp.x + tmp.y);
} else {
sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
}
sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
@ -270,20 +283,20 @@ static __global__ void flash_attn_tile_ext_f16(
#endif // FP16_AVAILABLE
}
template <int cols_per_block, int parallel_blocks>
template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
switch (Q->ne[0]) {
case 64: {
constexpr int D = 64;
constexpr int nwarps = 8;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break;
case 128: {
constexpr int D = 128;
constexpr int nwarps = 8;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break;
default: {
@ -296,24 +309,45 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const int32_t precision = KQV->op_params[2];
const int32_t precision = KQV->op_params[3];
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
if (Q->ne[1] <= 16) {
constexpr int cols_per_block = 16;
constexpr int parallel_blocks = 4;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] <= 32) {
constexpr int cols_per_block = 32;
constexpr int parallel_blocks = 4;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
}
return;
}
constexpr int cols_per_block = 32;
constexpr int parallel_blocks = 1;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
}
}

View file

@ -4,7 +4,7 @@
#define FATTN_KQ_STRIDE_TILE_F32 32
template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@ -20,6 +20,7 @@ static __global__ void flash_attn_tile_ext_f32(
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int ne00,
const int ne01,
const int ne02,
@ -43,6 +44,12 @@ static __global__ void flash_attn_tile_ext_f32(
const int ne1,
const int ne2,
const int ne3) {
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
return;
}
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
@ -151,6 +158,10 @@ static __global__ void flash_attn_tile_ext_f32(
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y;
if (use_logit_softcap) {
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
}
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
@ -267,20 +278,20 @@ static __global__ void flash_attn_tile_ext_f32(
}
}
template <int cols_per_block, int parallel_blocks>
template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
switch (Q->ne[0]) {
case 64: {
constexpr int D = 64;
constexpr int nwarps = 8;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break;
case 128: {
constexpr int D = 128;
constexpr int nwarps = 8;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break;
default: {
@ -290,23 +301,45 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
}
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
if (Q->ne[1] <= 16) {
constexpr int cols_per_block = 16;
constexpr int parallel_blocks = 4;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] <= 32) {
constexpr int cols_per_block = 32;
constexpr int parallel_blocks = 4;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
}
return;
}
constexpr int cols_per_block = 32;
constexpr int parallel_blocks = 1;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
}
}

View file

@ -1,7 +1,7 @@
#include "common.cuh"
#include "fattn-common.cuh"
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V> // D == head size
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@ -17,6 +17,7 @@ static __global__ void flash_attn_vec_ext_f16(
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int ne00,
const int ne01,
const int ne02,
@ -41,6 +42,12 @@ static __global__ void flash_attn_vec_ext_f16(
const int ne2,
const int ne3) {
#ifdef FP16_AVAILABLE
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
return;
}
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16<D>(type_K);
@ -190,6 +197,11 @@ static __global__ void flash_attn_vec_ext_f16(
for (int j = 0; j < ncols; ++j) {
half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
sum = warp_reduce_sum(sum);
if (use_logit_softcap) {
sum = logit_softcap*tanhf(sum);
}
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
if (ncols == 1) {
@ -286,10 +298,10 @@ static __global__ void flash_attn_vec_ext_f16(
#endif // FP16_AVAILABLE
}
template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V>
template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V>;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
constexpr bool need_f16_K = D != 128;
constexpr bool need_f16_V = D != 128 && D != 64;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V);
@ -297,48 +309,81 @@ void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx,
template <int D, ggml_type type_K, ggml_type type_V>
void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_tensor * KQV = dst;
ggml_tensor * Q = dst->src[0];
ggml_tensor * K = dst->src[1];
ggml_tensor * V = dst->src[2];
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
const int32_t precision = KQV->op_params[2];
const int32_t precision = KQV->op_params[3];
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
GGML_ASSERT(K->type == type_K);
GGML_ASSERT(V->type == type_V);
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] <= 8) {
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 1;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
}
#define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V) \

View file

@ -1,7 +1,7 @@
#include "common.cuh"
#include "fattn-common.cuh"
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V> // D == head size
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@ -17,6 +17,7 @@ static __global__ void flash_attn_vec_ext_f32(
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int ne00,
const int ne01,
const int ne02,
@ -40,6 +41,12 @@ static __global__ void flash_attn_vec_ext_f32(
const int ne1,
const int ne2,
const int ne3) {
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
return;
}
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
constexpr vec_dot_KQ_f32_t vec_dot_KQ = get_vec_dot_KQ_f32<D>(type_K);
@ -180,6 +187,11 @@ static __global__ void flash_attn_vec_ext_f32(
for (int j = 0; j < ncols; ++j) {
float sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
sum = warp_reduce_sum(sum);
if (use_logit_softcap) {
sum = logit_softcap*tanhf(sum);
}
sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);
@ -267,10 +279,10 @@ static __global__ void flash_attn_vec_ext_f32(
}
}
template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V>
template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
constexpr int nwarps = D/WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V>;
fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
constexpr bool need_f16_K = D != 128;
constexpr bool need_f16_V = D != 128 && D != 64;
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V);
@ -278,44 +290,78 @@ void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx,
template <int D, ggml_type type_K, ggml_type type_V>
void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_tensor * Q = dst->src[0];
ggml_tensor * K = dst->src[1];
ggml_tensor * V = dst->src[2];
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
GGML_ASSERT(K->type == type_K);
GGML_ASSERT(V->type == type_V);
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
if (Q->ne[1] <= 8) {
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 4;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
return;
}
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 1;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
}
}
#define DECL_FATTN_VEC_F32_CASE(D, type_K, type_V) \

View file

@ -6,7 +6,7 @@
#endif // FP16_MMA_AVAILABLE
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@ -22,6 +22,7 @@ static __global__ void flash_attn_ext_f16(
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int ne00,
const int ne01,
const int ne02,
@ -46,6 +47,12 @@ static __global__ void flash_attn_ext_f16(
const int ne2,
const int ne3) {
#ifdef FP16_MMA_AVAILABLE
// Skip unused kernel variants for faster compilation:
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
return;
}
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
@ -85,6 +92,8 @@ static __global__ void flash_attn_ext_f16(
const half slopeh = __float2half(slopef);
const half2 slope2 = make_half2(slopef, slopef);
const half2 logit_softcap_2 = make_half2(logit_softcap, logit_softcap);
frag_b Q_b[D/16][ncols/frag_n];
// A single buffer for temporarily holding tiles of KQ and VKQ parts:
@ -194,6 +203,10 @@ static __global__ void flash_attn_ext_f16(
const int k = k0 + threadIdx.x;
KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k];
if (use_logit_softcap) {
KQ_f_tmp[k0/WARP_SIZE] = logit_softcap*tanhf(KQ_f_tmp[k0/WARP_SIZE]);
}
}
float KQ_max_new = KQ_max_f[j0/nwarps];
@ -237,6 +250,15 @@ static __global__ void flash_attn_ext_f16(
const int k = k0 + threadIdx.x;
KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k];
if (use_logit_softcap) {
// There is no dedicated tangens hyperbolicus function for half2.
KQ2_tmp[k0/WARP_SIZE] = h2exp(KQ2_tmp[k0/WARP_SIZE]*make_half2(2.0f, 2.0f));
KQ2_tmp[k0/WARP_SIZE] = (KQ2_tmp[k0/WARP_SIZE] - make_half2(1.0f, 1.0f))
/(KQ2_tmp[k0/WARP_SIZE] + make_half2(1.0f, 1.0f));
KQ2_tmp[k0/WARP_SIZE] *= logit_softcap_2;
}
}
half2 KQ_max_new = KQ_max_h2[j0/nwarps];
@ -427,6 +449,7 @@ static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
template <int D, int cols_per_block, typename KQ_acc_t>
void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
constexpr int nwarps = 4;
@ -435,20 +458,50 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
if (4*blocks_num_pb1 < 2*nsm) {
constexpr int parallel_blocks = 4;
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
fattn_kernel_t fattn_kernel;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
} else {
constexpr bool use_logit_softcap = true;
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
}
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
return;
}
if (2*blocks_num_pb1 < 2*nsm) {
constexpr int parallel_blocks = 2;
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
fattn_kernel_t fattn_kernel;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
} else {
constexpr bool use_logit_softcap = true;
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
}
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
return;
}
constexpr int parallel_blocks = 1;
fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
fattn_kernel_t fattn_kernel;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
} else {
constexpr bool use_logit_softcap = true;
fattn_kernel = flash_attn_ext_f16<
D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
}
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
}

View file

@ -13,7 +13,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const int32_t precision = KQV->op_params[2];
const int32_t precision = KQV->op_params[3];
if (precision != GGML_PREC_DEFAULT) {
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
@ -301,7 +301,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
ggml_cuda_set_device(ctx.device);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const int32_t precision = KQV->op_params[2];
const int32_t precision = KQV->op_params[3];
// On AMD the tile kernels perform poorly, use the vec kernel instead:
if (cc >= CC_OFFSET_AMD) {

View file

@ -16,7 +16,7 @@ static __global__ void k_sum_rows_f32(const float * x, float * dst, const int nc
}
}
static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(nrows, 1, 1);
k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
@ -32,7 +32,6 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);

View file

@ -1,3 +1,5 @@
#include "common.cuh"
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -101,6 +101,24 @@ static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
dst[i] = sqrtf(x[i]);
}
static __global__ void sin_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = sinf(x[i]);
}
static __global__ void cos_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = cosf(x[i]);
}
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@ -156,6 +174,16 @@ static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_
sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void sin_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SIN_BLOCK_SIZE - 1) / CUDA_SIN_BLOCK_SIZE;
sin_f32<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
@ -312,3 +340,31 @@ void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
sin_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
cos_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}

View file

@ -9,6 +9,8 @@
#define CUDA_HARDSWISH_BLOCK_SIZE 256
#define CUDA_SQR_BLOCK_SIZE 256
#define CUDA_SQRT_BLOCK_SIZE 256
#define CUDA_SIN_BLOCK_SIZE 256
#define CUDA_COS_BLOCK_SIZE 256
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@ -31,3 +33,7 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -31,6 +31,8 @@ struct ggml_metal_kernel {
enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_ADD,
GGML_METAL_KERNEL_TYPE_ADD_ROW,
GGML_METAL_KERNEL_TYPE_SUB,
GGML_METAL_KERNEL_TYPE_SUB_ROW,
GGML_METAL_KERNEL_TYPE_MUL,
GGML_METAL_KERNEL_TYPE_MUL_ROW,
GGML_METAL_KERNEL_TYPE_DIV,
@ -82,6 +84,8 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_RMS_NORM,
GGML_METAL_KERNEL_TYPE_GROUP_NORM,
GGML_METAL_KERNEL_TYPE_NORM,
GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,
GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,
GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,
GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,
@ -205,6 +209,9 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
GGML_METAL_KERNEL_TYPE_CONCAT,
GGML_METAL_KERNEL_TYPE_SQR,
GGML_METAL_KERNEL_TYPE_SQRT,
GGML_METAL_KERNEL_TYPE_SIN,
GGML_METAL_KERNEL_TYPE_COS,
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
GGML_METAL_KERNEL_TYPE_COUNT
@ -491,6 +498,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB, sub, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW, sub_row, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
@ -542,6 +551,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, ssm_conv_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, ssm_scan_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, ctx->support_simdgroup_reduction);
@ -665,6 +676,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, cpy_f32_iq4_nl, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT, concat, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR, sqr, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
}
@ -765,15 +779,20 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
case GGML_OP_PERMUTE:
case GGML_OP_CONCAT:
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_ACC:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_REPEAT:
case GGML_OP_SCALE:
case GGML_OP_CLAMP:
case GGML_OP_SQR:
case GGML_OP_SUM_ROWS:
return true;
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_SIN:
case GGML_OP_COS:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_SUM_ROWS:
case GGML_OP_SOFT_MAX:
case GGML_OP_RMS_NORM:
case GGML_OP_GROUP_NORM:
@ -803,6 +822,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
return false;
}
return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
case GGML_OP_SSM_CONV:
case GGML_OP_SSM_SCAN:
return true;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
return ctx->support_simdgroup_reduction &&
@ -1050,6 +1072,7 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
{
@ -1073,6 +1096,7 @@ static enum ggml_status ggml_metal_graph_compute(
nb = ne00 / 4;
switch (dst->op) {
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
default: GGML_ABORT("fatal error");
@ -1082,6 +1106,7 @@ static enum ggml_status ggml_metal_graph_compute(
} else {
switch (dst->op) {
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
default: GGML_ABORT("fatal error");
@ -1409,6 +1434,48 @@ static enum ggml_status ggml_metal_graph_compute(
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SQRT:
{
GGML_ASSERT(ggml_is_contiguous(src0));
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SIN:
{
GGML_ASSERT(ggml_is_contiguous(src0));
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_COS:
{
GGML_ASSERT(ggml_is_contiguous(src0));
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SUM_ROWS:
@ -1538,6 +1605,121 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
}
} break;
case GGML_OP_SSM_CONV:
{
GGML_ASSERT(src0t == GGML_TYPE_F32);
GGML_ASSERT(src1t == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_CONV_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:11];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:12];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:15];
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:16];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:17];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:18];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne1, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SSM_SCAN:
{
struct ggml_tensor * src3 = gf->nodes[i]->src[3];
struct ggml_tensor * src4 = gf->nodes[i]->src[4];
struct ggml_tensor * src5 = gf->nodes[i]->src[5];
GGML_ASSERT(src3);
GGML_ASSERT(src4);
GGML_ASSERT(src5);
size_t offs_src3 = 0;
size_t offs_src4 = 0;
size_t offs_src5 = 0;
id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
id<MTLBuffer> id_src4 = src4 ? ggml_metal_get_buffer(src4, &offs_src4) : nil;
id<MTLBuffer> id_src5 = src5 ? ggml_metal_get_buffer(src5, &offs_src5) : nil;
const int64_t ne30 = src3->ne[0]; GGML_UNUSED(ne30);
const int64_t ne31 = src3->ne[1]; GGML_UNUSED(ne31);
const uint64_t nb30 = src3->nb[0];
const uint64_t nb31 = src3->nb[1];
const int64_t ne40 = src4->ne[0]; GGML_UNUSED(ne40);
const int64_t ne41 = src4->ne[1]; GGML_UNUSED(ne41);
const int64_t ne42 = src4->ne[2]; GGML_UNUSED(ne42);
const uint64_t nb40 = src4->nb[0];
const uint64_t nb41 = src4->nb[1];
const uint64_t nb42 = src4->nb[2];
const int64_t ne50 = src5->ne[0]; GGML_UNUSED(ne50);
const int64_t ne51 = src5->ne[1]; GGML_UNUSED(ne51);
const int64_t ne52 = src5->ne[2]; GGML_UNUSED(ne52);
const uint64_t nb50 = src5->nb[0];
const uint64_t nb51 = src5->nb[1];
const uint64_t nb52 = src5->nb[2];
const int64_t d_state = ne00;
const int64_t d_inner = ne01;
const int64_t n_seq_tokens = ne11;
const int64_t n_seqs = ne02;
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
[encoder setBuffer:id_src4 offset:offs_src4 atIndex:4];
[encoder setBuffer:id_src5 offset:offs_src5 atIndex:5];
[encoder setBuffer:id_dst offset:offs_dst atIndex:6];
[encoder setBytes:&d_state length:sizeof(d_state) atIndex:7];
[encoder setBytes:&d_inner length:sizeof(d_inner) atIndex:8];
[encoder setBytes:&n_seq_tokens length:sizeof(n_seq_tokens) atIndex:9];
[encoder setBytes:&n_seqs length:sizeof(n_seqs) atIndex:10];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:11];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:12];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:13];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
[encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
[encoder setBytes:&nb20 length:sizeof(nb20) atIndex:18];
[encoder setBytes:&nb21 length:sizeof(nb21) atIndex:19];
[encoder setBytes:&nb22 length:sizeof(nb22) atIndex:20];
[encoder setBytes:&nb30 length:sizeof(nb30) atIndex:21];
[encoder setBytes:&nb31 length:sizeof(nb31) atIndex:22];
[encoder setBytes:&nb40 length:sizeof(nb40) atIndex:23];
[encoder setBytes:&nb41 length:sizeof(nb41) atIndex:24];
[encoder setBytes:&nb42 length:sizeof(nb42) atIndex:25];
[encoder setBytes:&nb50 length:sizeof(nb50) atIndex:26];
[encoder setBytes:&nb51 length:sizeof(nb51) atIndex:27];
[encoder setBytes:&nb52 length:sizeof(nb52) atIndex:28];
[encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_MUL_MAT:
{
GGML_ASSERT(ne00 == ne10);
@ -2624,9 +2806,14 @@ static enum ggml_status ggml_metal_graph_compute(
float scale;
float max_bias;
float logit_softcap;
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
memcpy(&logit_softcap, ((int32_t *) dst->op_params) + 2, sizeof(logit_softcap));
if (logit_softcap != 0.0f) {
scale /= logit_softcap;
}
const uint32_t n_head = src0->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
@ -2701,6 +2888,7 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBytes:&m0 length:sizeof(m0) atIndex:25];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:26];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:27];
[encoder setBytes:&logit_softcap length:sizeof(logit_softcap) atIndex:28];
if (!use_vec_kernel) {
// half8x8 kernel

View file

@ -17,7 +17,7 @@ enum ggml_sort_order {
GGML_SORT_ORDER_DESC,
};
// general-purpose kernel for addition, multiplication and division of two tensors
// general-purpose kernel for addition, subtraction, multiplication and division of two tensors
// pros: works for non-contiguous tensors, supports broadcast across all dims
// cons: not very efficient
kernel void kernel_add(
@ -70,6 +70,56 @@ kernel void kernel_add(
}
}
kernel void kernel_sub(
device const char * src0,
device const char * src1,
device char * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant int64_t & offs,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig.z;
const int64_t i02 = tgpig.y;
const int64_t i01 = tgpig.x;
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs;
device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + offs;
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
const int i10 = i0 % ne10;
*((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) - *((device float *)(src1_ptr + i10*nb10));
}
}
kernel void kernel_mul(
device const char * src0,
device const char * src1,
@ -226,6 +276,15 @@ kernel void kernel_add_row(
dst[tpig] = src0[tpig] + src1[tpig % nb];
}
kernel void kernel_sub_row(
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
constant uint64_t & nb [[buffer(28)]],
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] - src1[tpig % nb];
}
kernel void kernel_mul_row(
device const float4 * src0,
device const float4 * src1,
@ -358,6 +417,27 @@ kernel void kernel_sqr(
dst[tpig] = src0[tpig] * src0[tpig];
}
kernel void kernel_sqrt(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = sqrt(src0[tpig]);
}
kernel void kernel_sin(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = sin(src0[tpig]);
}
kernel void kernel_cos(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = cos(src0[tpig]);
}
kernel void kernel_sum_rows(
device const float * src0,
device float * dst,
@ -667,6 +747,127 @@ kernel void kernel_diag_mask_inf_8(
}
}
// ref: ggml.c:ggml_compute_forward_ssm_conv_f32
// TODO: optimize
kernel void kernel_ssm_conv_f32(
device const void * src0,
device const void * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant int64_t & ne10,
constant int64_t & ne11,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t ir = tgpig.x;
const int64_t i2 = tgpig.y;
const int64_t i3 = tgpig.z;
const int64_t nc = ne10;
const int64_t ncs = ne00;
const int64_t nr = ne01;
const int64_t n_t = ne1;
const int64_t n_s = ne2;
device const float * s = (device const float *) ((device const char *) src0 + ir*nb01 + i2*nb00 + i3*nb02);
device const float * c = (device const float *) ((device const char *) src1 + ir*nb11);
device float * x = (device float *) ((device char *) dst + ir*nb0 + i2*nb1 + i3*nb2);
float sumf = 0.0f;
for (int64_t i0 = 0; i0 < nc; ++i0) {
sumf += s[i0] * c[i0];
}
x[0] = sumf;
}
// ref: ggml.c:ggml_compute_forward_ssm_scan_f32
// TODO: optimize
kernel void kernel_ssm_scan_f32(
device const void * src0,
device const void * src1,
device const void * src2,
device const void * src3,
device const void * src4,
device const void * src5,
device float * dst,
constant int64_t & d_state,
constant int64_t & d_inner,
constant int64_t & n_seq_tokens,
constant int64_t & n_seqs,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant uint64_t & nb20,
constant uint64_t & nb21,
constant uint64_t & nb22,
constant uint64_t & nb30,
constant uint64_t & nb31,
constant uint64_t & nb40,
constant uint64_t & nb41,
constant uint64_t & nb42,
constant uint64_t & nb50,
constant uint64_t & nb51,
constant uint64_t & nb52,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t ir = tgpig.x;
const int64_t i3 = tgpig.y;
const int64_t nc = d_state;
const int64_t nr = d_inner;
const int64_t n_t = n_seq_tokens;
const int64_t n_s = n_seqs;
for (int64_t i2 = 0; i2 < n_t; ++i2) {
device const float * s0 = (device const float *) ((device const char *) src0 + ir*nb01 + i3*nb02);
device const float * x = (device const float *) ((device const char *) src1 + ir*nb10 + i2*nb11 + i3*nb12);
device const float * dt = (device const float *) ((device const char *) src2 + ir*nb20 + i2*nb21 + i3*nb22);
device const float * A = (device const float *) ((device const char *) src3 + ir*nb31);
device const float * B = (device const float *) ((device const char *) src4 + i2*nb41 + i3*nb42);
device const float * C = (device const float *) ((device const char *) src5 + i2*nb51 + i3*nb52);
device float * y = (device float *) ((device char *) dst + ir*nb10 + i2*nb11 + i3*nb12); // TODO: do not use src1 strides
device float * s = (device float *) ((device char *) dst + ir*nb01 + i3*nb02 + nb13);
if (i2 > 0) {
s0 = s;
}
// i1 == 0
float dt_soft_plus = dt[0] <= 20.0f ? log(1.0f + exp(dt[0])) : dt[0];
float x_dt = x[0] * dt_soft_plus;
float sumf = 0.0f;
for (int64_t i0 = 0; i0 < nc; ++i0) {
int64_t i = i0;
float state = (s0[i] * exp(dt_soft_plus * A[i])) + (B[i0] * x_dt);
sumf += state * C[i0];
s[i] = state;
}
y[0] = sumf;
}
}
kernel void kernel_norm(
device const void * src0,
device float * dst,
@ -1976,6 +2177,7 @@ typedef void (flash_attn_ext_f16_t)(
constant float & m0,
constant float & m1,
constant uint32_t & n_head_log2,
constant float & logit_softcap,
threadgroup half * shared,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
@ -2014,6 +2216,7 @@ kernel void kernel_flash_attn_ext_f16(
constant float & m0,
constant float & m1,
constant uint32_t & n_head_log2,
constant float & logit_softcap,
threadgroup half * shared [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
@ -2138,19 +2341,6 @@ kernel void kernel_flash_attn_ext_f16(
}
simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
const short tx = tiisg%4;
const short ty = tiisg/4;
if (mask != q) {
// mqk = mqk*scale + mask*slope
ss[8*cc + ty*TF + 2*tx + 0] = scale*ss[8*cc + ty*TF + 2*tx + 0] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 0];
ss[8*cc + ty*TF + 2*tx + 1] = scale*ss[8*cc + ty*TF + 2*tx + 1] + slope*mp[ic + 8*cc + ty*nb31/sizeof(half) + 2*tx + 1];
} else {
// mqk = mqk*scale
ss[8*cc + ty*TF + 2*tx + 0] *= scale;
ss[8*cc + ty*TF + 2*tx + 1] *= scale;
}
}
}
@ -2162,10 +2352,19 @@ kernel void kernel_flash_attn_ext_f16(
float ms[Q];
for (short j = 0; j < Q; ++j) {
const short p = tiisg;
const float m = M[j];
const float s = ss[j*TF + p];
// scale and apply the logitcap / mask
float s = ss[j*TF + tiisg]*scale;
if (logit_softcap != 0.0f) {
s = logit_softcap*precise::tanh(s);
}
if (mask != q) {
// mqk = mqk + mask*slope
s += slope*mp[ic + j*nb31/sizeof(half) + tiisg];
}
smax = simd_max(max(smax, s));
M[j] = simd_max(max(M[j], s));
@ -2176,7 +2375,7 @@ kernel void kernel_flash_attn_ext_f16(
S[j] = S[j]*ms[j] + simd_sum(vs);
// the P matrix from the paper (Q rows, C columns)
ss[j*TF + p] = vs;
ss[j*TF + tiisg] = vs;
}
// create a QxQ diagonal matrix for rescaling the output
@ -2345,6 +2544,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
constant float & m0,
constant float & m1,
constant uint32_t & n_head_log2,
constant float & logit_softcap,
threadgroup half * shared [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
@ -2479,7 +2679,13 @@ kernel void kernel_flash_attn_ext_vec_f16(
// mqk = mqk*scale + mask*slope
if (tiisg == 0) {
mqk = mqk*scale + ((mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f);
mqk *= scale;
if (logit_softcap != 0.0f) {
mqk = logit_softcap*precise::tanh(mqk);
}
mqk += (mask != q) ? ((float4) mp4[ic/4 + cc])*slope : (float4) 0.0f;
ss4[cc] = mqk;
}

View file

@ -3644,7 +3644,7 @@ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
quantize_row_q8_K_ref(x, y, k);
}
//===================================== Dot ptoducts =================================
//===================================== Dot products =================================
//
// Helper functions

View file

@ -82,17 +82,18 @@ static_assert(sizeof(rpc_tensor) % 8 == 0, "rpc_tensor size must be multiple of
// RPC commands
enum rpc_cmd {
ALLOC_BUFFER = 0,
GET_ALIGNMENT,
GET_MAX_SIZE,
BUFFER_GET_BASE,
FREE_BUFFER,
BUFFER_CLEAR,
SET_TENSOR,
GET_TENSOR,
COPY_TENSOR,
GRAPH_COMPUTE,
GET_DEVICE_MEMORY,
RPC_CMD_ALLOC_BUFFER = 0,
RPC_CMD_GET_ALIGNMENT,
RPC_CMD_GET_MAX_SIZE,
RPC_CMD_BUFFER_GET_BASE,
RPC_CMD_FREE_BUFFER,
RPC_CMD_BUFFER_CLEAR,
RPC_CMD_SET_TENSOR,
RPC_CMD_GET_TENSOR,
RPC_CMD_COPY_TENSOR,
RPC_CMD_GRAPH_COMPUTE,
RPC_CMD_GET_DEVICE_MEMORY,
RPC_CMD_COUNT,
};
// RPC data structures
@ -330,7 +331,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t
uint64_t remote_ptr = ctx->remote_ptr;
memcpy(input.data(), &remote_ptr, sizeof(remote_ptr));
std::vector<uint8_t> output;
bool status = send_rpc_cmd(ctx->sock, FREE_BUFFER, input, output);
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, input, output);
GGML_ASSERT(status);
GGML_ASSERT(output.empty());
delete ctx;
@ -346,7 +347,7 @@ GGML_CALL static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t b
uint64_t remote_ptr = ctx->remote_ptr;
memcpy(input.data(), &remote_ptr, sizeof(remote_ptr));
std::vector<uint8_t> output;
bool status = send_rpc_cmd(ctx->sock, BUFFER_GET_BASE, input, output);
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, input, output);
GGML_ASSERT(status);
GGML_ASSERT(output.size() == sizeof(uint64_t));
// output serialization format: | base_ptr (8 bytes) |
@ -405,7 +406,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t b
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
std::vector<uint8_t> output;
bool status = send_rpc_cmd(ctx->sock, SET_TENSOR, input, output);
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input, output);
GGML_ASSERT(status);
}
@ -419,7 +420,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t b
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &size, sizeof(size));
std::vector<uint8_t> output;
bool status = send_rpc_cmd(ctx->sock, GET_TENSOR, input, output);
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_GET_TENSOR, input, output);
GGML_ASSERT(status);
GGML_ASSERT(output.size() == size);
// output serialization format: | data (size bytes) |
@ -444,7 +445,7 @@ GGML_CALL static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t b
memcpy(input.data(), &rpc_src, sizeof(rpc_src));
memcpy(input.data() + sizeof(rpc_src), &rpc_dst, sizeof(rpc_dst));
std::vector<uint8_t> output;
bool status = send_rpc_cmd(ctx->sock, COPY_TENSOR, input, output);
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_COPY_TENSOR, input, output);
GGML_ASSERT(status);
// output serialization format: | result (1 byte) |
GGML_ASSERT(output.size() == 1);
@ -459,7 +460,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer
memcpy(input.data(), &ctx->remote_ptr, sizeof(ctx->remote_ptr));
memcpy(input.data() + sizeof(ctx->remote_ptr), &value, sizeof(value));
std::vector<uint8_t> output;
bool status = send_rpc_cmd(ctx->sock, BUFFER_CLEAR, input, output);
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_CLEAR, input, output);
GGML_ASSERT(status);
}
@ -488,7 +489,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
memcpy(input.data(), &size, sizeof(size));
std::vector<uint8_t> output;
auto sock = get_socket(buft_ctx->endpoint);
bool status = send_rpc_cmd(sock, ALLOC_BUFFER, input, output);
bool status = send_rpc_cmd(sock, RPC_CMD_ALLOC_BUFFER, input, output);
GGML_ASSERT(status);
GGML_ASSERT(output.size() == 2*sizeof(uint64_t));
// output serialization format: | remote_ptr (8 bytes) | remote_size (8 bytes) |
@ -511,7 +512,7 @@ static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
// input serialization format: | 0 bytes |
std::vector<uint8_t> input;
std::vector<uint8_t> output;
bool status = send_rpc_cmd(sock, GET_ALIGNMENT, input, output);
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALIGNMENT, input, output);
GGML_ASSERT(status);
GGML_ASSERT(output.size() == sizeof(uint64_t));
// output serialization format: | alignment (8 bytes) |
@ -529,7 +530,7 @@ static size_t get_max_size(const std::shared_ptr<socket_t> & sock) {
// input serialization format: | 0 bytes |
std::vector<uint8_t> input;
std::vector<uint8_t> output;
bool status = send_rpc_cmd(sock, GET_MAX_SIZE, input, output);
bool status = send_rpc_cmd(sock, RPC_CMD_GET_MAX_SIZE, input, output);
GGML_ASSERT(status);
GGML_ASSERT(output.size() == sizeof(uint64_t));
// output serialization format: | max_size (8 bytes) |
@ -622,7 +623,7 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t
serialize_graph(cgraph, input);
std::vector<uint8_t> output;
auto sock = get_socket(rpc_ctx->endpoint);
bool status = send_rpc_cmd(sock, GRAPH_COMPUTE, input, output);
bool status = send_rpc_cmd(sock, RPC_CMD_GRAPH_COMPUTE, input, output);
GGML_ASSERT(status);
GGML_ASSERT(output.size() == 1);
return (enum ggml_status)output[0];
@ -636,7 +637,7 @@ GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const
}
GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
return false;
}
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
@ -678,6 +679,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
}
auto sock = get_socket(endpoint);
if (sock == nullptr) {
fprintf(stderr, "Failed to connect to %s\n", endpoint);
return nullptr;
}
size_t alignment = get_alignment(sock);
@ -719,7 +721,7 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
// input serialization format: | 0 bytes |
std::vector<uint8_t> input;
std::vector<uint8_t> output;
bool status = send_rpc_cmd(sock, GET_DEVICE_MEMORY, input, output);
bool status = send_rpc_cmd(sock, RPC_CMD_GET_DEVICE_MEMORY, input, output);
GGML_ASSERT(status);
GGML_ASSERT(output.size() == 2*sizeof(uint64_t));
// output serialization format: | free (8 bytes) | total (8 bytes) |
@ -1098,59 +1100,69 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
if (!recv_data(sockfd, &cmd, 1)) {
break;
}
if (cmd >= RPC_CMD_COUNT) {
// fail fast if the command is invalid
fprintf(stderr, "Unknown command: %d\n", cmd);
break;
}
std::vector<uint8_t> input;
std::vector<uint8_t> output;
uint64_t input_size;
if (!recv_data(sockfd, &input_size, sizeof(input_size))) {
break;
}
try {
input.resize(input_size);
} catch (const std::bad_alloc & e) {
fprintf(stderr, "Failed to allocate input buffer of size %" PRIu64 "\n", input_size);
break;
}
if (!recv_data(sockfd, input.data(), input_size)) {
break;
}
bool ok = true;
switch (cmd) {
case ALLOC_BUFFER: {
case RPC_CMD_ALLOC_BUFFER: {
ok = server.alloc_buffer(input, output);
break;
}
case GET_ALIGNMENT: {
case RPC_CMD_GET_ALIGNMENT: {
server.get_alignment(output);
break;
}
case GET_MAX_SIZE: {
case RPC_CMD_GET_MAX_SIZE: {
server.get_max_size(output);
break;
}
case BUFFER_GET_BASE: {
case RPC_CMD_BUFFER_GET_BASE: {
ok = server.buffer_get_base(input, output);
break;
}
case FREE_BUFFER: {
case RPC_CMD_FREE_BUFFER: {
ok = server.free_buffer(input);
break;
}
case BUFFER_CLEAR: {
case RPC_CMD_BUFFER_CLEAR: {
ok = server.buffer_clear(input);
break;
}
case SET_TENSOR: {
case RPC_CMD_SET_TENSOR: {
ok = server.set_tensor(input);
break;
}
case GET_TENSOR: {
case RPC_CMD_GET_TENSOR: {
ok = server.get_tensor(input, output);
break;
}
case COPY_TENSOR: {
case RPC_CMD_COPY_TENSOR: {
ok = server.copy_tensor(input, output);
break;
}
case GRAPH_COMPUTE: {
case RPC_CMD_GRAPH_COMPUTE: {
ok = server.graph_compute(input, output);
break;
}
case GET_DEVICE_MEMORY: {
case RPC_CMD_GET_DEVICE_MEMORY: {
// output serialization format: | free (8 bytes) | total (8 bytes) |
output.resize(2*sizeof(uint64_t), 0);
memcpy(output.data(), &free_mem, sizeof(free_mem));
@ -1203,8 +1215,10 @@ void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free
return;
}
printf("Accepted client connection, free_mem=%zu, total_mem=%zu\n", free_mem, total_mem);
fflush(stdout);
rpc_serve_client(backend, client_socket->fd, free_mem, total_mem);
printf("Client connection closed\n");
fflush(stdout);
}
#ifdef _WIN32
WSACleanup();

View file

@ -38,6 +38,7 @@
#include "ggml-sycl/backend.hpp"
#include "ggml-sycl/presets.hpp"
#include "ggml-sycl/gemm.hpp"
bool ggml_sycl_loaded(void);
void ggml_sycl_free_data(struct ggml_tensor * tensor);
@ -893,43 +894,6 @@ static void clamp_f32(const float * x, float * dst, const float min, const float
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
}
template <typename T>
static void im2col_kernel(const float *x, T *dst, int offset_delta,
int IW, int IH, int OW, int KW, int KH,
int pelements, int CHW, int s0, int s1, int p0,
int p1, int d0, int d1,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_local_id(2) +
item_ct1.get_group(2) * item_ct1.get_local_range(2);
if (i >= pelements) {
return;
}
const int ksize = OW * (KH > 1 ? KW : 1);
const int kx = i / ksize;
const int kd = kx * ksize;
const int ky = (i - kd) / OW;
const int ix = i % OW;
const int64_t iiw = ix * s0 + kx * d0 - p0;
const int64_t iih = item_ct1.get_group(1) * s1 + ky * d1 - p1;
const int64_t offset_dst =
(item_ct1.get_group(1) * OW + ix) * CHW +
(item_ct1.get_group(0) * (KW * KH) + ky * KW + kx);
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] =
sycl::vec<float, 1>(0.0f)
.convert<sycl::half, sycl::rounding_mode::automatic>()[0];
} else {
const int64_t offset_src = item_ct1.get_group(0) * offset_delta;
dst[offset_dst] =
sycl::vec<float, 1>(x[offset_src + iih * IW + iiw])
.convert<sycl::half, sycl::rounding_mode::automatic>()[0];
}
}
template <typename Ti, typename To>
static void pool2d_nchw_kernel(
const int ih, const int iw, const int oh, const int ow,
@ -1742,32 +1706,6 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
});
}
template <typename T>
static void im2col_sycl(const float *x, T *dst, int IW, int IH,
int OW, int OH, int KW, int KH, int IC,
int offset_delta, int s0, int s1, int p0,
int p1, int d0, int d1,
queue_ptr stream) {
const int parallel_elements = OW * KW * KH;
const int num_blocks = (parallel_elements + SYCL_IM2COL_BLOCK_SIZE - 1) / SYCL_IM2COL_BLOCK_SIZE;
sycl::range<3> block_nums(IC, OH, num_blocks);
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(block_nums *
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) {
im2col_kernel(x, dst, offset_delta, IW, IH, OW, KW, KH,
parallel_elements, (IC * KH * KW), s0, s1, p0,
p1, d0, d1, item_ct1);
});
}
}
static bool g_sycl_loaded = false;
bool ggml_sycl_loaded(void) {
@ -2545,6 +2483,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
const sycl::half alpha_f16 = 1.0f;
const sycl::half beta_f16 = 0.0f;
#if !GGML_SYCL_DNNL
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
*stream, oneapi::mkl::transpose::trans,
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
@ -2554,6 +2493,13 @@ inline void ggml_sycl_op_mul_mat_sycl(
dpct::library_data_t::real_half)));
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
#else
auto dnnl_stream = ctx.stream_dnnl(stream);
DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
#endif
}
else {
// GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
@ -2576,13 +2522,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
const float alpha = 1.0f;
const float beta = 0.0f;
#if !GGML_SYCL_DNNL
SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
*stream, oneapi::mkl::transpose::trans,
oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
dpct::get_value(&alpha, *stream), src0_ddf_i, ne00,
src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
dst_dd_i, ldc)));
#else
auto dnnl_stream = ctx.stream_dnnl(stream);
DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
#endif
}
(void) dst;
(void) src1_ddq_i;
@ -2636,47 +2587,6 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens
(void) src1_dd;
}
inline void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst,
const float *src0_dd, const float *src1_dd,
float *dst_dd,
const queue_ptr &main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
const int64_t IC = src1->ne[is_2D ? 2 : 1];
const int64_t IH = is_2D ? src1->ne[1] : 1;
const int64_t IW = src1->ne[0];
const int64_t KH = is_2D ? src0->ne[1] : 1;
const int64_t KW = src0->ne[0];
const int64_t OH = is_2D ? dst->ne[2] : 1;
const int64_t OW = dst->ne[1];
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
if (dst->type == GGML_TYPE_F16) {
im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
} else {
im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
}
(void) src0;
(void) src0_dd;
}
inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst,
const float *src0_dd, const float *src1_dd,
@ -3581,7 +3491,8 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE
&& (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda || src1->ne[1] > MMVQ_MIN_BATCH_SIZE);
bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

View file

@ -25,5 +25,6 @@
#include "norm.hpp"
#include "softmax.hpp"
#include "tsembd.hpp"
#include "im2col.hpp"
#endif // GGML_SYCL_BACKEND_HPP

View file

@ -51,3 +51,14 @@ void ggml_sycl_host_free(void* ptr) try {
<< ", line:" << __LINE__ << std::endl;
std::exit(1);
}
int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) {
const int64_t max_range = std::numeric_limits<int>::max();
int64_t sycl_down_blk_size = block_size;
int64_t global_range = accumulate_block_num * sycl_down_blk_size;
while(global_range > max_range) {
sycl_down_blk_size /= 2;
global_range = accumulate_block_num * sycl_down_blk_size;
}
return sycl_down_blk_size;
}

View file

@ -19,6 +19,10 @@
#include "dpct/helper.hpp"
#include "ggml-sycl.h"
#include "presets.hpp"
#if GGML_SYCL_DNNL
#include "dnnl.hpp"
#include "dnnl_sycl.hpp"
#endif
#define GGML_COMMON_DECL_SYCL
#define GGML_COMMON_IMPL_SYCL
@ -130,6 +134,7 @@ typedef sycl::float2 dfloat2;
#endif // GGML_SYCL_F16
#define MMVQ_MAX_BATCH_SIZE 8
#define MMVQ_MIN_BATCH_SIZE 4
static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
@ -276,6 +281,52 @@ struct ggml_backend_sycl_context {
return stream(device, 0);
}
#if GGML_SYCL_DNNL
dnnl::engine make_engine(sycl::queue* q) {
// Get the device associated with the queue
sycl::device dev = q->get_device();
// Get the context associated with the queue
sycl::context ctx = q->get_context();
const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
return eng;
}
std::unordered_map<sycl::queue*, dnnl::stream> stream_map;
std::unordered_map<sycl::queue*, dnnl::engine> engine_map;
dnnl::stream stream_dnnl(int device, int _stream) {
auto q = stream(device, _stream);
return stream_dnnl(q);
}
dnnl::engine engine_dnnl(sycl::queue* qptr) {
auto it = engine_map.find(qptr);
if (it == engine_map.end()) {
auto eng = make_engine(qptr);
engine_map[qptr] = eng;
return eng;
}
else
{
return it->second;
}
}
dnnl::stream stream_dnnl(sycl::queue* qptr) {
auto it = stream_map.find(qptr);
if (it == stream_map.end()) {
auto eng = engine_dnnl(qptr);
auto stream = dnnl::sycl_interop::make_stream(eng, *qptr);
stream_map[qptr] = stream;
return stream;
}
else
{
return it->second;
}
}
dnnl::stream stream_dnnl() {
return stream_dnnl(device, 0);
}
#endif
// pool
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
@ -352,4 +403,6 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
}
int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
#endif // GGML_SYCL_COMMON_HPP

View file

@ -3,19 +3,19 @@
#include "presets.hpp"
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
const sycl::nd_item<3> &item_ct1) {
const int i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
const int64_t i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2));
if (i >= k) {
return;
}
const int ib = i/qk; // block index
const int iqs = (i%qk)/qr; // quant index
const int iybs = i - i%qk; // y block start index
const int y_offset = qr == 1 ? 1 : qk/2;
const int64_t ib = i/qk; // block index
const int64_t iqs = (i%qk)/qr; // quant index
const int64_t iybs = i - i%qk; // y block start index
const int64_t y_offset = qr == 1 ? 1 : qk/2;
// dequantize
dfloat2 v;
@ -27,9 +27,9 @@ static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static void dequantize_block_sycl(const void *__restrict__ vx,
dst_t *__restrict__ y, const int k,
dst_t *__restrict__ y, const int64_t k,
dpct::queue_ptr stream) {
const int num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE);
const int64_t num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE);
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@ -45,9 +45,9 @@ static void dequantize_block_sycl(const void *__restrict__ vx,
}
template <typename dst_t>
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
const int64_t nb = k / QK_K;
#if QK_K == 256
{
dpct::has_capability_or_fail(stream->get_device(),
@ -77,9 +77,9 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
}
template <typename dst_t>
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
const int64_t nb = k / QK_K;
#if QK_K == 256
{
dpct::has_capability_or_fail(stream->get_device(),
@ -108,10 +108,10 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
}
template <typename dst_t>
static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb32 = k / 32;
const int nb = (k + 255) / 256;
const int64_t nb32 = k / 32;
const int64_t nb = (k + 255) / 256;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@ -126,10 +126,10 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int k,
}
template <typename dst_t>
static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb32 = k / 32;
const int nb = (k + 255) / 256;
const int64_t nb32 = k / 32;
const int64_t nb = (k + 255) / 256;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@ -145,9 +145,9 @@ static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int k,
template <typename dst_t>
static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
const int64_t nb = k / QK_K;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@ -165,9 +165,9 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
}
template <typename dst_t>
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
const int64_t nb = k / QK_K;
#if QK_K == 256
{
dpct::has_capability_or_fail(stream->get_device(),
@ -197,9 +197,9 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
}
template <typename dst_t>
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
const int64_t nb = k / QK_K;
#if QK_K == 256
{
dpct::has_capability_or_fail(stream->get_device(),
@ -229,9 +229,9 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
}
template <typename dst_t>
static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
const int64_t nb = k / QK_K;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@ -250,9 +250,9 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
}
template <typename dst_t>
static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
const int64_t nb = k / QK_K;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@ -271,9 +271,9 @@ static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k,
}
template <typename dst_t>
static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
const int64_t nb = k / QK_K;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@ -292,9 +292,9 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k,
}
template <typename dst_t>
static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
const int64_t nb = k / QK_K;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@ -313,9 +313,9 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k,
}
template <typename dst_t>
static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
const int64_t nb = k / QK_K;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@ -333,9 +333,9 @@ static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k,
template <typename dst_t>
static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
const int64_t nb = k / QK_K;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@ -354,9 +354,9 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k,
}
template <typename dst_t>
static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = k / QK_K;
const int64_t nb = k / QK_K;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@ -374,9 +374,9 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k,
}
template <typename dst_t>
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = (k + QK_K - 1) / QK_K;
const int64_t nb = (k + QK_K - 1) / QK_K;
#if QK_K == 64
dequantize_row_iq4_nl_sycl(vx, y, k, stream);
#else
@ -398,9 +398,9 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k,
}
template <typename dst_t>
static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k,
static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
const int nb = (k + QK_K - 1) / QK_K;
const int64_t nb = (k + QK_K - 1) / QK_K;
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
@ -418,34 +418,34 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k,
}
template <typename src_t, typename dst_t>
static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
if (i >= k) {
return;
}
const int64_t work_group_size = item_ct1.get_local_range(2);
const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
// make each work-item deal with more elements since sycl global range can not exceed max int
const src_t * x = (src_t *) vx;
for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
y[i] = x[i];
}
}
template <typename src_t, typename dst_t>
static void convert_unary_sycl(const void *__restrict__ vx,
dst_t *__restrict__ y, const int k,
dst_t *__restrict__ y, const int64_t k,
dpct::queue_ptr stream) {
const int num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
const int64_t num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
// decrease global range when it exceeds the max int
int64_t local_size = downsample_sycl_global_range(num_blocks, SYCL_DEQUANTIZE_BLOCK_SIZE);
sycl::range<3> block_nums(1, 1, num_blocks);
sycl::range<3> local_range(1, 1, local_size);
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(
sycl::range<3>(1, 1, num_blocks) *
sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
sycl::nd_range<3>(block_nums * local_range, local_range),
[=](sycl::nd_item<3> item_ct1) {
convert_unary<src_t>(vx, y, k, item_ct1);
});

View file

@ -17,7 +17,7 @@
template <typename T>
using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
int k, dpct::queue_ptr stream);
int64_t k, dpct::queue_ptr stream);
typedef to_t_sycl_t<float> to_fp32_sycl_t;
typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;

View file

@ -15,9 +15,9 @@
#include "common.hpp"
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib,
static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib,
const int iqs, dfloat2 &v) {
const block_q4_0 * x = (const block_q4_0 *) vx;
@ -40,7 +40,7 @@ static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib,
#endif // GGML_SYCL_F16
}
static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
const int iqs, dfloat2 &v) {
const block_q4_1 * x = (const block_q4_1 *) vx;
@ -64,7 +64,7 @@ static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
#endif // GGML_SYCL_F16
}
static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib,
static __dpct_inline__ void dequantize_q5_0(const void *vx, const int64_t ib,
const int iqs, dfloat2 &v) {
const block_q5_0 * x = (const block_q5_0 *) vx;
@ -91,7 +91,7 @@ static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib,
#endif // GGML_SYCL_F16
}
static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
static __dpct_inline__ void dequantize_q5_1(const void *vx, const int64_t ib,
const int iqs, dfloat2 &v) {
const block_q5_1 * x = (const block_q5_1 *) vx;
@ -118,7 +118,7 @@ static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
#endif // GGML_SYCL_F16
}
static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib,
static __dpct_inline__ void dequantize_q8_0(const void *vx, const int64_t ib,
const int iqs, dfloat2 &v) {
const block_q8_0 * x = (const block_q8_0 *) vx;
@ -138,16 +138,16 @@ static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib,
}
template<typename dst_t>
static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32,
static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
// assume 32 threads
const int tid = item_ct1.get_local_id(2);
const int il = tid/8;
const int ir = tid%8;
const int ib = 8*i + ir;
const int64_t tid = item_ct1.get_local_id(2);
const int64_t il = tid/8;
const int64_t ir = tid%8;
const int64_t ib = 8*i + ir;
if (ib >= nb32) {
return;
}
@ -168,16 +168,16 @@ static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restri
}
template<typename dst_t>
static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32,
static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
// assume 32 threads
const int tid = item_ct1.get_local_id(2);
const int il = tid/8;
const int ir = tid%8;
const int ib = 8*i + ir;
const int64_t tid = item_ct1.get_local_id(2);
const int64_t il = tid/8;
const int64_t ir = tid%8;
const int64_t ib = 8*i + ir;
if (ib >= nb32) {
return;
}
@ -203,14 +203,14 @@ template<typename dst_t>
static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
const block_q2_K * x = (const block_q2_K *) vx;
const int tid = item_ct1.get_local_id(2);
const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
const int n = tid/32;
const int l = tid - 32*n;
const int is = 8*n + l/16;
const int64_t n = tid/32;
const int64_t l = tid - 32*n;
const int64_t is = 8*n + l/16;
const uint8_t q = x[i].qs[32*n + l];
dst_t * y = yy + i*QK_K + 128*n;
@ -222,8 +222,8 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
#else
const int is = tid/16; // 0 or 1
const int il = tid%16; // 0...15
const int64_t is = tid/16; // 0 or 1
const int64_t il = tid%16; // 0...15
const uint8_t q = x[i].qs[il] >> (2*is);
dst_t * y = yy + i*QK_K + 16*is + il;
@ -239,19 +239,19 @@ template<typename dst_t>
static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
const block_q3_K * x = (const block_q3_K *) vx;
#if QK_K == 256
const int r = item_ct1.get_local_id(2) / 4;
const int tid = r/2;
const int is0 = r%2;
const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
const int n = tid / 4;
const int j = tid - 4*n;
const int64_t r = item_ct1.get_local_id(2) / 4;
const int64_t tid = r/2;
const int64_t is0 = r%2;
const int64_t l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
const int64_t n = tid / 4;
const int64_t j = tid - 4*n;
uint8_t m = 1 << (4*n + j);
int is = 8*n + 2*j + is0;
int64_t is = 8*n + 2*j + is0;
int shift = 2*j;
int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
@ -267,11 +267,11 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
#else
const int tid = item_ct1.get_local_id(2);
const int is = tid/16; // 0 or 1
const int il = tid%16; // 0...15
const int im = il/8; // 0...1
const int in = il%8; // 0...7
const int64_t tid = item_ct1.get_local_id(2);
const int64_t is = tid/16; // 0 or 1
const int64_t il = tid%16; // 0...15
const int64_t im = il/8; // 0...1
const int64_t in = il%8; // 0...7
dst_t * y = yy + i*QK_K + 16*is + il;
@ -307,15 +307,15 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) {
const block_q4_K * x = (const block_q4_K *) vx;
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
#if QK_K == 256
// assume 32 threads
const int tid = item_ct1.get_local_id(2);
const int il = tid/8;
const int ir = tid%8;
const int is = 2*il;
const int n = 4;
const int64_t tid = item_ct1.get_local_id(2);
const int64_t il = tid/8;
const int64_t ir = tid%8;
const int64_t is = 2*il;
const int64_t n = 4;
dst_t * y = yy + i*QK_K + 64*il + n*ir;
@ -341,7 +341,7 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
y[l +32] = d2 * (q_vec[l] >> 4) - m2;
}
#else
const int tid = item_ct1.get_local_id(2);
const int64_t tid = item_ct1.get_local_id(2);
const uint8_t * q = x[i].qs;
dst_t * y = yy + i*QK_K;
const float d = (float)x[i].dm[0];
@ -356,14 +356,14 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
const sycl::nd_item<3> &item_ct1) {
const block_q5_K * x = (const block_q5_K *) vx;
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
#if QK_K == 256
// assume 64 threads - this is very slightly better than the one below
const int tid = item_ct1.get_local_id(2);
const int il = tid/16; // il is in 0...3
const int ir = tid%16; // ir is in 0...15
const int is = 2*il; // is is in 0...6
const int64_t tid = item_ct1.get_local_id(2);
const int64_t il = tid/16; // il is in 0...3
const int64_t ir = tid%16; // ir is in 0...15
const int64_t is = 2*il; // is is in 0...6
dst_t * y = yy + i*QK_K + 64*il + 2*ir;
@ -386,11 +386,11 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
#else
const int tid = item_ct1.get_local_id(2);
const int64_t tid = item_ct1.get_local_id(2);
const uint8_t q = x[i].qs[tid];
const int im = tid/8; // 0...3
const int in = tid%8; // 0...7
const int is = tid/16; // 0 or 1
const int64_t im = tid/8; // 0...3
const int64_t in = tid%8; // 0...7
const int64_t is = tid/16; // 0 or 1
const uint8_t h = x[i].qh[in] >> im;
const float d = x[i].d;
dst_t * y = yy + i*QK_K + tid;
@ -404,14 +404,14 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
const sycl::nd_item<3> &item_ct1) {
const block_q6_K * x = (const block_q6_K *) vx;
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
#if QK_K == 256
// assume 64 threads - this is very slightly better than the one below
const int tid = item_ct1.get_local_id(2);
const int ip = tid/32; // ip is 0 or 1
const int il = tid - 32*ip; // 0...32
const int is = 8*ip + il/16;
const int64_t tid = item_ct1.get_local_id(2);
const int64_t ip = tid/32; // ip is 0 or 1
const int64_t il = tid - 32*ip; // 0...32
const int64_t is = 8*ip + il/16;
dst_t * y = yy + i*QK_K + 128*ip + il;
@ -428,9 +428,9 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
#else
// assume 32 threads
const int tid = item_ct1.get_local_id(2);
const int ip = tid/16; // 0 or 1
const int il = tid - 16*ip; // 0...15
const int64_t tid = item_ct1.get_local_id(2);
const int64_t ip = tid/16; // 0 or 1
const int64_t il = tid - 16*ip; // 0...15
dst_t * y = yy + i*QK_K + 16*ip + il;
@ -452,13 +452,13 @@ static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __res
const uint8_t *ksigns_iq2xs_ptr,
const uint8_t *kmask_iq2xs_ptr) {
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
const int tid = item_ct1.get_local_id(2);
const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint16_t * q2 = x[i].qs + 4*ib;
const uint8_t * aux8 = (const uint8_t *)q2;
@ -480,13 +480,13 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest
const uint8_t *ksigns_iq2xs,
const uint8_t *kmask_iq2xs) {
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
const block_iq2_xs * x = (const block_iq2_xs *) vx;
const int tid = item_ct1.get_local_id(2);
const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint16_t * q2 = x[i].qs + 4*ib;
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
@ -504,13 +504,13 @@ __dpct_inline__ static void
dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
const block_iq2_s * x = (const block_iq2_s *) vx;
const int tid = item_ct1.get_local_id(2);
const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
@ -532,13 +532,13 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res
const uint8_t *ksigns_iq2xs,
const uint8_t *kmask_iq2xs) {
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
const int tid = item_ct1.get_local_id(2);
const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * q3 = x[i].qs + 8*ib;
const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
@ -563,13 +563,13 @@ dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
const sycl::nd_item<3> &item_ct1,
const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) {
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
const block_iq3_s * x = (const block_iq3_s *) vx;
const int tid = item_ct1.get_local_id(2);
const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * qs = x[i].qs + 8*ib;
const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
@ -593,13 +593,13 @@ dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy,
const sycl::nd_item<3> &item_ct1,
const uint32_t *iq1s_grid_gpu) {
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
const block_iq1_s * x = (const block_iq1_s *) vx;
const int tid = item_ct1.get_local_id(2);
const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
@ -623,13 +623,13 @@ dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy,
const sycl::nd_item<3> &item_ct1,
const uint32_t *iq1s_grid_gpu) {
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
const block_iq1_m * x = (const block_iq1_m *) vx;
const int tid = item_ct1.get_local_id(2);
const int64_t tid = item_ct1.get_local_id(2);
#if QK_K == 256
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint16_t * sc = (const uint16_t *)x[i].scales;
iq1m_scale_t scale;
@ -656,12 +656,12 @@ __dpct_inline__ static void
dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);
const int tid = item_ct1.get_local_id(2);
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t tid = item_ct1.get_local_id(2);
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
const uint8_t * q4 = x[ib].qs + 4*il;
const float d = (float)x[ib].d;
@ -678,12 +678,12 @@ template <typename dst_t>
__dpct_inline__ static void
dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy,
const sycl::nd_item<3> &item_ct1) {
const int i = item_ct1.get_group(2);
const int64_t i = item_ct1.get_group(2);
const block_iq4_xs * x = (const block_iq4_xs *)vx;
const int tid = item_ct1.get_local_id(2);
const int il = tid/8; // 0...3
const int ib = tid%8; // 0...7
const int64_t tid = item_ct1.get_local_id(2);
const int64_t il = tid/8; // 0...3
const int64_t ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 4*il;
const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);

View file

@ -4,7 +4,7 @@
#include "presets.hpp"
static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
const sycl::half *x = (const sycl::half *)vx;
// automatic half -> float type cast if dfloat == float
@ -12,7 +12,7 @@ static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 &
v.y() = x[ib + iqs + 1];
}
static void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
static void convert_f32(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
const float * x = (const float *) vx;
// automatic half -> float type cast if dfloat == float

101
ggml/src/ggml-sycl/gemm.hpp Normal file
View file

@ -0,0 +1,101 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
#ifndef GGML_SYCL_GEMM_HPP
#define GGML_SYCL_GEMM_HPP
#include <fstream>
#include <iostream>
#include "ggml-sycl.h"
#if GGML_SYCL_DNNL
#include "dnnl.hpp"
#include "dnnl_sycl.hpp"
class DnnlGemmWrapper {
public:
using dt = dnnl::memory::data_type;
using tag = dnnl::memory::format_tag;
template<typename T>
static constexpr dt to_dt() {
if constexpr (std::is_same_v<T, float>) return dt::f32;
else if constexpr (std::is_same_v<T, sycl::half>) return dt::f16;
else static_assert(0);
}
static inline void row_gemm(sycl::queue& q, bool a_trans,
bool b_trans, int m, int n, int k,
const void* a, dt at, const void* b, dt bt, void* c, dt ct)
{
// Get the device associated with the queue
sycl::device dev = q.get_device();
// Get the context associated with the queue
sycl::context ctx = q.get_context();
const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
const dnnl::stream stream = dnnl::sycl_interop::make_stream(eng, q);
dnnl::memory::dims a_dims = { m, k };
dnnl::memory::dims b_dims = { k, n };
dnnl::memory::dims c_dims = { m, n };
const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
auto a_mem = dnnl::memory(a_in_md, eng, (void*)a);
auto b_mem = dnnl::memory(b_in_md, eng, (void*)b);
auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
// Create the primitive.
auto matmul_prim = dnnl::matmul(matmul_pd);
// Primitive arguments.
std::unordered_map<int, dnnl::memory> matmul_args;
matmul_args.insert({ DNNL_ARG_SRC, a_mem });
matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
matmul_args.insert({ DNNL_ARG_DST, c_mem });
matmul_prim.execute(stream, matmul_args);
}
static inline void row_gemm(const dnnl::stream& stream, bool a_trans,
bool b_trans, int m, int n, int k,
const void* a, dt at, const void* b, dt bt, void* c, dt ct)
{
auto const eng = stream.get_engine();
dnnl::memory::dims a_dims = { m, k };
dnnl::memory::dims b_dims = { k, n };
dnnl::memory::dims c_dims = { m, n };
const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
auto a_mem = dnnl::memory(a_in_md, eng, (void*)a);
auto b_mem = dnnl::memory(b_in_md, eng, (void*)b);
auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
// Create the primitive.
auto matmul_prim = dnnl::matmul(matmul_pd);
// Primitive arguments.
std::unordered_map<int, dnnl::memory> matmul_args;
matmul_args.insert({ DNNL_ARG_SRC, a_mem });
matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
matmul_args.insert({ DNNL_ARG_DST, c_mem });
matmul_prim.execute(stream, matmul_args);
}
};
#endif
#endif // GGML_SYCL_GEMM_HPP

View file

@ -0,0 +1,125 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
#include "im2col.hpp"
template <typename T>
static void im2col_kernel(
const float *x, T *dst, int64_t batch_offset, int64_t offset_delta,
int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH,
int64_t pelements, int64_t CHW, int s0, int s1, int p0, int p1, int d0, int d1,
const sycl::nd_item<3> &item_ct1) {
const int64_t work_group_size = item_ct1.get_local_range(2);
const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
// make each work-item deal with more elements since sycl global range can not exceed max int
for (int64_t i = global_id; i < pelements; i += work_group_size * item_ct1.get_group_range(2)) {
const int64_t ksize = OW * (KH > 1 ? KW : 1);
const int64_t kx = i / ksize;
const int64_t kd = kx * ksize;
const int64_t ky = (i - kd) / OW;
const int64_t ix = i % OW;
const int64_t oh = item_ct1.get_group(1);
const int64_t batch = item_ct1.get_group(0) / IC;
const int64_t ic = item_ct1.get_group(0) % IC;
const int64_t iiw = ix * s0 + kx * d0 - p0;
const int64_t iih = oh * s1 + ky * d1 - p1;
const int64_t offset_dst =
((batch * OH + oh) * OW + ix) * CHW +
(ic * (KW * KH) + ky * KW + kx);
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] =
sycl::vec<float, 1>(0.0f)
.convert<sycl::half, sycl::rounding_mode::automatic>()[0];
} else {
const int64_t offset_src = ic * offset_delta + batch * batch_offset;
dst[offset_dst] =
sycl::vec<float, 1>(x[offset_src + iih * IW + iiw])
.convert<sycl::half, sycl::rounding_mode::automatic>()[0];
}
}
}
template <typename T>
static void im2col_sycl(
const float *x, T *dst, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW,
int64_t KH, int64_t IC, int64_t batch, int64_t batch_offset, int64_t offset_delta,
int s0, int s1, int p0, int p1, int d0, int d1,
queue_ptr stream) {
const int64_t parallel_elements = OW * KW * KH;
const int64_t num_blocks = (parallel_elements + SYCL_IM2COL_BLOCK_SIZE - 1) / SYCL_IM2COL_BLOCK_SIZE;
// decrease global range when it exceeds the max int
int64_t local_size = downsample_sycl_global_range(batch * IC * OH * num_blocks, SYCL_IM2COL_BLOCK_SIZE);
sycl::range<3> block_nums(batch * IC, OH, num_blocks);
sycl::range<3> local_range(1, 1, local_size);
{
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for(
sycl::nd_range<3>(block_nums * local_range, local_range),
[=](sycl::nd_item<3> item_ct1) {
im2col_kernel(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH,
parallel_elements, (IC * KH * KW), s0, s1, p0,
p1, d0, d1, item_ct1);
});
}
}
void ggml_sycl_op_im2col(
ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd,
const queue_ptr &main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
const int64_t IC = src1->ne[is_2D ? 2 : 1];
const int64_t IH = is_2D ? src1->ne[1] : 1;
const int64_t IW = src1->ne[0];
const int64_t KH = is_2D ? src0->ne[1] : 1;
const int64_t KW = src0->ne[0];
const int64_t OH = is_2D ? dst->ne[2] : 1;
const int64_t OW = dst->ne[1];
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
const int64_t batch = src1->ne[3];
const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32
if (dst->type == GGML_TYPE_F16) {
im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
} else {
im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
}
(void) src0;
(void) src0_dd;
}

View file

@ -0,0 +1,23 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
#ifndef GGML_SYCL_IM2COL_HPP
#define GGML_SYCL_IM2COL_HPP
#include "common.hpp"
void ggml_sycl_op_im2col(
ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd,
const queue_ptr &main_stream);
#endif // GGML_SYCL_IM2COL_HPP

View file

@ -180,6 +180,7 @@ struct vk_device_struct {
vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
vk_pipeline pipeline_acc_f32;
vk_pipeline pipeline_add_f32, pipeline_add_f16_f32_f16;
vk_pipeline pipeline_mul_f32;
vk_pipeline pipeline_div_f32;
@ -187,6 +188,8 @@ struct vk_device_struct {
vk_pipeline pipeline_upscale_f32;
vk_pipeline pipeline_scale_f32;
vk_pipeline pipeline_sqr_f32;
vk_pipeline pipeline_sin_f32;
vk_pipeline pipeline_cos_f32;
vk_pipeline pipeline_clamp_f32;
vk_pipeline pipeline_pad_f32;
vk_pipeline pipeline_repeat_f32;
@ -1687,6 +1690,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_div_f32, "div_f32", div_f32_len, div_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
@ -1699,6 +1704,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
@ -3971,6 +3978,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_get_rows_f32[src0->type];
}
return nullptr;
case GGML_OP_ACC:
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_acc_f32;
}
return nullptr;
case GGML_OP_ADD:
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_add_f32;
@ -4015,6 +4027,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_sqr_f32;
}
return nullptr;
case GGML_OP_SIN:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_sin_f32;
}
return nullptr;
case GGML_OP_COS:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_cos_f32;
}
return nullptr;
case GGML_OP_CLAMP:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_clamp_f32;
@ -4163,6 +4185,8 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_REPEAT:
@ -4373,6 +4397,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
case GGML_OP_MUL:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_REPEAT:
@ -4463,6 +4489,28 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
}, dryrun);
}
static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
// int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
int offset = dst->op_params[3] / 4; // offset in bytes
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ACC, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t) dst->nb[3] / dst_type_size,
d_offset,
0.0f, 0.0f, offset,
}, dryrun);
}
static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
@ -4568,6 +4616,32 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
}, dryrun);
}
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
});
}
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
});
}
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
float * op_params = (float *)dst->op_params;
const uint32_t src0_type_size = ggml_type_size(src0->type);
@ -5621,12 +5695,15 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_REPEAT:
case GGML_OP_GET_ROWS:
case GGML_OP_ADD:
case GGML_OP_ACC:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_CONCAT:
case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_CPY:
@ -5668,6 +5745,10 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_REPEAT:
ggml_vk_repeat(ctx, compute_ctx, src0, node, dryrun);
break;
case GGML_OP_ACC:
ggml_vk_acc(ctx, compute_ctx, src0, src1, node, dryrun);
break;
case GGML_OP_GET_ROWS:
ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node, dryrun);
@ -5700,6 +5781,14 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_SQR:
ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun);
break;
case GGML_OP_SIN:
ggml_vk_sin(ctx, compute_ctx, src0, node);
break;
case GGML_OP_COS:
ggml_vk_cos(ctx, compute_ctx, src0, node);
break;
case GGML_OP_CLAMP:
ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun);
@ -5808,6 +5897,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
switch (tensor->op) {
case GGML_OP_ADD:
case GGML_OP_ACC:
case GGML_OP_GET_ROWS:
case GGML_OP_MUL:
case GGML_OP_DIV:
@ -5815,6 +5905,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_CPY:
@ -6539,12 +6631,15 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_OP_GROUP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_ADD:
case GGML_OP_ACC:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_CONCAT:
case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_CONT:
@ -6987,6 +7082,10 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
} else if (tensor->op == GGML_OP_SQR) {
tensor_clone = ggml_sqr(ggml_ctx, src0_clone);
} else if (tensor->op == GGML_OP_SIN) {
tensor_clone = ggml_sin(ggml_ctx, src0_clone);
} else if (tensor->op == GGML_OP_COS) {
tensor_clone = ggml_cos(ggml_ctx, src0_clone);
} else if (tensor->op == GGML_OP_CLAMP) {
tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
} else if (tensor->op == GGML_OP_PAD) {
@ -6995,6 +7094,8 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
tensor_clone = ggml_repeat(ggml_ctx, src0_clone, src1_clone);
} else if (tensor->op == GGML_OP_ADD) {
tensor_clone = ggml_add(ggml_ctx, src0_clone, src1_clone);
} else if (tensor->op == GGML_OP_ACC) {
tensor_clone = ggml_acc(ggml_ctx, src0_clone, src1_clone, tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]);
} else if (tensor->op == GGML_OP_NORM) {
tensor_clone = ggml_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params);
} else if (tensor->op == GGML_OP_GROUP_NORM) {

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,24 @@
#version 450
#include "types.comp"
#include "generic_binary_head.comp"
void main() {
const uint idx = gl_GlobalInvocationID.x;
if (idx >= p.ne) {
return;
}
const uint offset = p.param3;
const uint src1_i = idx - offset;
const uint oz = src1_i / p.nb02;
const uint oy = (src1_i - (oz * p.nb02)) / p.nb01;
const uint ox = src1_i % p.nb01;
if (ox < p.ne10 && oy < p.ne11 && oz < p.ne12) {
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[ox + oy * p.ne10 + oz * p.ne10 * p.ne11]));
} else {
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]));
}
}

View file

@ -0,0 +1,15 @@
#version 450
#include "types.comp"
#include "generic_unary_head.comp"
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
}

View file

@ -0,0 +1,15 @@
#version 450
#include "types.comp"
#include "generic_unary_head.comp"
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
}

View file

@ -368,6 +368,10 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
}));
@ -392,6 +396,14 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));

View file

@ -130,6 +130,7 @@ class Keys:
INNER_SIZE = "{arch}.ssm.inner_size"
STATE_SIZE = "{arch}.ssm.state_size"
TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
class Tokenizer:
MODEL = "tokenizer.ggml.model"
@ -219,6 +220,8 @@ class MODEL_ARCH(IntEnum):
T5 = auto()
T5ENCODER = auto()
JAIS = auto()
NEMOTRON = auto()
EXAONE = auto()
class MODEL_TENSOR(IntEnum):
@ -347,6 +350,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.T5: "t5",
MODEL_ARCH.T5ENCODER: "t5encoder",
MODEL_ARCH.JAIS: "jais",
MODEL_ARCH.NEMOTRON: "nemotron",
MODEL_ARCH.EXAONE: "exaone",
}
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -1065,6 +1070,37 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.NEMOTRON: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.EXAONE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
# TODO
}
@ -1105,6 +1141,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_ARCH.CHATGLM: [
MODEL_TENSOR.ROPE_FREQS,
],
MODEL_ARCH.NEMOTRON: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
}
#
@ -1333,6 +1373,7 @@ KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS
# tokenization
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL

View file

@ -730,6 +730,9 @@ class GGUFWriter:
def add_ssm_time_step_rank(self, value: int) -> None:
self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)
def add_ssm_dt_b_c_rms(self, value: bool) -> None:
self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)
def add_tokenizer_model(self, model: str) -> None:
self.add_string(Keys.Tokenizer.MODEL, model)

Some files were not shown because too many files have changed in this diff Show more