Merge branch 'master' of https://github.com/JoanFM/llama.cpp into feat-jina-embeddings-v2-zh

Author: Joan Martinez
Date:   2024-07-26 09:18:18 +02:00
Commit: 201559d177

201 changed files with 24709 additions and 157137 deletions

@ -6,7 +6,7 @@ ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1

@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
@ -25,7 +25,7 @@ ENV GGML_CUDA=1
RUN make -j$(nproc) llama-cli
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libgomp1

@ -1,6 +1,6 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
@ -14,10 +14,12 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
echo "Building with static libs" && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --config Release --target llama-cli
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli

@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget libgomp1

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git
@ -11,7 +11,7 @@ COPY . .
RUN make -j$(nproc) llama-cli
FROM ubuntu:$UBUNTU_VERSION as runtime
FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libgomp1

@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_CUDA_DEV_CONTAINER} as build
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
@ -27,7 +27,7 @@ ENV LLAMA_CURL=1
RUN make -j$(nproc) llama-server
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1 curl

@ -1,6 +1,6 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
@ -14,10 +14,11 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
echo "GGML_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
fi && \
echo "Building with dynamic libs" && \
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-server
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev curl

@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
FROM ubuntu:$UBUNTU_VERSION AS build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
FROM ubuntu:$UBUNTU_VERSION AS build
RUN apt-get update && \
apt-get install -y build-essential git libcurl4-openssl-dev curl
@ -13,7 +13,7 @@ ENV LLAMA_CURL=1
RUN make -j$(nproc) llama-server
FROM ubuntu:$UBUNTU_VERSION as runtime
FROM ubuntu:$UBUNTU_VERSION AS runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev libgomp1

@ -10,7 +10,6 @@
"llama-embedding"
"llama-server"
"llama-quantize"
"llama-train-text-from-scratch"
];
mkApp = name: {
type = "app";

@ -18,6 +18,7 @@
vulkan-headers,
vulkan-loader,
curl,
shaderc,
useBlas ? builtins.all (x: !x) [
useCuda
useMetalKit
@ -146,6 +147,7 @@ let
vulkanBuildInputs = [
vulkan-headers
vulkan-loader
shaderc
];
in

@ -8,13 +8,11 @@ arg1="$1"
shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert-hf-to-gguf.py "$@"
python3 ./convert_hf_to_gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./llama-quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./llama-cli "$@"
elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
./llama-finetune "$@"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do
@ -36,8 +34,6 @@ else
echo " ex: --outtype f16 \"/models/7B/\" "
echo " --quantize (-q): Optimize with quantization process ggml"
echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
echo " --finetune (-f): Run finetune command to create a lora finetune of the model"
echo " See documentation for finetune for command-line parameters"
echo " --all-in-one (-a): Execute --convert & --quantize"
echo " ex: \"/models/\" 7B"
echo " --server (-s): Run a model on the server"

.github/labeler.yml

@ -16,7 +16,9 @@ SYCL:
- any-glob-to-any-file:
- ggml/include/ggml-sycl.h
- ggml/src/ggml-sycl.cpp
- README-sycl.md
- ggml/src/ggml-sycl/**
- docs/backend/SYCL.md
- examples/sycl/**
Nvidia GPU:
- changed-files:
- any-glob-to-any-file:

@ -355,8 +355,10 @@ jobs:
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential libvulkan-dev
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update -y
sudo apt-get install -y build-essential vulkan-sdk
- name: Build
id: cmake_build
@ -858,7 +860,7 @@ jobs:
mkdir build
cd build
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
- name: Determine tag name
id: tag

.gitignore

@ -61,6 +61,11 @@ llama-batched-swift
out/
tmp/
# Deprecated
/main
/server
# CI
!.github/workflows/*.yml

@ -50,9 +50,6 @@ endif()
# option list
#
# general
option(LLAMA_CCACHE "llama: use ccache if available" ON)
# debug
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
@ -77,7 +74,6 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
# override ggml options
set(GGML_CCACHE ${LLAMA_CCACHE})
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
@ -110,12 +106,16 @@ llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
#
# build the library
#
if (NOT TARGET ggml)
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
add_subdirectory(src)
#
@ -133,7 +133,16 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
# At the moment some compile definitions are placed within the ggml/src
# directory but not exported on the `ggml` target. This could be improved by
# determining _precisely_ which defines are necessary for the llama-config
# package.
#
get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS)
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
install(TARGETS llama LIBRARY PUBLIC_HEADER)

@ -1,13 +1,17 @@
# Pull requests
# Pull requests (for contributors)
- Always squash-merge the PR before merging
- Use the following format for your final commit: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Test your changes:
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
- Execute [the full CI locally on your machine](ci/README.md) before publishing
- If the pull request contains only documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
# Pull requests (for collaborators)
- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
# Coding guidelines

Makefile

@ -11,7 +11,6 @@ BUILD_TARGETS = \
llama-embedding \
llama-eval-callback \
llama-export-lora \
llama-finetune \
llama-gbnf-validator \
llama-gguf \
llama-gguf-hash \
@ -37,7 +36,6 @@ BUILD_TARGETS = \
llama-simple \
llama-speculative \
llama-tokenize \
llama-train-text-from-scratch \
llama-vdot \
llama-cvector-generator \
tests/test-c.o
@ -64,9 +62,13 @@ TEST_TARGETS = \
tests/test-tokenizer-1-spm
# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
# Deprecation aliases
ifdef LLAMA_CUBLAS
@ -193,7 +195,11 @@ ifdef GGML_RPC
BUILD_TARGETS += rpc-server
endif
default: $(BUILD_TARGETS)
ifdef GGML_VULKAN
BUILD_TARGETS += vulkan-shaders-gen
endif
default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)
test: $(TEST_TARGETS)
@failures=0; \
@ -228,7 +234,7 @@ test: $(TEST_TARGETS)
fi
@echo 'All tests passed.'
all: $(BUILD_TARGETS) $(TEST_TARGETS)
all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)
ifdef RISCV_CROSS_COMPILE
CC := riscv64-unknown-linux-gnu-gcc
@ -245,17 +251,22 @@ MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_NVCCFLAGS = -std=c++11
ifndef LLAMA_NO_CCACHE
ifdef LLAMA_NO_CCACHE
GGML_NO_CCACHE := 1
DEPRECATE_WARNING := 1
endif
ifndef GGML_NO_CCACHE
CCACHE := $(shell which ccache)
ifdef CCACHE
export CCACHE_SLOPPINESS = time_macros
$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
CC := $(CCACHE) $(CC)
CXX := $(CCACHE) $(CXX)
else
$(info I ccache not found. Consider installing it for faster compilation.)
endif # CCACHE
endif # LLAMA_NO_CCACHE
endif # GGML_NO_CCACHE
# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
@ -538,14 +549,20 @@ ifdef GGML_OPENBLAS64
endif # GGML_OPENBLAS64
ifdef GGML_BLIS
MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
MK_LDFLAGS += -lblis -L/usr/local/lib
OBJ_GGML += ggml/src/ggml-blas.o
endif # GGML_BLIS
ifdef GGML_NVPL
MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
OBJ_GGML += ggml/src/ggml-blas.o
endif # GGML_NVPL
ifndef GGML_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJ_GGML += ggml/src/sgemm.o
OBJ_GGML += ggml/src/llamafile/sgemm.o
endif
ifdef GGML_RPC
@ -695,8 +712,8 @@ endif # GGML_CUDA
ifdef GGML_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += -lvulkan
OBJ_GGML += ggml/src/ggml-vulkan.o
MK_LDFLAGS += $(shell pkg-config --libs vulkan)
OBJ_GGML += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
ifdef GGML_VULKAN_CHECK_RESULTS
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
@ -718,10 +735,28 @@ ifdef GGML_VULKAN_RUN_TESTS
MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS
endif
ggml/src/ggml-vulkan.o: \
ggml/src/ggml-vulkan.cpp \
ggml/include/ggml-vulkan.h
$(CXX) $(CXXFLAGS) -c $< -o $@
GLSLC_CMD = glslc
_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
_ggml_vk_input_dir = ggml/src/vulkan-shaders
_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
$(_ggml_vk_header): $(_ggml_vk_source)
$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
$(_ggml_vk_genshaders_cmd) \
--glslc $(GLSLC_CMD) \
--input-dir $(_ggml_vk_input_dir) \
--target-hpp $(_ggml_vk_header) \
--target-cpp $(_ggml_vk_source)
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
endif # GGML_VULKAN
ifdef GGML_HIPBLAS
@ -758,6 +793,14 @@ ifdef GGML_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # GGML_CUDA_FORCE_DMMV
ifdef GGML_CUDA_FORCE_MMQ
HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
endif # GGML_CUDA_FORCE_MMQ
ifdef GGML_CUDA_FORCE_CUBLAS
HIPFLAGS += -DGGML_CUDA_FORCE_CUBLAS
endif # GGML_CUDA_FORCE_CUBLAS
ifdef GGML_CUDA_NO_PEER_COPY
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # GGML_CUDA_NO_PEER_COPY
@ -826,10 +869,14 @@ OBJ_GGML += \
ggml/src/ggml.o \
ggml/src/ggml-alloc.o \
ggml/src/ggml-backend.o \
ggml/src/ggml-quants.o
ggml/src/ggml-quants.o \
ggml/src/ggml-aarch64.o
OBJ_LLAMA = \
src/llama.o \
src/llama-vocab.o \
src/llama-grammar.o \
src/llama-sampling.o \
src/unicode.o \
src/unicode-data.o
@ -926,6 +973,7 @@ $(info - LLAMA_NO_LLAMAFILE)
$(info - LLAMA_NO_ACCELERATE)
$(info - LLAMA_NO_OPENMP)
$(info - LLAMA_NO_METAL)
$(info - LLAMA_NO_CCACHE)
$(info )
endif
@ -959,15 +1007,22 @@ ggml/src/ggml-quants.o: \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-aarch64.o: \
ggml/src/ggml-aarch64.c \
ggml/include/ggml.h \
ggml/src/ggml-aarch64.h \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-blas.o: \
ggml/src/ggml-blas.cpp \
ggml/include/ggml-blas.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ifndef GGML_NO_LLAMAFILE
ggml/src/sgemm.o: \
ggml/src/sgemm.cpp \
ggml/src/sgemm.h \
ggml/src/llamafile/sgemm.o: \
ggml/src/llamafile/sgemm.cpp \
ggml/src/llamafile/sgemm.h \
ggml/include/ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_NO_LLAMAFILE
@ -1001,6 +1056,10 @@ src/unicode-data.o: \
src/llama.o: \
src/llama.cpp \
src/llama-impl.h \
src/llama-vocab.h \
src/llama-grammar.h \
src/llama-sampling.h \
src/unicode.h \
include/llama.h \
ggml/include/ggml-cuda.h \
@ -1010,6 +1069,29 @@ src/llama.o: \
ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-vocab.o: \
src/llama-vocab.cpp \
src/llama-vocab.h \
src/llama-impl.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-grammar.o: \
src/llama-grammar.cpp \
src/llama-grammar.h \
src/llama-impl.h \
src/llama-vocab.h \
src/llama-sampling.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
src/llama-sampling.o: \
src/llama-sampling.cpp \
src/llama-sampling.h \
src/llama-impl.h \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
$(LIB_LLAMA): \
$(OBJ_LLAMA) \
$(LIB_GGML)
@ -1092,7 +1174,8 @@ clean:
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
rm -rvf $(BUILD_TARGETS)
rm -rvf $(TEST_TARGETS)
rm -rvf $(LEGACY_TARGETS)
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
rm -rvf $(LEGACY_TARGETS_CLEAN)
find examples pocs -type f -name "*.o" -delete
#
@ -1211,11 +1294,6 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
$(OBJ_GGML) $(OBJ_LLAMA)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@ -1231,13 +1309,8 @@ llama-baby-llama: examples/baby-llama/baby-llama.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-finetune: examples/finetune/finetune.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
llama-export-lora: examples/export-lora/export-lora.cpp \
$(OBJ_GGML) common/log.h
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1384,7 +1457,7 @@ run-benchmark-matmult: llama-benchmark-matmult
.PHONY: run-benchmark-matmult swift
tests/test-llama-grammar: tests/test-llama-grammar.cpp \
$(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -1488,3 +1561,51 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
$(OBJ_GGML)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
#
# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
#
# Mark legacy binary targets as .PHONY so that they are always checked.
.PHONY: main quantize perplexity embedding server
# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
# Eventually we will want to remove these target from building all the time.
main: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."
server: examples/deprecation-warning/deprecation-warning.cpp
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."
quantize: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard quantize))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
@echo " Remove the 'quantize' binary to remove this warning."
@echo "#########"
endif
perplexity: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard perplexity))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
@echo " Remove the 'perplexity' binary to remove this warning."
@echo "#########"
endif
embedding: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard embedding))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
@echo " Remove the 'embedding' binary to remove this warning."
@echo "#########"
endif
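
Note: each of the legacy names above (`main`, `server`, `quantize`, `perplexity`, `embedding`) is now built from the shared `examples/deprecation-warning/deprecation-warning.cpp`. That source file is not part of this diff; purely as an illustration, a minimal shim along these lines would satisfy the Makefile rules above (the real file may differ):

```cpp
// Hypothetical sketch only -- the actual examples/deprecation-warning/deprecation-warning.cpp
// is not shown in this diff. It stands in for a renamed binary and tells the user where to go.
#include <cstdio>
#include <string>

int main(int /*argc*/, char ** argv) {
    std::string name = argv[0] ? argv[0] : "main";
    // keep only the binary name, not the invocation path
    const size_t pos = name.find_last_of("/\\");
    if (pos != std::string::npos) {
        name = name.substr(pos + 1);
    }
    fprintf(stderr,
            "WARNING: the '%s' binary is deprecated. Binaries were renamed in #7809;\n"
            "         please use the corresponding 'llama-*' tool (e.g. 'llama-cli', 'llama-server').\n",
            name.c_str());
    return 1;
}
```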

@ -4,12 +4,16 @@ import PackageDescription
var sources = [
"src/llama.cpp",
"src/llama-vocab.cpp",
"src/llama-grammar.cpp",
"src/llama-sampling.cpp",
"src/unicode.cpp",
"src/unicode-data.cpp",
"ggml/src/ggml.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.c",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-aarch64.c",
]
var resources: [Resource] = []

@ -3,7 +3,7 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@ -98,7 +98,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
**Multimodal models:**
@ -138,6 +138,7 @@ Typically finetunes of the base models below are supported as well.
Unless otherwise noted these projects are open-source with permissive licensing:
- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
- [iohub/collama](https://github.com/iohub/coLLaMA)
- [janhq/jan](https://github.com/janhq/jan) (AGPL)
- [nat/openplayground](https://github.com/nat/openplayground)
@ -181,6 +182,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
**Games:**
- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
## Demo
<details>
@ -453,7 +457,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
- [How to build](./docs/build.md)
- [Running on Docker](./docs/docker.md)
- [Build on Android](./docs/android.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
**Seminal papers and background on the models**

@ -8,6 +8,13 @@ set(GGML_CUDA @GGML_CUDA@)
set(GGML_METAL @GGML_METAL@)
set(GGML_HIPBLAS @GGML_HIPBLAS@)
set(GGML_ACCELERATE @GGML_ACCELERATE@)
set(GGML_VULKAN @GGML_VULKAN@)
set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
set(GGML_SYCL @GGML_SYCL@)
set(GGML_OPENMP @GGML_OPENMP@)
@PACKAGE_INIT@
@ -37,18 +44,36 @@ if (GGML_METAL)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()
if (GGML_VULKAN)
find_package(Vulkan REQUIRED)
endif()
if (GGML_HIPBLAS)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
endif()
if (GGML_SYCL)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
endif()
if (GGML_OPENMP)
find_package(OpenMP REQUIRED)
endif()
find_library(ggml_LIBRARY ggml
REQUIRED
HINTS ${LLAMA_LIB_DIR})
find_library(llama_LIBRARY llama
REQUIRED
HINTS ${LLAMA_LIB_DIR})
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
add_library(llama UNKNOWN IMPORTED)

@ -1,3 +1,7 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "common.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
@ -681,7 +685,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
if (arg == "--lora") {
CHECK_ARG
params.lora_adapter.emplace_back(argv[i], 1.0f);
params.use_mmap = false;
return true;
}
if (arg == "--lora-scaled") {
@ -689,12 +692,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
const char* lora_adapter = argv[i];
CHECK_ARG
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
params.use_mmap = false;
return true;
}
if (arg == "--lora-base") {
CHECK_ARG
params.lora_base = argv[i];
return true;
}
if (arg == "--control-vector") {
@ -793,6 +790,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cont_batching = true;
return true;
}
if (arg == "-nocb" || arg == "--no-cont-batching") {
params.cont_batching = false;
return true;
}
if (arg == "-fa" || arg == "--flash-attn") {
params.flash_attn = true;
return true;
@ -1268,6 +1269,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
CHECK_ARG
params.out_file = argv[i];
params.cvector_outfile = argv[i];
params.lora_outfile = argv[i];
return true;
}
if (arg == "-ofreq" || arg == "--output-frequency") {
@ -1534,6 +1536,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" });
options.push_back({ "multi-modality" });
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
@ -1576,9 +1579,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
"advanced option to override model metadata by key. may be specified multiple times.\n"
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
"note: this argument can be repeated to add multiple control vectors" });
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
@ -1669,6 +1671,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });
options.push_back({ "export-lora" });
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
printf("usage: %s [options]\n", argv[0]);
for (const auto & o : options) {
@ -2080,19 +2089,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
int err = llama_model_apply_lora_from_file(model,
lora_adapter.c_str(),
lora_scale,
((i > 0) || params.lora_base.empty())
? NULL
: params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
if (adapter == nullptr) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
llama_free(lctx);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}
llama_lora_adapter_set(lctx, adapter, lora_scale);
}
if (params.ignore_eos) {
@ -2719,7 +2723,7 @@ std::string llama_chat_format_single(const struct llama_model * model,
const llama_chat_msg & new_msg,
bool add_ass) {
std::ostringstream ss;
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
std::vector<llama_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@ -3164,7 +3168,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
}
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
}
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);

@ -128,7 +128,6 @@ struct gpt_params {
// TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
@ -255,6 +254,8 @@ struct gpt_params {
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
bool spm_infill = false; // suffix/prefix/middle pattern for infill
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
};
void gpt_params_handle_hf_token(gpt_params & params);

@ -37,11 +37,18 @@ struct llama_ngram {
}
};
struct llama_token_hash_function {
size_t operator()(const llama_token token) const {
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
return token * 11400714819323198485llu;
}
};
struct llama_ngram_hash_function {
size_t operator()(const llama_ngram & ngram) const {
size_t hash = 0;
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= llama_token_hash_function{}(ngram.tokens[i]);
}
return hash;
}
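
The new llama_token_hash_function above switches the per-token hash from std::hash<llama_token> (often the identity for integers) to Fibonacci hashing, i.e. multiplication by floor(2^64 / golden ratio) = 11400714819323198485. A tiny standalone sketch of the effect (not part of the change itself):

```cpp
#include <cstdint>
#include <cstdio>

// Same multiplier as llama_token_hash_function above: floor(2^64 / phi).
static uint64_t token_hash(uint64_t token) {
    return token * 11400714819323198485llu;
}

int main() {
    // consecutive token ids map to values spread across the 64-bit range,
    // so XOR-combining them per n-gram (as in llama_ngram_hash_function)
    // collides far less than identity hashing would
    for (uint64_t t = 0; t < 4; ++t) {
        printf("token %llu -> %016llx\n",
               (unsigned long long) t, (unsigned long long) token_hash(t));
    }
    return 0;
}
```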

@ -330,7 +330,7 @@ static llama_token llama_sampling_sample_impl(
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
// Apply grammar constraints to the single token
llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
@ -378,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
if (ctx_sampling->grammar != NULL && !apply_grammar) {
GGML_ASSERT(original_logits != NULL);
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
*original_logits = {logits, logits + n_vocab};
}
// apply params.logit_bias map
@ -391,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}
cur.clear();
cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
@ -421,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
// apply grammar checks before sampling logic
if (apply_grammar && ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
}
return cur_p;
@ -455,6 +455,6 @@ void llama_sampling_accept(
ctx_sampling->prev.push_back(id);
if (ctx_sampling->grammar != NULL && apply_grammar) {
llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
}
}
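
Two independent things change in the hunks above: the grammar helpers become llama_grammar_sample() / llama_grammar_accept_token() with the grammar as the first argument, and the candidate list is filled by sizing the vector once and assigning by index instead of clear() + emplace_back(). A small sketch of the latter pattern in isolation (names are illustrative, not from the codebase):

```cpp
#include <vector>

struct token_data { int id; float logit; float p; };

// Illustrative only: fill the candidate array with indexed writes into a
// pre-sized vector, as the diff above now does with `cur`.
static void fill_candidates(std::vector<token_data> & cur, const float * logits, int n_vocab) {
    cur.resize(n_vocab);                       // size once instead of growing element by element
    for (int id = 0; id < n_vocab; id++) {
        cur[id] = token_data{ id, logits[id], 0.0f };
    }
}
```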

File diff suppressed because it is too large.

@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
if len(sys.argv) == 2:
token = sys.argv[1]
@ -91,6 +91,9 @@ models = [
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
{"name": "jina-v2-zh", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
]
@ -100,8 +103,8 @@ def download_file_with_auth(url, token, save_path):
response = sess.get(url, headers=headers)
response.raise_for_status()
os.makedirs(os.path.dirname(save_path), exist_ok=True)
with open(save_path, 'wb') as f:
f.write(response.content)
with open(save_path, 'wb') as downloaded_file:
downloaded_file.write(response.content)
logger.info(f"File {save_path} downloaded successfully")
@ -160,7 +163,7 @@ for model in models:
logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(chktxt)
chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}")
@ -192,7 +195,7 @@ src_func = f"""
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer
chktxt = {repr(chktxt)}
chktxt = {repr(CHK_TXT)}
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
@ -288,7 +291,7 @@ tests = [
"333333333",
"Cửa Việt", # llama-bpe fails on this
" discards",
chktxt,
CHK_TXT,
]
# write the tests to ./models/ggml-vocab-{name}.gguf.inp

@ -132,6 +132,10 @@ class Tensor:
class GGMLModel:
file_format: GGMLFormat
format_version: int
def __init__(self):
self.hyperparameters = None
self.vocab = None
@ -290,7 +294,7 @@ class GGMLToGGUF:
if self.vocab_override is not None:
vo = self.vocab_override
logger.info('* Adding vocab item(s)')
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes)
scores.append(score)
toktypes.append(ttype)

convert_lora_to_gguf.py (new executable file)

@ -0,0 +1,393 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations
from dataclasses import dataclass
import logging
import argparse
import os
import sys
import json
from math import prod
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
import torch
if TYPE_CHECKING:
from torch import Tensor
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, Model
logger = logging.getLogger("lora-to-gguf")
@dataclass
class PartialLoraTensor:
A: Tensor | None = None
B: Tensor | None = None
# magic to support tensor shape modifications and splitting
class LoraTorchTensor:
_lora_A: Tensor # (n_rank, row_size)
_lora_B: Tensor # (col_size, n_rank)
_rank: int
def __init__(self, A: Tensor, B: Tensor):
assert len(A.shape) == len(B.shape)
assert A.shape[-2] == B.shape[-1]
if A.dtype != B.dtype:
A = A.to(torch.float32)
B = B.to(torch.float32)
self._lora_A = A
self._lora_B = B
self._rank = B.shape[-1]
def get_lora_A_B(self) -> tuple[Tensor, Tensor]:
return (self._lora_A, self._lora_B)
def __getitem__(
self,
indices: (
SupportsIndex
| slice
| tuple[SupportsIndex | slice | Tensor, ...] # TODO: add ellipsis in the type signature
),
) -> LoraTorchTensor:
shape = self.shape
if isinstance(indices, SupportsIndex):
if len(shape) > 2:
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
else:
raise NotImplementedError # can't return a vector
elif isinstance(indices, slice):
if len(shape) > 2:
return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices])
else:
return LoraTorchTensor(self._lora_A, self._lora_B[indices])
elif isinstance(indices, tuple):
assert len(indices) > 0
if indices[-1] is Ellipsis:
return self[indices[:-1]]
# expand ellipsis
indices = tuple(
u
for v in (
(
(slice(None, None) for _ in range(len(indices) - 1))
if i is Ellipsis
else (i,)
)
for i in indices
)
for u in v
)
if len(indices) < len(shape):
indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape))))
# TODO: make sure this is correct
indices_A = (
*(
(
j.__index__() % self._lora_A.shape[i]
if isinstance(j, SupportsIndex)
else slice(None, None)
)
for i, j in enumerate(indices[:-2])
),
slice(None, None),
indices[-1],
)
indices_B = indices[:-1]
return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B])
else:
raise NotImplementedError # unknown indice type
@property
def dtype(self) -> torch.dtype:
assert self._lora_A.dtype == self._lora_B.dtype
return self._lora_A.dtype
@property
def shape(self) -> tuple[int, ...]:
assert len(self._lora_A.shape) == len(self._lora_B.shape)
return (*self._lora_B.shape[:-1], self._lora_A.shape[-1])
def size(self, dim=None):
assert dim is None
return self.shape
def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor:
if isinstance(shape[0], tuple):
new_shape: tuple[int, ...] = shape[0]
else:
new_shape = cast(tuple[int, ...], shape)
orig_shape = self.shape
if len(new_shape) < 2:
raise NotImplementedError # can't become a vector
# expand -1 in the shape
if any(dim == -1 for dim in new_shape):
n_elems = prod(orig_shape)
n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape)
assert n_elems % n_new_elems == 0
new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),)
if new_shape[-1] != orig_shape[-1]:
raise NotImplementedError # can't reshape the row size trivially
shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1])
shape_B = (*new_shape[:-1], self._rank)
return LoraTorchTensor(
self._lora_A.reshape(shape_A),
self._lora_B.reshape(shape_B),
)
def reshape_as(self, other: Tensor) -> LoraTorchTensor:
return self.reshape(*other.shape)
def view(self, *size: int) -> LoraTorchTensor:
return self.reshape(*size)
def permute(self, *dims: int) -> LoraTorchTensor:
shape = self.shape
dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims)
if dims[-1] == -1:
# TODO: support higher dimensional A shapes bigger than 1
assert all(dim == 1 for dim in self._lora_A.shape[:-2])
return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims))
if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1:
return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims))
else:
# TODO: compose the above two
raise NotImplementedError
def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor:
shape = self.shape
dims = [i for i in range(len(shape))]
dims[dim0], dims[dim1] = dims[dim1], dims[dim0]
return self.permute(*dims)
def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor:
return self.transpose(axis0, axis1)
def to(self, *args, **kwargs):
return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs))
@classmethod
def __torch_function__(cls, func: Callable, types, args=(), kwargs=None):
del types # unused
if kwargs is None:
kwargs = {}
if func is torch.permute:
return type(args[0]).permute(*args, **kwargs)
elif func is torch.reshape:
return type(args[0]).reshape(*args, **kwargs)
elif func is torch.stack:
assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0)
assert dim == 0
return LoraTorchTensor(
torch.stack([a._lora_A for a in args[0]], dim),
torch.stack([b._lora_B for b in args[0]], dim),
)
elif func is torch.cat:
assert isinstance(args[0], Sequence)
dim = kwargs.get("dim", 0)
assert dim == 0
if len(args[0][0].shape) > 2:
return LoraTorchTensor(
torch.cat([a._lora_A for a in args[0]], dim),
torch.cat([b._lora_B for b in args[0]], dim),
)
elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]):
return LoraTorchTensor(
args[0][0]._lora_A,
torch.cat([b._lora_B for b in args[0]], dim),
)
else:
raise NotImplementedError
else:
raise NotImplementedError
def get_base_tensor_name(lora_tensor_name: str) -> str:
base_name = lora_tensor_name.replace("base_model.model.", "")
base_name = base_name.replace(".lora_A.weight", ".weight")
base_name = base_name.replace(".lora_B.weight", ".weight")
return base_name
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file")
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
)
parser.add_argument(
"--bigendian", action="store_true",
help="model is executed on big endian machine",
)
parser.add_argument(
"--no-lazy", action="store_true",
help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)",
)
parser.add_argument(
"--verbose", action="store_true",
help="increase output verbosity",
)
parser.add_argument(
"--dry-run", action="store_true",
help="only print out what will be done, without writing any new files",
)
parser.add_argument(
"--base", type=Path, required=True,
help="directory containing base model file",
)
parser.add_argument(
"lora_path", type=Path,
help="directory containing LoRA adapter file",
)
return parser.parse_args()
if __name__ == '__main__':
args = parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
ftype_map: dict[str, gguf.LlamaFileType] = {
"f32": gguf.LlamaFileType.ALL_F32,
"f16": gguf.LlamaFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
"auto": gguf.LlamaFileType.GUESSED,
}
ftype = ftype_map[args.outtype]
dir_base_model: Path = args.base
dir_lora: Path = args.lora_path
lora_config = dir_lora / "adapter_config.json"
input_model = dir_lora / "adapter_model.safetensors"
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_lora
if os.path.exists(input_model):
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file
lora_model = load_file(input_model, device="cpu")
else:
input_model = os.path.join(dir_lora, "adapter_model.bin")
lora_model = torch.load(input_model, map_location="cpu", weights_only=True)
# load base model
logger.info(f"Loading base model: {dir_base_model.name}")
hparams = Model.load_hparams(dir_base_model)
with torch.inference_mode():
try:
model_class = Model.from_model_architecture(hparams["architectures"][0])
except NotImplementedError:
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)
class LoraModel(model_class):
model_arch = model_class.model_arch
lora_alpha: float
def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
super().__init__(*args, **kwargs)
self.dir_model_card = dir_lora_model
self.lora_alpha = float(lora_alpha)
def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
def set_gguf_parameters(self):
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
super().set_gguf_parameters()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
tensor_map: dict[str, PartialLoraTensor] = {}
for name, tensor in lora_model.items():
if self.lazy:
tensor = LazyTorchTensor.from_eager(tensor)
base_name = get_base_tensor_name(name)
is_lora_a = ".lora_A.weight" in name
is_lora_b = ".lora_B.weight" in name
if not is_lora_a and not is_lora_b:
if ".base_layer.weight" in name:
continue
logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
sys.exit(1)
if base_name in tensor_map:
if is_lora_a:
tensor_map[base_name].A = tensor
else:
tensor_map[base_name].B = tensor
else:
if is_lora_a:
tensor_map[base_name] = PartialLoraTensor(A=tensor)
else:
tensor_map[base_name] = PartialLoraTensor(B=tensor)
for name, tensor in tensor_map.items():
assert tensor.A is not None
assert tensor.B is not None
yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
dest = super().modify_tensors(data_torch, name, bid)
for dest_name, dest_data in dest:
assert isinstance(dest_data, LoraTorchTensor)
lora_a, lora_b = dest_data.get_lora_A_B()
yield (dest_name + ".lora_a", lora_a)
yield (dest_name + ".lora_b", lora_b)
with open(lora_config, "r") as f:
lparams: dict[str, Any] = json.load(f)
alpha: float = lparams["lora_alpha"]
model_instance = LoraModel(
dir_base_model,
ftype,
fname_out,
is_big_endian=args.bigendian,
use_temp_file=False,
eager=args.no_lazy,
dry_run=args.dry_run,
dir_lora_model=dir_lora,
lora_alpha=alpha,
)
logger.info("Exporting model...")
model_instance.write()
logger.info(f"Model successfully exported to {model_instance.fname_out}")

@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
```sh
./build/bin/llama-ls-sycl-device
```
A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
```
found 6 SYCL devices:
found 2 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
| Attribute | Note |
|------------------------|-------------------------------------------------------------|
| compute capability 1.3 | Level-zero driver/runtime, recommended |
| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
4. Launch inference
There are two device selection modes:
- Single device: Use one device target specified by the user.
- Multiple devices: Automatically select the devices with the same largest Max compute-units.
- Multiple devices: Automatically choose the devices with the same backend.
In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
| Device selection | Parameter |
|------------------|----------------------------------------|
@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
build\bin\ls-sycl-device.exe
```
The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
```
found 6 SYCL devices:
found 2 SYCL devices:
| | | |Compute |Max compute|Max work|Max sub| |
|ID| Device Type| Name|capability|units |group |group |Global mem size|
|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
```
| Attribute | Note |
|------------------------|-----------------------------------------------------------|
| compute capability 1.3 | Level-zero running time, recommended |
| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
4. Launch inference
There are two device selection modes:
- Single device: Use one device assigned by user.
- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
- Single device: Use one device assigned by the user. The default device id is 0.
- Multiple devices: Automatically choose the devices with the same backend.
In both device selection modes, the default SYCL backend is level_zero; you can choose another backend supported by SYCL by setting the environment variable ONEAPI_DEVICE_SELECTOR (see the example after the table below).
| Device selection | Parameter |
|------------------|----------------------------------------|
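A comparable sketch on Windows (paths and selector are placeholders) might be:
```
rem Hypothetical example: restrict SYCL to the OpenCL GPU backend instead of the default level_zero
set ONEAPI_DEVICE_SELECTOR=opencl:gpu
build\bin\llama-cli.exe -m models\model.gguf -p "Building a website can be done in 10 simple steps:" -n 64 -ngl 33
```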
View file
@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options.
make
```
- On Windows:
- On Windows (x86/x64 only, arm64 requires cmake):
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
2. Extract `w64devkit` on your pc.
@ -28,6 +28,7 @@ In order to build llama.cpp you have four different options.
```
- Notes:
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1`
@ -41,6 +42,7 @@ In order to build llama.cpp you have four different options.
**Notes**:
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, there are two cases:
@ -58,6 +60,17 @@ In order to build llama.cpp you have four different options.
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
- Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools such as CMake):
- Tab Workload: Desktop development with C++
- Tab Components (select quickly via search): C++ _CMake_ Tools for Windows, _Git_ for Windows, C++ _Clang_ Compiler for Windows, MS-Build Support for LLVM Toolset (clang)
- Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, and test commands.
- For Windows on ARM (arm64, WoA) build with:
```bash
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```
Note: Building for arm64 can also be done with just MSVC (using the build-arm64-windows-MSVC preset or the standard CMake build instructions). However, MSVC does not support inline ARM assembly code, which is used e.g. for the accelerated Q4_0_4_8 CPU kernels.
- Using `gmake` (FreeBSD):
@ -240,6 +253,45 @@ The following compilation options are also available to tweak performance (yes,
### Vulkan
**Windows**
#### w64devkit
Download and extract [w64devkit](https://github.com/skeeto/w64devkit/releases).
Download and install the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home#windows). When selecting components, only the Vulkan SDK Core is required.
Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:
```sh
SDK_VERSION=1.3.283.0
cp /VulkanSDK/$SDK_VERSION/Bin/glslc.exe $W64DEVKIT_HOME/bin/
cp /VulkanSDK/$SDK_VERSION/Lib/vulkan-1.lib $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/
cp -r /VulkanSDK/$SDK_VERSION/Include/* $W64DEVKIT_HOME/x86_64-w64-mingw32/include/
cat > $W64DEVKIT_HOME/x86_64-w64-mingw32/lib/pkgconfig/vulkan.pc <<EOF
Name: Vulkan-Loader
Description: Vulkan Loader
Version: $SDK_VERSION
Libs: -lvulkan-1
EOF
```
Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
#### MSYS2
Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.
```sh
pacman -S git \
mingw-w64-ucrt-x86_64-gcc \
mingw-w64-ucrt-x86_64-cmake \
mingw-w64-ucrt-x86_64-vulkan-devel \
mingw-w64-ucrt-x86_64-shaderc
```
Switch into `llama.cpp` directory and build using CMake.
```sh
cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
```
**With docker**:
You don't need to install the Vulkan SDK; it will be installed inside the container.
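A hedged sketch of how this could look (the Dockerfile name and the device flag are assumptions and may differ in your checkout):
```sh
# Hypothetical: build a Vulkan-enabled image and run inference against a local model directory
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
docker run -it --rm -v "$(pwd)/models:/models" --device /dev/dri llama-cpp-vulkan -m /models/model.gguf -p "Hello" -ngl 33
```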
View file
@ -9,15 +9,15 @@ Adding a model requires few steps:
After following these steps, you can open a PR.
Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
- [main](../examples/main)
- [imatrix](../examples/imatrix)
- [quantize](../examples/quantize)
- [server](../examples/server)
- [main](/examples/main/)
- [imatrix](/examples/imatrix/)
- [quantize](/examples/quantize/)
- [server](/examples/server/)
### 1. Convert the model to GGUF
This step is done in Python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](../examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format).
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
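For a typical Hugging Face checkpoint, a minimal sketch of the invocation (directory and output names are placeholders; check `--help` for the exact options of your version) could be:
```bash
# Hypothetical example: convert a local HF model directory to an F16 GGUF file
python convert_hf_to_gguf.py ./models/MyModel-7B --outfile ./models/MyModel-7B-F16.gguf --outtype f16
```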
@ -31,7 +31,7 @@ class MyModel(Model):
model_arch = gguf.MODEL_ARCH.GROK
```
2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py)
Add an enum entry in `MODEL_ARCH`, the model's human-friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS` (a sketch is shown below).
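A rough, self-contained sketch of what those three additions look like (the names here are illustrative, not the actual entries in `constants.py`):
```python
# Hypothetical sketch of the additions made in gguf-py/gguf/constants.py
from enum import IntEnum, auto

class MODEL_ARCH(IntEnum):
    MYMODEL = auto()        # new architecture enum entry

class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD  = auto()
    OUTPUT_NORM = auto()
    ATTN_Q      = auto()
    ATTN_K      = auto()
    ATTN_V      = auto()

# human friendly name written into the GGUF metadata
MODEL_ARCH_NAMES = {MODEL_ARCH.MYMODEL: "mymodel"}

# GGUF tensors used by the new architecture
MODEL_TENSORS = {
    MODEL_ARCH.MYMODEL: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
    ],
}
```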
@ -54,7 +54,7 @@ Example for `falcon` model:
As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](/gguf-py/gguf/tensor_mapping.py) file.
If the tensor name is part of a repetitive layer/block, the keyword `bid` substitutes it (see the snippet below).
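A minimal runnable sketch of how such a block-level mapping behaves (plain strings are used instead of the real `MODEL_TENSOR` enums, and the source names are illustrative):
```python
# Hypothetical sketch of a block-level entry similar to those in gguf-py/gguf/tensor_mapping.py
block_mappings = {
    "ATTN_Q": (
        "model.layers.{bid}.self_attn.q_proj",  # e.g. Hugging Face style source name
        "transformer.h.{bid}.attn.q_proj",      # hypothetical alternative source name
    ),
}

# {bid} is substituted with the concrete layer/block index during conversion
for bid in range(2):
    for pattern in block_mappings["ATTN_Q"]:
        print(pattern.format(bid=bid))
```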
@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil
When implementing a new graph, please note that the underlying `ggml` backends might not support them all; support for missing backend operations can be added in another PR.
Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback).
Note: to debug the inference graph, you can use [llama-eval-callback](/examples/eval-callback/) (see the example below).
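A minimal sketch of such a debugging run (model path and prompt are placeholders) might be:
```bash
# Hypothetical example: print per-operation tensor contents while evaluating a short prompt
./llama-eval-callback -m ./models/mymodel-f16.gguf -p "hello world" -n 8
```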
## GGUF specification
View file
@ -1,7 +1,7 @@
# Token generation performance troubleshooting
## Verifying that the model is running on the GPU with CUDA
Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
```shell
./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
```
View file
@ -21,7 +21,6 @@ else()
add_subdirectory(embedding)
add_subdirectory(eval-callback)
add_subdirectory(export-lora)
add_subdirectory(finetune)
add_subdirectory(gbnf-validator)
add_subdirectory(gguf-hash)
add_subdirectory(gguf-split)
@ -53,5 +52,4 @@ else()
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(tokenize)
add_subdirectory(train-text-from-scratch)
endif()
View file
@ -31,7 +31,7 @@ int main(int argc, char ** argv) {
int n_parallel = params.n_parallel;
// total length of the sequences including the prompt
int n_predict = 32;
int n_predict = params.n_predict;
// init LLM
View file
@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar
import numpy as np
@ -346,42 +346,6 @@ class Params:
return params
@dataclass
class Metadata:
name: Optional[str] = None
author: Optional[str] = None
version: Optional[str] = None
url: Optional[str] = None
description: Optional[str] = None
license: Optional[str] = None
source_url: Optional[str] = None
source_hf_repo: Optional[str] = None
@staticmethod
def load(metadata_path: Path) -> Metadata:
if metadata_path is None or not metadata_path.exists():
return Metadata()
with open(metadata_path, 'r') as file:
data = json.load(file)
# Create a new Metadata instance
metadata = Metadata()
# Assigning values to Metadata attributes if they exist in the JSON file
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata.name = data.get("general.name")
metadata.author = data.get("general.author")
metadata.version = data.get("general.version")
metadata.url = data.get("general.url")
metadata.description = data.get("general.description")
metadata.license = data.get("general.license")
metadata.source_url = data.get("general.source.url")
metadata.source_hf_repo = data.get("general.source.huggingface.repository")
return metadata
#
# data loading
# TODO: reuse (probably move to gguf.py?)
@ -806,7 +770,7 @@ class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_model(self, params: Params, metadata: Metadata | None) -> None:
def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None:
# Metadata About The Model And Its Provenence
name = "LLaMA"
if metadata is not None and metadata.name is not None:
@ -824,16 +788,73 @@ class OutputFile:
self.gguf.add_author(metadata.author)
if metadata.version is not None:
self.gguf.add_version(metadata.version)
if metadata.url is not None:
self.gguf.add_url(metadata.url)
if metadata.organization is not None:
self.gguf.add_organization(metadata.organization)
if metadata.finetune is not None:
self.gguf.add_finetune(metadata.finetune)
if metadata.basename is not None:
self.gguf.add_basename(metadata.basename)
if metadata.description is not None:
self.gguf.add_description(metadata.description)
if metadata.quantized_by is not None:
self.gguf.add_quantized_by(metadata.quantized_by)
if metadata.size_label is not None:
self.gguf.add_size_label(metadata.size_label)
if metadata.license is not None:
self.gguf.add_licence(metadata.license)
self.gguf.add_license(metadata.license)
if metadata.license_name is not None:
self.gguf.add_license_name(metadata.license_name)
if metadata.license_link is not None:
self.gguf.add_license_link(metadata.license_link)
if metadata.url is not None:
self.gguf.add_url(metadata.url)
if metadata.doi is not None:
self.gguf.add_doi(metadata.doi)
if metadata.uuid is not None:
self.gguf.add_uuid(metadata.uuid)
if metadata.repo_url is not None:
self.gguf.add_repo_url(metadata.repo_url)
if metadata.source_url is not None:
self.gguf.add_source_url(metadata.source_url)
if metadata.source_hf_repo is not None:
self.gguf.add_source_hf_repo(metadata.source_hf_repo)
if metadata.source_doi is not None:
self.gguf.add_source_doi(metadata.source_doi)
if metadata.source_uuid is not None:
self.gguf.add_source_uuid(metadata.source_uuid)
if metadata.source_repo_url is not None:
self.gguf.add_source_repo_url(metadata.source_repo_url)
if metadata.base_models is not None:
self.gguf.add_base_model_count(len(metadata.base_models))
for key, base_model_entry in enumerate(metadata.base_models):
if "name" in base_model_entry:
self.gguf.add_base_model_name(key, base_model_entry["name"])
if "author" in base_model_entry:
self.gguf.add_base_model_author(key, base_model_entry["author"])
if "version" in base_model_entry:
self.gguf.add_base_model_version(key, base_model_entry["version"])
if "organization" in base_model_entry:
self.gguf.add_base_model_organization(key, base_model_entry["organization"])
if "url" in base_model_entry:
self.gguf.add_base_model_url(key, base_model_entry["url"])
if "doi" in base_model_entry:
self.gguf.add_base_model_doi(key, base_model_entry["doi"])
if "uuid" in base_model_entry:
self.gguf.add_base_model_uuid(key, base_model_entry["uuid"])
if "repo_url" in base_model_entry:
self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
if metadata.tags is not None:
self.gguf.add_tags(metadata.tags)
if metadata.languages is not None:
self.gguf.add_languages(metadata.languages)
if metadata.datasets is not None:
self.gguf.add_datasets(metadata.datasets)
def add_meta_arch(self, params: Params) -> None:
# Metadata About The Neural Architecture Itself
@ -944,7 +965,7 @@ class OutputFile:
@staticmethod
def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata | None = None,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@ -978,7 +999,7 @@ class OutputFile:
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
metadata: Metadata | None = None,
metadata: gguf.Metadata | None = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@ -1021,35 +1042,32 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
raise ValueError(f"Unexpected combination of types: {name_to_type}")
def model_parameter_count(model: LazyModel) -> int:
total_model_parameters = 0
for i, (name, lazy_tensor) in enumerate(model.items()):
sum_weights_in_tensor = 1
def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]:
total_params = 0
shared_params = 0
expert_params = 0
for name, lazy_tensor in tensors:
# We don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
continue
# Got A Tensor
sum_weights_in_tensor: int = 1
# Tensor Volume
for dim in lazy_tensor.shape:
sum_weights_in_tensor *= dim
total_model_parameters += sum_weights_in_tensor
return total_model_parameters
def model_parameter_count_rounded_notation(model_params_count: int) -> str:
if model_params_count > 1e12 :
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9 :
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6 :
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
if ".experts." in name:
if ".experts.0." in name:
expert_params += sum_weights_in_tensor
else:
# Thousands Of Parameters
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"
shared_params += sum_weights_in_tensor
return f"{round(scaled_model_params)}{scale_suffix}"
total_params += sum_weights_in_tensor
return total_params, shared_params, expert_params
def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
@ -1231,34 +1249,24 @@ class VocabFactory:
return vocab, special_vocab
def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
quantization = {
def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str:
name = metadata.name if metadata.name is not None else None
basename = metadata.basename if metadata.basename is not None else None
finetune = metadata.finetune if metadata.finetune is not None else None
version = metadata.version if metadata.version is not None else None
size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0)
output_type = {
GGMLFileType.AllF32: "F32",
GGMLFileType.MostlyF16: "F16",
GGMLFileType.MostlyQ8_0: "Q8_0",
}[file_type]
parameters = model_parameter_count_rounded_notation(model_params_count)
expert_count = ""
if params.n_experts is not None:
expert_count = f"{params.n_experts}x"
version = ""
if metadata is not None and metadata.version is not None:
version = f"-{metadata.version}"
name = "ggml-model"
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = params.path_model.name
return f"{name}{version}-{expert_count}{parameters}-{quantization}"
return gguf.naming_convention(name, basename, finetune, version, size_label, output_type)
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path:
default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata)
ret = model_paths[0].parent / f"{default_filename}.gguf"
if ret in model_paths:
logger.error(
@ -1297,8 +1305,9 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file")
parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
parser.add_argument("--model-name", type=str, default=None, help="name of the model")
args = parser.parse_args(args_in)
@ -1310,32 +1319,36 @@ def main(args_in: list[str] | None = None) -> None:
else:
logging.basicConfig(level=logging.INFO)
metadata = Metadata.load(args.metadata)
model_name = args.model_name
dir_model = args.model
metadata = gguf.Metadata.load(args.metadata, dir_model, model_name)
if args.get_outfile:
model_plus = load_some_model(args.model)
model_plus = load_some_model(dir_model)
params = Params.load(model_plus)
model = convert_model_names(model_plus.model, params, args.skip_unknown)
model_params_count = model_parameter_count(model_plus.model)
model_params_count = per_model_weight_count_estimation(model_plus.model.items())
ftype = pick_output_type(model, args.outtype)
print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
if (metadata is None or metadata.name is None) and params.path_model is not None:
metadata.name = params.path_model.name
print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100
return
if args.no_vocab and args.vocab_only:
raise ValueError("--vocab-only does not make sense with --no-vocab")
if args.dump_single:
model_plus = lazy_load_file(args.model)
model_plus = lazy_load_file(dir_model)
do_dump_model(model_plus)
return
if not args.vocab_only:
model_plus = load_some_model(args.model)
model_plus = load_some_model(dir_model)
else:
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
model_params_count = model_parameter_count(model_plus.model)
logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None)
if args.dump:
do_dump_model(model_plus)
@ -1368,7 +1381,7 @@ def main(args_in: list[str] | None = None) -> None:
logger.info(f"params = {params}")
model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
vocab_path = Path(args.vocab_dir or dir_model or model_parent_path)
vocab_factory = VocabFactory(vocab_path)
vocab_types = None if args.no_vocab else args.vocab_type.split(",")
vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
@ -1399,13 +1412,21 @@ def main(args_in: list[str] | None = None) -> None:
assert params is not None
if metadata.name is None and params.path_model is not None:
metadata.name = params.path_model.name
model_params_count = per_model_weight_count_estimation(model_plus.model.items())
logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})")
logger.info(f"Vocab info: {vocab}")
logger.info(f"Special vocab info: {special_vocab}")
model = model_plus.model
model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata)
metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0)
params.ftype = ftype
logger.info(f"Writing {outfile}, format {ftype}")
View file
@ -0,0 +1,49 @@
# Migration notice for binary filenames
> [!IMPORTANT]
> [2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
This migration was important, but it is a breaking change that may not always be immediately obvious to users.
Please update all scripts and workflows to use the new binary names (a minimal before/after example is shown after the table).
| Old Filename | New Filename |
| ---- | ---- |
| main | llama-cli |
| server | llama-server |
| llama-bench | llama-bench |
| embedding | llama-embedding |
| quantize | llama-quantize |
| tokenize | llama-tokenize |
| export-lora | llama-export-lora |
| libllava.a | libllava.a |
| baby-llama | llama-baby-llama |
| batched | llama-batched |
| batched-bench | llama-batched-bench |
| benchmark-matmult | llama-benchmark-matmult |
| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
| eval-callback | llama-eval-callback |
| gbnf-validator | llama-gbnf-validator |
| gguf | llama-gguf |
| gguf-split | llama-gguf-split |
| gritlm | llama-gritlm |
| imatrix | llama-imatrix |
| infill | llama-infill |
| llava-cli | llama-llava-cli |
| lookahead | llama-lookahead |
| lookup | llama-lookup |
| lookup-create | llama-lookup-create |
| lookup-merge | llama-lookup-merge |
| lookup-stats | llama-lookup-stats |
| parallel | llama-parallel |
| passkey | llama-passkey |
| perplexity | llama-perplexity |
| q8dot | llama-q8dot |
| quantize-stats | llama-quantize-stats |
| retrieval | llama-retrieval |
| save-load-state | llama-save-load-state |
| simple | llama-simple |
| speculative | llama-speculative |
| vdot | llama-vdot |
| tests/test-c.o | tests/test-c.o |
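For example, a minimal before/after change in an existing script (paths are placeholders) would be:
```bash
# before the rename
./main -m model.gguf -p "Hello"
# after the rename
./llama-cli -m model.gguf -p "Hello"
```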
View file
@ -0,0 +1,35 @@
// Warns users that this filename was deprecated, and provides a link for more information.
#include <cstdio>
#include <string>
#include <unordered_map>
// Main
int main(int argc, char** argv) {
std::string filename = "main";
if (argc >= 1) {
filename = argv[0];
}
// Get only the program name from the full path
auto pos = filename.find_last_of('/');
if (pos != std::string::npos) {
filename = filename.substr(pos+1);
}
// Append "llama-" to the beginning of filename to get the replacement filename
auto replacement_filename = "llama-" + filename;
// The exception is if the filename is "main", then our replacement filename is "llama-cli"
if (filename == "main") {
replacement_filename = "llama-cli";
}
fprintf(stdout, "\n");
fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
fprintf(stdout, "\n");
return EXIT_FAILURE;
}
View file
@ -99,7 +99,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
char src1_str[128] = {0};
if (src1) {
sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
}
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
View file
@ -6,12 +6,11 @@ Apply LORA adapters to base model and export the resulting model.
usage: llama-export-lora [options]
options:
-h, --help show this help message and exit
-m FNAME, --model-base FNAME model path from which to load base model (default '')
-o FNAME, --model-out FNAME path to save exported model (default '')
-l FNAME, --lora FNAME apply LoRA adapter
-s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S
-t N, --threads N number of threads to use during computation (default: 4)
-m, --model model path from which to load base model (default '')
--lora FNAME path to LoRA adapter (can be repeated to use multiple adapters)
--lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)
-t, --threads N number of threads to use during computation (default: 4)
-o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf')
```
For example:
@ -20,7 +19,15 @@ For example:
./bin/llama-export-lora \
-m open-llama-3b-v2-q8_0.gguf \
-o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
--lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
```
Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
```bash
./bin/llama-export-lora \
-m your_base_model.gguf \
-o your_merged_model.gguf \
--lora-scaled lora_task_A.gguf 0.5 \
--lora-scaled lora_task_B.gguf 0.5
```
View file
@ -1,462 +1,420 @@
#include "common.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include <map>
#include <vector>
#include <string>
#include <thread>
#include <fstream>
struct lora_info {
std::string filename;
static bool g_verbose = false;
static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
}
static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
}
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
file.write(&zero, 1);
}
}
static std::string ggml_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}
static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ ctx_ggml,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
if (!ctx_gguf) {
throw std::runtime_error("failed to load input GGUF from " + fname);
}
return ctx_gguf;
}
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result;
for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos);
if (new_pos == std::string::npos) {
result += s.substr(pos, s.size() - pos);
break;
}
result += s.substr(pos, new_pos - pos) + replace;
pos = new_pos;
}
s = std::move(result);
}
struct file_input {
struct ggml_context * ctx_meta = nullptr;
struct gguf_context * ctx_gguf = nullptr;
std::ifstream f_in;
std::map<std::string, ggml_tensor *> tensors;
float alpha;
float scale;
file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
if (!f_in.is_open()) {
throw std::runtime_error("failed to open input gguf from " + fname);
}
ctx_gguf = load_gguf(fname, &ctx_meta);
alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
std::string name(cur->name);
tensors[name] = cur;
if (g_verbose) {
printf("%s: %s\n", __func__, cur->name);
}
}
}
ggml_tensor * get_tensor(std::string name) {
if (tensors.find(name) == tensors.end()) {
return nullptr;
}
return tensors[name];
}
void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
if (tensors.find(name) == tensors.end()) {
throw std::runtime_error("cannot find tensor with name: " + name);
}
auto len = ggml_nbytes(tensors[name]);
if (buf.size() < len) {
buf.resize(len);
}
auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
f_in.seekg(offset);
f_in.read((char* )buf.data(), len);
}
~file_input() {
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
}
};
struct export_lora_params {
std::string fn_model_base;
std::string fn_model_out;
std::vector<struct lora_info> lora;
struct lora_merge_ctx {
// input base model + adapters
file_input base_model;
std::vector<std::unique_ptr<file_input>> adapters;
// for computing merged tensor
int n_threads;
ggml_backend_t backend = nullptr;
ggml_gallocr_t allocr = nullptr;
std::vector<uint8_t> read_buf;
// output file
struct gguf_context * ctx_out;
struct ggml_context * ctx_out_ggml;
std::ofstream fout;
lora_merge_ctx(
std::string & base_fname,
std::vector<std::tuple<std::string, float>> & lora_files,
std::string & outfile,
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
throw std::runtime_error("split model is not yet supported");
}
for (auto lora_inp : lora_files) {
auto fname = std::get<0>(lora_inp);
auto scale = std::get<1>(lora_inp);
std::unique_ptr<file_input> adapter(new file_input(fname, scale));
check_metadata_lora(adapter.get());
adapters.push_back(std::move(adapter));
}
ctx_out = gguf_init_empty();
struct ggml_init_params params = {
/*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
ctx_out_ggml = ggml_init(params);
backend = ggml_backend_cpu_init();
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
}
struct lora_data {
struct lora_info info;
std::vector<uint8_t> data;
struct ggml_context * ctx;
void check_metadata_lora(file_input * adapter) {
auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
if (general_type != "adapter") {
throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
}
uint32_t lora_r;
uint32_t lora_alpha;
};
auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
if (adapter_type != "lora") {
throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture");
if (general_arch_base != general_arch_lora) {
throw std::runtime_error("model arch and LoRA arch mismatch");
}
}
llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
size = 0;
ggml_type get_out_tensor_type(struct ggml_tensor * t) {
if (t->type == GGML_TYPE_F32) {
return GGML_TYPE_F32;
} else {
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
return GGML_TYPE_F16;
}
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
GGML_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void run_merge() {
// prepare metadata
gguf_set_kv(ctx_out, base_model.ctx_gguf);
// output is forced to f16 for now
gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
GGML_ASSERT(ret == 0); // same
// check if all lora adapters have the same tensors
// TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
if (adapters.size() > 1) {
for (size_t i = 1; i < adapters.size(); ++i) {
if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
throw std::runtime_error(err_no_subset_adapter);
}
void read_raw(void * ptr, size_t size) {
if (size == 0) {
return;
for (auto & it : adapters[i]->tensors) {
if (adapters[0]->get_tensor(it.first) == nullptr) {
throw std::runtime_error(err_no_subset_adapter);
}
errno = 0;
std::size_t ret = std::fread(ptr, size, 1, fp);
if (ferror(fp)) {
die_fmt("read error: %s", strerror(errno));
}
if (ret != 1) {
die("unexpectedly reached end of file");
}
}
std::uint32_t read_u32() {
std::uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::string read_string(std::uint32_t len) {
std::vector<char> chars(len);
read_raw(chars.data(), len);
return std::string(chars.data(), len);
}
void write_raw(const void * ptr, size_t size) {
if (size == 0) {
return;
}
errno = 0;
size_t ret = std::fwrite(ptr, size, 1, fp);
if (ret != 1) {
die_fmt("write error: %s", strerror(errno));
}
}
void write_u32(std::uint32_t val) {
write_raw(&val, sizeof(val));
}
bool eof() {
return tell() >= size;
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
};
static struct export_lora_params get_default_export_lora_params() {
struct export_lora_params result;
result.fn_model_base = "";
result.fn_model_out = "";
result.n_threads = GGML_DEFAULT_N_THREADS;
return result;
}
static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -m FNAME, --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
fprintf(stderr, " -o FNAME, --model-out FNAME path to save exported model (default '%s')\n", params->fn_model_out.c_str());
fprintf(stderr, " -l FNAME, --lora FNAME apply LoRA adapter\n");
fprintf(stderr, " -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params->n_threads);
}
static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
bool invalid_param = false;
std::string arg;
struct export_lora_params default_params = get_default_export_lora_params();
const std::string arg_prefix = "--";
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg == "-m" || arg == "--model-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_model_base = argv[i];
} else if (arg == "-o" || arg == "--model-out") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->fn_model_out = argv[i];
} else if (arg == "-l" || arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
break;
}
struct lora_info lora;
lora.filename = argv[i];
lora.scale = 1.0f;
params->lora.push_back(lora);
} else if (arg == "-s" || arg == "--lora-scaled") {
if (++i >= argc) {
invalid_param = true;
break;
}
struct lora_info lora;
lora.filename = argv[i];
if (++i >= argc) {
invalid_param = true;
break;
}
lora.scale = std::stof(argv[i]);
params->lora.push_back(lora);
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
params->n_threads = std::stoi(argv[i]);
if (params->n_threads <= 0) {
params->n_threads = std::thread::hardware_concurrency();
// mapping base tensor to out tensor (same shape with base, but different type)
// if out_tensor == nullptr, we only copy it
std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
for (auto & it : base_model.tensors) {
bool t_a = true;
bool t_b = true;
for (auto & adapter : adapters) {
t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
}
auto base_tensor = it.second;
if (!t_a && !t_b) {
// only copy
struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
ggml_set_name(cpy_tensor, base_tensor->name);
base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
gguf_add_tensor(ctx_out, cpy_tensor);
} else if (t_a && t_b) {
// need merging
struct ggml_tensor * out_tensor = ggml_new_tensor(
ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
ggml_set_name(out_tensor, base_tensor->name);
base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
gguf_add_tensor(ctx_out, out_tensor);
} else {
fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
export_lora_print_usage(argc, argv, &default_params);
exit(1);
throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
}
}
if (params->fn_model_base == default_params.fn_model_base) {
fprintf(stderr, "error: please specify a filename for model-base.\n");
export_lora_print_usage(argc, argv, &default_params);
exit(1);
}
if (params->fn_model_out == default_params.fn_model_out) {
fprintf(stderr, "error: please specify a filename for model-out.\n");
export_lora_print_usage(argc, argv, &default_params);
exit(1);
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
export_lora_print_usage(argc, argv, &default_params);
exit(1);
}
return true;
// placeholder for the meta data
{
size_t meta_size = gguf_get_meta_size(ctx_out);
zeros(fout, meta_size);
}
static void free_lora(struct lora_data * lora) {
if (lora->ctx != NULL) {
ggml_free(lora->ctx);
// process base model tensors
size_t n_merged = 0;
for (auto & it : base_to_out_tensors) {
if (it.second != nullptr) {
merge_tensor(it.first, it.second);
n_merged++;
} else {
copy_tensor(it.first);
}
delete lora;
}
static struct lora_data * load_lora(struct lora_info * info) {
struct lora_data * result = new struct lora_data;
result->info = *info;
result->ctx = NULL;
result->lora_r = 1;
result->lora_alpha = 1;
struct llama_file file(info->filename.c_str(), "rb");
if (file.fp == NULL) {
fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
info->filename.c_str());
free_lora(result);
return NULL;
// write output metadata
{
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
gguf_get_meta_data(ctx_out, data.data());
fout.seekp(0);
fout.write((const char *)data.data(), data.size());
}
struct ggml_init_params params_ggml;
params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
params_ggml.mem_buffer = NULL;
params_ggml.no_alloc = true;
result->ctx = ggml_init(params_ggml);
uint32_t magic = file.read_u32();
if (magic != LLAMA_FILE_MAGIC_GGLA) {
die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
}
uint32_t version = file.read_u32();
if (version != 1) {
die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
}
result->lora_r = file.read_u32();
result->lora_alpha = file.read_u32();
// read tensor infos from file
std::vector<char> name_buf;
std::vector<struct ggml_tensor *> tensors;
std::vector<size_t> tensors_offset;
size_t total_nbytes_pad = 0;
while(!file.eof()) {
int64_t ne[4] = {1,1,1,1};
uint32_t n_dims = file.read_u32();
uint32_t namelen = file.read_u32();
uint32_t type = file.read_u32();
for (uint32_t k = 0; k < n_dims; ++k) {
ne[k] = (int64_t)file.read_u32();
}
name_buf.clear();
name_buf.resize(namelen + 1, '\0');
file.read_raw(name_buf.data(), namelen);
file.seek((0-file.tell()) & 31, SEEK_CUR);
size_t offset = file.tell();
struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
ggml_set_name(tensor, name_buf.data());
size_t nbytes = ggml_nbytes(tensor);
size_t nbytes_pad = ggml_nbytes_pad(tensor);
total_nbytes_pad += nbytes_pad;
tensors.push_back(tensor);
tensors_offset.push_back(offset);
file.seek(nbytes, SEEK_CUR);
}
// read tensor data
result->data.resize(total_nbytes_pad);
size_t data_offset = 0;
for (size_t i = 0; i < tensors.size(); ++i) {
struct ggml_tensor * tensor = tensors[i];
size_t offset = tensors_offset[i];
size_t nbytes = ggml_nbytes(tensor);
size_t nbytes_pad = ggml_nbytes_pad(tensor);
file.seek(offset, SEEK_SET);
tensor->data = result->data.data() + data_offset;
file.read_raw(tensor->data, nbytes);
data_offset += nbytes_pad;
}
return result;
printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
}
static struct ggml_cgraph * build_graph_lora(
struct ggml_context * ctx,
struct ggml_tensor * tensor,
struct ggml_tensor * lora_a,
struct ggml_tensor * lora_b,
float scaling
) {
struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
if (scaling != 1.0f) {
ab = ggml_scale(ctx, ab, scaling);
}
struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
struct ggml_cgraph * gf = ggml_new_graph(ctx);
ggml_build_forward_expand (gf, res);
return gf;
void copy_tensor(struct ggml_tensor * base) {
printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
size_t len = ggml_nbytes(base);
base_model.read_tensor_data(base->name, read_buf);
fout.write((char* )read_buf.data(), len);
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
}
static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
if (lora->ctx == NULL) {
return false;
void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
std::string name_base(base->name);
std::string name_lora_a = name_base + ".lora_a";
std::string name_lora_b = name_base + ".lora_b";
printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
// context for input tensor
std::vector<struct ggml_tensor *> inp_a(adapters.size());
std::vector<struct ggml_tensor *> inp_b(adapters.size());
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
struct ggml_context * ctx = ggml_init(params);
// alloc tensors
struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
for (size_t i = 0; i < adapters.size(); ++i) {
auto t_a = adapters[i]->get_tensor(name_lora_a);
auto t_b = adapters[i]->get_tensor(name_lora_b);
inp_a[i] = ggml_dup_tensor(ctx, t_a);
inp_b[i] = ggml_dup_tensor(ctx, t_b);
}
std::string name = ggml_get_name(tensor);
std::string name_a = name + std::string(".loraA");
std::string name_b = name + std::string(".loraB");
struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
if (lora_a == NULL || lora_b == NULL) {
return false;
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
// load base tensor to backend buffer
base_model.read_tensor_data(name_base, read_buf);
if (base->type != GGML_TYPE_F32) {
// optionally dequantize it
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
auto nels = ggml_nelements(inp_base);
ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
} else {
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
}
float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
// load lora tensors to backend buffer
for (size_t i = 0; i < adapters.size(); ++i) {
adapters[i]->read_tensor_data(name_lora_a, read_buf);
ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
adapters[i]->read_tensor_data(name_lora_b, read_buf);
ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
}
struct ggml_init_params params;
params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
params.mem_buffer = NULL;
params.no_alloc = true;
struct ggml_context * ctx = NULL;
struct ggml_gallocr * alloc = NULL;
struct ggml_cgraph * gf = NULL;
// build graph
struct ggml_cgraph * gf;
{
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ true,
};
struct ggml_context * ctx0 = ggml_init(params0);
gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur = inp_base;
for (size_t i = 0; i < adapters.size(); ++i) {
struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
// scale
const float alpha = adapters[i]->alpha;
const float rank = (float) inp_b[i]->ne[0];
const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
delta = ggml_scale(ctx0, delta, scale);
cur = ggml_add(ctx0, delta, cur);
printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
}
cur = ggml_cast(ctx0, cur, out->type);
printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type));
ggml_build_forward_expand(gf, cur);
ggml_free(ctx0);
}
ctx = ggml_init(params);
alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
// compute
{
ggml_gallocr_alloc_graph(allocr, gf);
ggml_backend_cpu_set_n_threads(backend, n_threads);
ggml_backend_graph_compute(backend, gf);
}
ggml_gallocr_alloc_graph(alloc, gf);
// write data to output file
{
auto result = gf->nodes[gf->n_nodes - 1];
size_t len = ggml_nbytes(result);
if (read_buf.size() < len) {
read_buf.resize(len);
}
ggml_backend_tensor_get(result, read_buf.data(), 0, len);
fout.write((char* )read_buf.data(), len);
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
}
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
static std::vector<uint8_t> data_work;
data_work.resize(cplan.work_size);
cplan.work_data = data_work.data();
ggml_graph_compute(gf, &cplan);
ggml_gallocr_free(alloc);
ggml_free(ctx);
return true;
ggml_backend_buffer_free(buffer);
}
static void export_lora(struct export_lora_params * params) {
// load all loras
std::vector<struct lora_data *> loras;
for (size_t i = 0; i < params->lora.size(); ++i) {
struct lora_data * lora = load_lora(&params->lora[i]);
if (lora != NULL) {
loras.push_back(lora);
}
}
if (loras.size() == 0) {
fprintf(stderr, "warning: no lora adapters will be applied.\n");
~lora_merge_ctx() {
ggml_gallocr_free(allocr);
ggml_backend_free(backend);
gguf_free(ctx_out);
ggml_free(ctx_out_ggml);
}
};
// open input file
struct llama_file fin(params->fn_model_base.c_str(), "rb");
if (!fin.fp) {
die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
// open base model gguf, read tensors without their data
struct ggml_context * ctx_in;
struct gguf_init_params params_gguf;
params_gguf.no_alloc = true;
params_gguf.ctx = &ctx_in;
struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
// create new gguf
struct gguf_context * gguf_out = gguf_init_empty();
// copy meta data from base model: kv and tensors
gguf_set_kv(gguf_out, gguf_in);
int n_tensors = gguf_get_n_tensors(gguf_in);
for (int i=0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(gguf_in, i);
struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
gguf_add_tensor(gguf_out, tensor);
}
// create output file
struct llama_file fout(params->fn_model_out.c_str(), "wb");
if (!fout.fp) {
die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
}
// write gguf meta data
std::vector<uint8_t> meta;
meta.resize(gguf_get_meta_size(gguf_out));
gguf_get_meta_data(gguf_out, meta.data());
fout.write_raw(meta.data(), meta.size());
std::vector<uint8_t> data;
std::vector<uint8_t> padding;
for (int i=0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(gguf_in, i);
struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
// read tensor data
data.resize(ggml_nbytes(tensor));
tensor->data = data.data();
size_t offset = gguf_get_tensor_offset(gguf_in, i);
fin.seek(offset + meta.size(), SEEK_SET);
fin.read_raw(data.data(), data.size());
// apply all loras
for (size_t k = 0; k < loras.size(); ++k) {
apply_lora(tensor, loras[k], params->n_threads);
}
// write tensor data + padding
padding.clear();
padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
GGML_ASSERT(fout.tell() == offset + meta.size());
// fout.seek(offset + meta.size(), SEEK_SET);
fout.write_raw(data.data(), data.size());
fout.write_raw(padding.data(), padding.size());
if (i % 2 == 0) {
printf(".");
}
}
printf("\nexample usage:\n");
printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
printf("\nNOTE: output model is F16\n");
printf("\n");
// close gguf
gguf_free(gguf_out);
gguf_free(gguf_in);
// free loras
for (size_t i = 0; i < loras.size(); ++i) {
free_lora(loras[i]);
}
}
int main(int argc, char ** argv) {
struct export_lora_params params = get_default_export_lora_params();
gpt_params params;
if (!export_lora_params_parse(argc, argv, &params)) {
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
export_lora(&params);
g_verbose = (params.verbosity == 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());
exit(EXIT_FAILURE);
}
printf("done, output file is %s\n", params.lora_outfile.c_str());
return 0;
}
View file
@ -1,5 +0,0 @@
set(TARGET llama-finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
View file
@ -1,90 +0,0 @@
# finetune
Basic usage instructions:
```bash
# get training data
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
# finetune LORA adapter
./bin/llama-finetune \
--model-base open-llama-3b-v2-q8_0.gguf \
--checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
--checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
--lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
--train-data "shakespeare.txt" \
--save-every 10 \
--threads 6 --adam-iter 30 --batch 4 --ctx 64 \
--use-checkpointing
# predict
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```
**Only llama-based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
So in above example after 10 iterations these files will be written:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
After 10 more iterations:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.
llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.
In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.
For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:
```bash
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
--lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```
You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.
For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:
```bash
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
--lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
--lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
The default LORA rank can be specified with `--lora-r N`.
The LORA rank can be configured for each model tensor type separately with these command line options:
```bash
--lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
--rank-att-norm N LORA rank for attention norm tensor (default 1)
--rank-ffn-norm N LORA rank for feed-forward norm tensor (default 1)
--rank-out-norm N LORA rank for output norm tensor (default 1)
--rank-tok-embd N LORA rank for token embeddings tensor (default 4)
--rank-out N LORA rank for output tensor (default 4)
--rank-wq N LORA rank for wq tensor (default 4)
--rank-wk N LORA rank for wk tensor (default 4)
--rank-wv N LORA rank for wv tensor (default 4)
--rank-wo N LORA rank for wo tensor (default 4)
--rank-ffn_gate N LORA rank for ffn_gate tensor (default 4)
--rank-ffn_down N LORA rank for ffn_down tensor (default 4)
--rank-ffn_up N LORA rank for ffn_up tensor (default 4)
```
The LORA rank of 'norm' tensors should always be 1.
To see all available options use `llama-finetune --help`.
View file
@ -1,487 +0,0 @@
#!/usr/bin/env python3
# finetune checkpoint --> gguf conversion
import argparse
import gguf
import struct
import numpy as np
from pathlib import Path
# gguf constants
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
LLM_KV_TRAINING_TYPE = "training.type"
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"
LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"
LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"
LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"
LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"
LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"
LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"
LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"
LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"
LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"
LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"
class Tensor:
def __init__(self, dtype='f', ne=None):
if ne is None:
ne = []
self.dtype = dtype
self.ne = ne
self.nbytes = 0
if self.dtype == 'f':
if len(self.ne) == 0:
self.nbytes = 0
else:
self.nbytes = int(np.prod(self.ne)) * 4
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
def load(self, data, offset):
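        # tensor header layout: uint32 n_dims, uint32 name length, uint32 dtype,
        # then the dims, the name bytes, and the 32-byte-aligned tensor data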
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
assert(nd == len(self.ne))
ne = []
for d in range(nd):
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
ne.append(n)
if tuple(ne) != tuple(self.ne):
raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
if self.dtype == 'f':
assert(dtype == 0)
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
self.name = bytes(data[offset:offset+namelen]); offset += namelen
# 32-byte alignment
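        # (0 - offset) & 31 is the padding needed to reach the next multiple of 32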
offset += (0 - offset) & 31
self.data = data[offset:offset+self.nbytes]
offset += self.nbytes
return offset
def max_storage_size(self):
result = 0
result += 4 # nd
result += 4 # namelen
result += 4 # dtype
result += len(self.ne)*8 # ne
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
result += 31 # 32-byte alignment
result += self.nbytes
return result
def save_gguf(self, gguf_writer, name):
gguf_writer.add_tensor(
name=name,
tensor=self.data,
raw_shape=np.array(list(reversed(self.ne))),
raw_dtype=gguf.GGMLQuantizationType.F32)
class OptimizationContext:
def __init__(self):
pass
def load(self, data, offset):
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
offset += 4
if self.version != 1:
raise ValueError('Invalid version of optimization context in checkpoint file')
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.nx = struct.unpack('<Q', bytes(data[offset:offset + 8]))[0]; offset += 8  # read as an explicit little-endian uint64; the native 'N' (size_t) format is platform-dependent
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
self.adam_m = Tensor('f', [self.nx])
self.adam_v = Tensor('f', [self.nx])
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_x = Tensor('f', [self.nx])
self.lbfgs_xp = Tensor('f', [self.nx])
self.lbfgs_g = Tensor('f', [self.nx])
self.lbfgs_gp = Tensor('f', [self.nx])
self.lbfgs_d = Tensor('f', [self.nx])
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
# forgot to save type in version 1:
# guess self.type from number of remaining bytes
size_type_0 = 12 + sum([t.max_storage_size() for t in
[self.adam_m, self.adam_v]
+([self.adam_pf] if (self.past > 0) else [])])
size_type_1 = 24 + sum([t.max_storage_size() for t in
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
self.lbfgs_lmal, self.lbfgs_lmys,
self.lbfgs_lms, self.lbfgs_lmy]
+([self.lbfgs_pf] if (self.past > 0) else [])])
        # due to alignment padding the size might not be exact
# but the difference in size for both types is significant,
# so we can just use whichever is closest
remaining = len(data) - offset
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
self.type = 0
else:
self.type = 1
if self.type == 0:
offset = self.adam_m.load(data, offset)
offset = self.adam_v.load(data, offset)
offset = self.adam_pf.load(data,offset)
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
elif self.type == 1:
offset = self.lbfgs_x.load(data, offset)
offset = self.lbfgs_xp.load(data, offset)
offset = self.lbfgs_g.load(data, offset)
offset = self.lbfgs_gp.load(data, offset)
offset = self.lbfgs_d.load(data, offset)
offset = self.lbfgs_pf.load(data, offset)
offset = self.lbfgs_lmal.load(data, offset)
offset = self.lbfgs_lmys.load(data, offset)
offset = self.lbfgs_lms.load(data, offset)
offset = self.lbfgs_lmy.load(data, offset)
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
else:
raise ValueError(f"Invalid optimizer type '{self.type}'")
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
if self.type == 0:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
if self.past > 0:
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
elif self.type == 1:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
if self.past > 0:
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
else:
raise ValueError('Unknown optimizer type')
class LoraParams:
def __init__(self):
pass
def load(self, data, offset):
self.n_rank_attention_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wq = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wk = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wv = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_wo = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_ffn_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
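        # w1/w2/w3 hold the ffn_gate/ffn_down/ffn_up ranks (see the save_gguf mapping below)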
self.n_rank_w1 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_w2 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_w3 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_tok_embeddings = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rank_output = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, self.n_rank_tok_embeddings)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT, self.n_rank_output)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, self.n_rank_attention_norm)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q, self.n_rank_wq)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K, self.n_rank_wk)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V, self.n_rank_wv)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, self.n_rank_wo)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM, self.n_rank_ffn_norm)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE, self.n_rank_w1)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, self.n_rank_w2)
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP, self.n_rank_w3)
class ModelParams:
def __init__(self, n_ff = None):
self.n_ff = n_ff
def load(self, data, offset):
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
def get_n_ff(self):
if self.n_ff is None:
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
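            # e.g. n_embd=4096, n_mult=256: 2*(4*4096)//3 = 10922, rounded up to a multiple of 256 gives 11008 (the n_ff of LLaMA-7B)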
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
else:
return self.n_ff
def save_gguf(self, gguf_writer):
# self.n_vocab not saved
gguf_writer.add_embedding_length(self.n_embd)
gguf_writer.add_head_count(self.n_head)
gguf_writer.add_block_count(self.n_layer)
gguf_writer.add_rope_dimension_count(self.n_rot)
gguf_writer.add_feed_forward_length(self.get_n_ff())
def tensor_name(key, bid=None, suffix=".weight"):
return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
class Layer:
def __init__(self, params, lora_params, bid):
self.bid = bid
self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
self.wq_a = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
self.wq_b = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
self.wk_a = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
self.wk_b = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
self.wv_a = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
self.wv_b = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
self.wo_a = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
self.wo_b = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
self.w1_a = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
self.w1_b = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
self.w2_a = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
self.w2_b = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
self.w3_a = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
self.w3_b = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
def load(self, data, offset):
offset = self.att_norm_a.load(data, offset)
offset = self.att_norm_b.load(data, offset)
offset = self.wq_a.load(data, offset)
offset = self.wq_b.load(data, offset)
offset = self.wk_a.load(data, offset)
offset = self.wk_b.load(data, offset)
offset = self.wv_a.load(data, offset)
offset = self.wv_b.load(data, offset)
offset = self.wo_a.load(data, offset)
offset = self.wo_b.load(data, offset)
offset = self.ffn_norm_a.load(data, offset)
offset = self.ffn_norm_b.load(data, offset)
offset = self.w1_a.load(data, offset)
offset = self.w1_b.load(data, offset)
offset = self.w2_a.load(data, offset)
offset = self.w2_b.load(data, offset)
offset = self.w3_a.load(data, offset)
offset = self.w3_b.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
self.wq_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_a"))
self.wq_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_b"))
self.wk_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_a"))
self.wk_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_b"))
self.wv_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_a"))
self.wv_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_b"))
self.wo_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_a"))
self.wo_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_b"))
self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_a"))
self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_b"))
self.w1_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_a"))
self.w1_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_b"))
self.w2_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_a"))
self.w2_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_b"))
self.w3_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_a"))
self.w3_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_b"))
class LoraModel:
def __init__(self, n_ff = None):
self.params = ModelParams(n_ff = n_ff)
self.lora_params = LoraParams()
self.layers = []
def load(self, data, offset):
offset = self.params.load(data, offset)
offset = self.lora_params.load(data, offset)
self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
self.norm_a = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
self.norm_b = Tensor('f', [self.lora_params.n_rank_norm, 1])
self.output_a = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
self.output_b = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
offset = self.tok_embd_a.load(data, offset)
offset = self.tok_embd_b.load(data, offset)
offset = self.norm_a.load(data, offset)
offset = self.norm_b.load(data, offset)
offset = self.output_a.load(data, offset)
offset = self.output_b.load(data, offset)
self.layers.clear()
for bid in range(self.params.n_layer):
layer = Layer(self.params, self.lora_params, bid)
offset = layer.load(data, offset)
self.layers.append(layer)
return offset
def save_gguf(self, gguf_writer):
self.params.save_gguf(gguf_writer)
self.lora_params.save_gguf(gguf_writer)
self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_a"))
self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_b"))
self.norm_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
self.norm_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
self.output_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_a"))
self.output_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_b"))
for layer in self.layers:
layer.save_gguf(gguf_writer)
class LoraCheckpoint:
def __init__(self, n_ff = None):
self.model = LoraModel(n_ff = n_ff)
self.opt_ctx = OptimizationContext()
def load(self, data, offset):
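        # the magic is written as a little-endian uint32, so the raw bytes are reversed before comparing against b'ggcl'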
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
if magic != b'ggcl':
raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
if self.version != 0:
raise ValueError('Invalid version of checkpoint file')
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
offset = self.model.load(data, offset)
offset = self.opt_ctx.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
gguf_writer.add_layer_norm_rms_eps(1e-5)
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
self.model.save_gguf(gguf_writer)
self.opt_ctx.save_gguf(gguf_writer)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
parser.add_argument('--input', '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
    parser.add_argument('--ff', type = int, help = "Feedforward size; if not provided, it is computed from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
return parser.parse_args()
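# Example invocation (a sketch; the checkpoint filename is hypothetical and the
# script is assumed to be saved as convert-finetune-checkpoint-to-gguf.py):
#   python3 convert-finetune-checkpoint-to-gguf.py \
#       --input  chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
#       --output chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf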
def main():
cfg = handle_args()
print(cfg)
data = np.memmap(cfg.input, mode = 'r')
chk = LoraCheckpoint(n_ff = cfg.ff)
offset = 0
offset = chk.load(data, offset)
# we should have read all available data
assert(offset == len(data))
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
chk.save_gguf(gguf_writer)
print(" gguf: write header")
gguf_writer.write_header_to_file()
print(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
if __name__ == '__main__':
main()

File diff suppressed because it is too large

View file

@@ -1,34 +0,0 @@
#!/bin/bash
cd `dirname $0`
cd ../..
EXE="./llama-finetune"
if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing.
while getopts "dg" opt; do
case $opt in
d)
DEBUGGER="gdb --args"
;;
g)
EXE="./build/bin/Release/finetune"
GPUARG="--gpu-layers 25"
;;
esac
done
$DEBUGGER $EXE \
--model-base $MODEL \
$GPUARG \
--checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
--checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
--lora-out lora-ol3b-shakespeare-ITERATION.bin \
        --train-data "$LLAMA_TRAINING_DIR/shakespeare.txt" \
--save-every 10 \
--threads 10 --adam-iter 30 --batch 4 --ctx 64 \
--use-checkpointing

View file

@@ -16,20 +16,25 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
auto decoded = decode_utf8(input_str, {});
const auto & code_points = decoded.first;
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
size_t pos = 0;
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
auto prev_stacks = grammar->stacks;
llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
if (grammar->stacks.empty()) {
const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
if (cur_stacks.empty()) {
error_pos = pos;
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
grammar->stacks = prev_stacks;
cur_stacks = prev_stacks;
return false;
}
++pos;
}
for (const auto & stack : grammar->stacks) {
for (const auto & stack : cur_stacks) {
if (stack.empty()) {
return true;
}

View file

@@ -201,6 +201,6 @@ Verification results for test.gguf.manifest - Success
These micro C library dependencies were installed via the [clib C package manager](https://github.com/clibs):
- https://github.com/mofosyne/xxHash (From: https://github.com/Cyan4973/xxHash)
- https://github.com/Cyan4973/xxHash
- https://github.com/clibs/sha1/
- https://github.com/jb55/sha256.c

View file

@@ -1,7 +1,7 @@
{
"name": "xxhash",
"version": "0.8.2",
"repo": "mofosyne/xxhash",
"repo": "Cyan4973/xxhash",
"description": "Extremely fast non-cryptographic hash algorithm",
"keywords": ["xxhash", "hashing"],
"license": "BSD-2-Clause",

View file

@@ -347,7 +347,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
@@ -384,7 +384,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
char hex_result[41] = {0};
for (int offset = 0; offset < 20; offset++) {
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
@@ -421,7 +421,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
@@ -460,7 +460,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
@@ -490,7 +490,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
char hex_result[41];
for (int offset = 0; offset < 20; offset++) {
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
@@ -520,7 +520,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
@@ -552,7 +552,7 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
generate_uuidv5(result, uuid);
char string_buffer[37] = {0};
sprintf(string_buffer, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid[0], uuid[1], uuid[2], uuid[3],
uuid[4], uuid[5], uuid[6], uuid[7],
uuid[8], uuid[9], uuid[10], uuid[11],

View file

@@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) {
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
if (!ctx) {
fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
return false;
}
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));

View file

@@ -1,6 +1,6 @@
# llama.cpp/examples/imatrix
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantum models.
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
## Usage

View file

@@ -23,6 +23,10 @@
#include "ggml-cuda.h"
#include "ggml-sycl.h"
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
// utils
static uint64_t get_time_ns() {
using clock = std::chrono::high_resolution_clock;
@@ -120,6 +124,17 @@ static std::string get_gpu_info() {
id += "/";
}
}
#endif
#ifdef GGML_USE_CANN
uint32_t count = ggml_backend_cann_get_device_count();
for (uint32_t i = 0; i < count; i++) {
char buf[128];
ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";
}
}
#endif
// TODO: other backends
return id;

View file

@@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
return env->NewStringUTF("");
return nullptr;
}
auto new_token_chars = llama_token_to_piece(context, new_token_id);

View file

@@ -26,11 +26,12 @@ actor LlamaContext {
private var context: OpaquePointer
private var batch: llama_batch
private var tokens_list: [llama_token]
var is_done: Bool = false
/// This variable is used to store temporarily invalid cchars
private var temporary_invalid_cchars: [CChar]
var n_len: Int32 = 64
var n_len: Int32 = 1024
var n_cur: Int32 = 0
var n_decode: Int32 = 0
@@ -160,6 +161,7 @@ actor LlamaContext {
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
print("\n")
is_done = true
let new_token_str = String(cString: temporary_invalid_cchars + [0])
temporary_invalid_cchars.removeAll()
return new_token_str

View file

@@ -132,7 +132,7 @@ class LlamaState: ObservableObject {
messageLog += "\(text)"
Task.detached {
while await llamaContext.n_cur < llamaContext.n_len {
while await !llamaContext.is_done {
let result = await llamaContext.completion_loop()
await MainActor.run {
self.messageLog += "\(result)"

View file

@@ -16,6 +16,10 @@
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -1001,6 +1005,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
LOG_TEE("%s: CLIP using Metal backend\n", __func__);
#endif
#ifdef GGML_USE_CANN
new_clip->backend = ggml_backend_cann_init(0);
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
#endif
if (!new_clip->backend) {
new_clip->backend = ggml_backend_cpu_init();

View file

@@ -31,7 +31,6 @@ int main(int argc, char ** argv){
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
// tokenize the prompt
std::vector<llama_token> inp;
@@ -65,7 +64,7 @@
}
const int n_input = inp.size();
const int n_ctx = params.n_ctx;
const int n_ctx = llama_n_ctx(ctx);
int n_drafted = 0;
int n_accept = 0;

View file

@@ -39,7 +39,6 @@ int main(int argc, char ** argv){
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
// tokenize the prompt
std::vector<llama_token> inp;

View file

@@ -124,6 +124,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
auto formatted = llama_chat_format_single(
model, g_params->chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content});
LOG("formatted: %s\n", formatted.c_str());
return formatted;
}
@@ -289,8 +290,13 @@ int main(int argc, char ** argv) {
// Should not run without any tokens
if (embd_inp.empty()) {
if (add_bos) {
embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
} else {
LOG_TEE("error: input is empty\n");
return -1;
}
}
// Tokenize negative prompt

View file

@@ -6,7 +6,7 @@ import re
from copy import copy
from enum import Enum
from inspect import getdoc, isclass
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
from docstring_parser import parse
from pydantic import BaseModel, create_model
@@ -53,35 +53,38 @@ class PydanticDataType(Enum):
def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
if isclass(pydantic_type) and issubclass(pydantic_type, str):
origin_type = get_origin(pydantic_type)
origin_type = pydantic_type if origin_type is None else origin_type
if isclass(origin_type) and issubclass(origin_type, str):
return PydanticDataType.STRING.value
elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
elif isclass(origin_type) and issubclass(origin_type, bool):
return PydanticDataType.BOOLEAN.value
elif isclass(pydantic_type) and issubclass(pydantic_type, int):
elif isclass(origin_type) and issubclass(origin_type, int):
return PydanticDataType.INTEGER.value
elif isclass(pydantic_type) and issubclass(pydantic_type, float):
elif isclass(origin_type) and issubclass(origin_type, float):
return PydanticDataType.FLOAT.value
elif isclass(pydantic_type) and issubclass(pydantic_type, Enum):
elif isclass(origin_type) and issubclass(origin_type, Enum):
return PydanticDataType.ENUM.value
elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
return format_model_and_field_name(pydantic_type.__name__)
elif get_origin(pydantic_type) is list:
elif isclass(origin_type) and issubclass(origin_type, BaseModel):
return format_model_and_field_name(origin_type.__name__)
elif origin_type is list:
element_type = get_args(pydantic_type)[0]
return f"{map_pydantic_type_to_gbnf(element_type)}-list"
elif get_origin(pydantic_type) is set:
elif origin_type is set:
element_type = get_args(pydantic_type)[0]
return f"{map_pydantic_type_to_gbnf(element_type)}-set"
elif get_origin(pydantic_type) is Union:
elif origin_type is Union:
union_types = get_args(pydantic_type)
union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
return f"union-{'-or-'.join(union_rules)}"
elif get_origin(pydantic_type) is Optional:
elif origin_type is Optional:
element_type = get_args(pydantic_type)[0]
return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
elif isclass(pydantic_type):
return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
elif get_origin(pydantic_type) is dict:
elif isclass(origin_type):
return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(origin_type.__name__)}"
elif origin_type is dict:
key_type, value_type = get_args(pydantic_type)
return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
else:
@@ -118,7 +121,7 @@ def get_members_structure(cls, rule_name):
# Modify this comprehension
members = [
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}'
for name, param_type in cls.__annotations__.items()
for name, param_type in get_type_hints(cls).items()
if name != "self"
]
@@ -297,17 +300,20 @@ def generate_gbnf_rule_for_type(
field_name = format_model_and_field_name(field_name)
gbnf_type = map_pydantic_type_to_gbnf(field_type)
if isclass(field_type) and issubclass(field_type, BaseModel):
origin_type = get_origin(field_type)
origin_type = field_type if origin_type is None else origin_type
if isclass(origin_type) and issubclass(origin_type, BaseModel):
nested_model_name = format_model_and_field_name(field_type.__name__)
nested_model_rules, _ = generate_gbnf_grammar(field_type, processed_models, created_rules)
rules.extend(nested_model_rules)
gbnf_type, rules = nested_model_name, rules
elif isclass(field_type) and issubclass(field_type, Enum):
elif isclass(origin_type) and issubclass(origin_type, Enum):
enum_values = [f'"\\"{e.value}\\""' for e in field_type] # Adding escaped quotes
enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}"
rules.append(enum_rule)
gbnf_type, rules = model_name + "-" + field_name, rules
elif get_origin(field_type) == list: # Array
elif origin_type is list: # Array
element_type = get_args(field_type)[0]
element_rule_name, additional_rules = generate_gbnf_rule_for_type(
model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
@@ -317,7 +323,7 @@ def generate_gbnf_rule_for_type(
rules.append(array_rule)
gbnf_type, rules = model_name + "-" + field_name, rules
elif get_origin(field_type) == set or field_type == set: # Array
elif origin_type is set: # Array
element_type = get_args(field_type)[0]
element_rule_name, additional_rules = generate_gbnf_rule_for_type(
model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
@@ -371,7 +377,7 @@ def generate_gbnf_rule_for_type(
gbnf_type = f"{model_name}-{field_name}-optional"
else:
gbnf_type = f"{model_name}-{field_name}-union"
elif isclass(field_type) and issubclass(field_type, str):
elif isclass(origin_type) and issubclass(origin_type, str):
if field_info and hasattr(field_info, "json_schema_extra") and field_info.json_schema_extra is not None:
triple_quoted_string = field_info.json_schema_extra.get("triple_quoted_string", False)
markdown_string = field_info.json_schema_extra.get("markdown_code_block", False)
@@ -387,8 +393,8 @@ def generate_gbnf_rule_for_type(
gbnf_type = PydanticDataType.STRING.value
elif (
isclass(field_type)
and issubclass(field_type, float)
isclass(origin_type)
and issubclass(origin_type, float)
and field_info
and hasattr(field_info, "json_schema_extra")
and field_info.json_schema_extra is not None
@@ -413,8 +419,8 @@ def generate_gbnf_rule_for_type(
)
elif (
isclass(field_type)
and issubclass(field_type, int)
isclass(origin_type)
and issubclass(origin_type, int)
and field_info
and hasattr(field_info, "json_schema_extra")
and field_info.json_schema_extra is not None
@@ -462,7 +468,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
if not issubclass(model, BaseModel):
# For non-Pydantic classes, generate model_fields from __annotations__ or __init__
if hasattr(model, "__annotations__") and model.__annotations__:
model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()} # pyright: ignore[reportGeneralTypeIssues]
model_fields = {name: (typ, ...) for name, typ in get_type_hints(model).items()}
else:
init_signature = inspect.signature(model.__init__)
parameters = init_signature.parameters
@@ -470,7 +476,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
name != "self"}
else:
# For Pydantic models, use model_fields and check for ellipsis (required fields)
model_fields = model.__annotations__
model_fields = get_type_hints(model)
model_rule_parts = []
nested_rules = []
@@ -706,7 +712,7 @@ def generate_markdown_documentation(
else:
documentation += f" Fields:\n" # noqa: F541
if isclass(model) and issubclass(model, BaseModel):
for name, field_type in model.__annotations__.items():
for name, field_type in get_type_hints(model).items():
# if name == "markdown_code_block":
# continue
if get_origin(field_type) == list:
@@ -754,14 +760,17 @@ def generate_field_markdown(
field_info = model.model_fields.get(field_name)
field_description = field_info.description if field_info and field_info.description else ""
if get_origin(field_type) == list:
origin_type = get_origin(field_type)
origin_type = field_type if origin_type is None else origin_type
if origin_type == list:
element_type = get_args(field_type)[0]
field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})"
if field_description != "":
field_text += ":\n"
else:
field_text += "\n"
elif get_origin(field_type) == Union:
elif origin_type == Union:
element_types = get_args(field_type)
types = []
for element_type in element_types:
@@ -792,9 +801,9 @@ def generate_field_markdown(
example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example
field_text += f"{indent} Example: {example_text}\n"
if isclass(field_type) and issubclass(field_type, BaseModel):
if isclass(origin_type) and issubclass(origin_type, BaseModel):
field_text += f"{indent} Details:\n"
for name, type_ in field_type.__annotations__.items():
for name, type_ in get_type_hints(field_type).items():
field_text += generate_field_markdown(name, type_, field_type, depth + 2)
return field_text
@@ -855,7 +864,7 @@ def generate_text_documentation(
if isclass(model) and issubclass(model, BaseModel):
documentation_fields = ""
for name, field_type in model.__annotations__.items():
for name, field_type in get_type_hints(model).items():
# if name == "markdown_code_block":
# continue
if get_origin(field_type) == list:
@@ -948,7 +957,7 @@ def generate_field_text(
if isclass(field_type) and issubclass(field_type, BaseModel):
field_text += f"{indent} Details:\n"
for name, type_ in field_type.__annotations__.items():
for name, type_ in get_type_hints(field_type).items():
field_text += generate_field_text(name, type_, field_type, depth + 2)
return field_text

examples/pydantic_models_to_grammar_examples.py Normal file → Executable file
View file

@@ -1,8 +1,15 @@
# Function calling example using pydantic models.
#!/usr/bin/env python3
"""Function calling example using pydantic models."""
from __future__ import annotations
import argparse
import datetime
import json
import logging
import textwrap
import sys
from enum import Enum
from typing import Optional, Union
@@ -12,28 +19,54 @@ from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert
create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
# Function to get completion on the llama.cpp server with grammar.
def create_completion(prompt, grammar):
def create_completion(host, prompt, gbnf_grammar):
"""Calls the /completion API on llama-server.
See
https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
"""
print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}")
headers = {"Content-Type": "application/json"}
data = {"prompt": prompt, "grammar": grammar}
response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
data = response.json()
print(data["content"])
return data["content"]
data = {"prompt": prompt, "grammar": gbnf_grammar}
result = requests.post(f"http://{host}/completion", headers=headers, json=data).json()
    assert result.get("error") is None, result
logging.info("Result: %s", result)
content = result["content"]
print(f" Model: {result['model']}")
print(f" Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), ' ')}")
return content
# A function for the agent to send a message to the user.
class SendMessageToUser(BaseModel):
"""
Send a message to the User.
"""
"""Send a message to the User."""
chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
message: str = Field(..., description="Message you want to send to the user.")
def run(self):
print(self.message)
print(f"SendMessageToUser: {self.message}")
def example_rce(host):
"""Minimal test case where the LLM call an arbitrary python function."""
print("- example_rce")
tools = [SendMessageToUser]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tools, outer_object_name="function",
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
user_message = "What is 42 * 42?"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
tools_map = {tool.__name__:tool for tool in tools}
# This finds "SendMessageToUser":
tool = tools_map.get(json_data["function"])
if not tool:
print(f"Error: unknown tool {json_data['function']}")
return 1
tool(**json_data["function_parameters"]).run()
return 0
# Enum for the calculator tool.
@@ -44,11 +77,11 @@ class MathOperation(Enum):
DIVIDE = "divide"
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
# Simple pydantic calculator tool for the agent that can add, subtract,
# multiply, and divide. Docstring and description of fields will be used in
# system prompt.
class Calculator(BaseModel):
"""
Perform a math operation on two numbers.
"""
"""Perform a math operation on two numbers."""
number_one: Union[int, float] = Field(..., description="First number.")
operation: MathOperation = Field(..., description="Math operation to perform.")
number_two: Union[int, float] = Field(..., description="Second number.")
@@ -66,55 +99,61 @@ class Calculator(BaseModel):
raise ValueError("Unknown operation.")
# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM.
# pydantic_model_list is the list of pydantic models
# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated
# outer_object_content is the name of outer object content.
# model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
def example_calculator(host):
"""Have the LLM ask to get a calculation done.
Here the grammar gets generated by passing the available function models to
generate_gbnf_grammar_and_documentation function. This also generates a
documentation usable by the LLM.
    pydantic_model_list is the list of pydantic models. outer_object_name is an
    optional name for an outer object around the actual model object, like a
    "function" object with "function_parameters" which contains the actual model
    object; if None, no outer object will be generated. outer_object_content is
    the name of the outer object content.
model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
"""
print("- example_calculator")
tools = [SendMessageToUser, Calculator]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function",
pydantic_model_list=tools, outer_object_name="function",
outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
print(gbnf_grammar)
print(documentation)
system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
user_message = "What is 42 * 42?"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
# This should output something like this:
# {
# "function": "calculator",
# "function_parameters": {
# "number_one": 42,
# "operation": "multiply",
# "number_two": 42
# }
# }
function_dictionary = json.loads(text)
if function_dictionary["function"] == "calculator":
function_parameters = {**function_dictionary["function_parameters"]}
print(Calculator(**function_parameters).run())
# This should output: 1764
user_message1 = "What is 42 * 42?"
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant"
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
expected = {
"function": "Calculator",
"function_parameters": {
"number_one": 42,
"operation": "multiply",
"number_two": 42
}
}
if json_data != expected:
print(" Result is not as expected!")
tools_map = {tool.__name__:tool for tool in tools}
# This finds "Calculator":
tool = tools_map.get(json_data["function"])
if not tool:
print(f"Error: unknown tool {json_data['function']}")
return 1
result = tool(**json_data["function_parameters"]).run()
print(f" Call {json_data['function']} gave result {result}")
return 0
# An example of structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text.
class Category(Enum):
"""
The category of the book.
"""
"""The category of the book."""
Fiction = "Fiction"
NonFiction = "Non-Fiction"
class Book(BaseModel):
"""
Represents an entry about a book.
"""
"""Represents an entry about a book."""
title: str = Field(..., description="Title of the book.")
author: str = Field(..., description="Author of the book.")
published_year: Optional[int] = Field(..., description="Publishing year of the book.")
@@ -123,33 +162,42 @@ class Book(BaseModel):
summary: str = Field(..., description="Summary of the book.")
# We need no additional parameters other than our list of pydantic models.
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book])
def example_struct(host):
"""A example structured output based on pydantic models.
The LLM will create an entry for a Book database out of an unstructured
text. We need no additional parameters other than our list of pydantic
models.
"""
print("- example_struct")
tools = [Book]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools)
system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 19611963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
# In this case, there's no function nor function_parameters.
# Here the result will vary based on the LLM used.
keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
if keys != sorted(json_data.keys()):
print(f"Unexpected result: {sorted(json_data.keys())}")
return 1
book = Book(**json_data)
print(f" As a Book object: %s" % book)
return 0
print(Book(**json_data))
# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
def get_current_datetime(output_format: Optional[str] = None):
"""
Get the current date and time in the given format.
"""Get the current date and time in the given format.
Args:
output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
"""
if output_format is None:
output_format = '%Y-%m-%d %H:%M:%S'
return datetime.datetime.now().strftime(output_format)
return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S")
# Example function to get the weather
# Example function to get the weather.
def get_current_weather(location, unit):
"""Get the current weather in a given location"""
if "London" in location:
@@ -158,11 +206,15 @@ def get_current_weather(location, unit):
return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
elif "North Pole" in location:
return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
else:
return json.dumps({"location": location, "temperature": "unknown"})
# Here is a function definition in OpenAI style
def example_concurrent(host):
"""An example for parallel function calling with a Python function, a pydantic
function model and an OpenAI like function definition.
"""
print("- example_concurrent")
# Function definition in OpenAI style.
current_weather_tool = {
"type": "function",
"function": {
@@ -181,45 +233,80 @@ current_weather_tool = {
},
},
}
# Convert OpenAI function definition into pydantic model
# Convert OpenAI function definition into pydantic model.
current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
# Add the actual function to a pydantic model
# Add the actual function to a pydantic model.
current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
# Convert normal Python function to a pydantic model
# Convert normal Python function to a pydantic model.
current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
pydantic_model_list=tool_list, outer_object_name="function",
pydantic_model_list=tools, outer_object_name="function",
outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
text = create_completion(prompt=prompt, grammar=gbnf_grammar)
text = create_completion(host, prompt, gbnf_grammar)
json_data = json.loads(text)
print(json_data)
# Should output something like this:
# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
expected = [
{
"function": "get_current_datetime",
"params": {
"output_format": "%Y-%m-%d %H:%M:%S"
}
},
{
"function": "get_current_weather",
"params": {
"location": "London",
"unit": "celsius"
}
},
{
"function": "Calculator",
"params": {
"number_one": 42,
"operation": "multiply",
"number_two": 42
}
}
]
res = 0
if json_data != expected:
print(" Result is not as expected!")
print(" This can happen on highly quantized models")
res = 1
tools_map = {tool.__name__:tool for tool in tools}
for call in json_data:
if call["function"] == "Calculator":
print(Calculator(**call["params"]).run())
elif call["function"] == "get_current_datetime":
print(current_datetime_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
elif call["function"] == "get_current_weather":
print(current_weather_tool_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
tool = tools_map.get(call["function"])
if not tool:
print(f"Error: unknown tool {call['function']}")
return 1
result = tool(**call["params"]).run()
print(f" Call {call['function']} returned {result}")
# Should output something like this:
# 2024-01-14 13:36:06
# {"location": "London", "temperature": "42", "unit": "celsius"}
# 1764
# Call get_current_datetime returned 2024-07-15 09:50:38
# Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"}
# Call Calculator returned 1764
return res
def main():
parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
parser.add_argument("--host", default="localhost:8080", help="llama.cpp server")
parser.add_argument("-v", "--verbose", action="store_true", help="enables logging")
args = parser.parse_args()
logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR)
ret = 0
# Comment out below to only run the example you want.
ret = ret or example_rce(args.host)
ret = ret or example_calculator(args.host)
ret = ret or example_struct(args.host)
ret = ret or example_concurrent(args.host)
return ret
if __name__ == "__main__":
sys.exit(main())
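The example above relies on a `create_completion(host, prompt, gbnf_grammar)` helper that is defined elsewhere in the file. As a rough orientation only, the sketch below shows one way such a helper could post a grammar-constrained request to the llama.cpp server; the function name is hypothetical, and it assumes the server at `host` exposes the `/completion` endpoint with `prompt` and `grammar` fields as described in the server README.

```python
# Minimal sketch (not the file's actual implementation) of a helper that posts
# a grammar-constrained request to the llama.cpp server /completion endpoint.
import requests


def create_completion_sketch(host, prompt, gbnf_grammar, n_predict=1024):
    """Send a prompt plus a GBNF grammar to the server and return the generated text."""
    response = requests.post(
        f"http://{host}/completion",
        json={
            "prompt": prompt,
            "grammar": gbnf_grammar,  # constrain sampling to the generated GBNF grammar
            "n_predict": n_predict,
            "stream": False,
        },
        timeout=600,
    )
    response.raise_for_status()
    return response.json()["content"]
```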

View file

@ -154,7 +154,7 @@ static void test_roundtrip_on_chunk(
}
if (use_reference) {
qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
} else {
qfns.from_float(input_scratch, quantized_scratch, chunk_size);
}

View file

@ -46,6 +46,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

View file

@ -5,7 +5,7 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/
Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
**Features:**
* LLM inference of F16 and quantum models on GPU and CPU
* LLM inference of F16 and quantized models on GPU and CPU
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
* Parallel decoding with multi-user support
* Continuous batching
@ -15,69 +15,281 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
**Command line options:**
## Usage
- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
- `-t N`, `--threads N`: Set the number of threads to use by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores).
- `-tb N, --threads-batch N`: Set the number of threads to use by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`.
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused
- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused
- `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of `4096`.
- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`
- `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512`
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
- `--numa distribute`: Spread execution evenly over all nodes
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
- `--numa`: Attempt optimizations that may help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`
- `--port`: Set the port to listen. Default: `8080`
- `--path`: Path from which to serve static files. Default: disabled
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error.
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled.
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend. Used together with group attention factor `--grp-attn-n`. Default: `512`
- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1`
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled
- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled.
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn`
- `--rope-freq-base N` : RoPE frequency base (default: loaded from model)
- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25)
- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation)
- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0)
- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0)
- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls`
- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled)
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
```
usage: ./llama-server [options]
general:
-h, --help, --usage print usage and exit
--version show version and build info
-v, --verbose print verbose information
--verbosity N set specific verbosity level (default: 0)
--verbose-prompt print a verbose prompt before generation (default: false)
--no-display-prompt don't print prompt at generation (default: false)
-co, --color colorise output to distinguish prompt and user input from generations (default: false)
-s, --seed SEED RNG seed (default: -1, use random seed for < 0)
-t, --threads N number of threads to use during generation (default: 8)
-tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)
-td, --threads-draft N number of threads to use during generation (default: same as --threads)
-tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft)
--draft N number of tokens to draft for speculative decoding (default: 5)
-ps, --p-split N speculative decoding split probability (default: 0.1)
-lcs, --lookup-cache-static FNAME
path to static lookup cache to use for lookup decoding (not updated by generation)
-lcd, --lookup-cache-dynamic FNAME
path to dynamic lookup cache to use for lookup decoding (updated by generation)
-c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model)
-n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
-b, --batch-size N logical maximum batch size (default: 2048)
-ub, --ubatch-size N physical maximum batch size (default: 512)
--keep N number of tokens to keep from the initial prompt (default: 0, -1 = all)
--chunks N max number of chunks to process (default: -1, -1 = all)
-fa, --flash-attn enable Flash Attention (default: disabled)
-p, --prompt PROMPT prompt to start generation with
in conversation mode, this will be used as system prompt
(default: '')
-f, --file FNAME a file containing the prompt (default: none)
--in-file FNAME an input file (repeat to specify multiple files)
-bf, --binary-file FNAME binary file containing the prompt (default: none)
-e, --escape process escape sequences (\n, \r, \t, \', \", \\) (default: true)
--no-escape do not process escape sequences
-ptc, --print-token-count N print token count every N tokens (default: -1)
--prompt-cache FNAME file to cache prompt state for faster startup (default: none)
--prompt-cache-all if specified, saves user input and generations to cache as well
not supported with --interactive or other interactive options
--prompt-cache-ro if specified, uses the prompt cache but does not update it
-r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode
can be specified more than once for multiple prompts
-sp, --special special tokens output enabled (default: false)
-cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix
if suffix/prefix are not specified, default chat template will be used
(default: false)
-i, --interactive run in interactive mode (default: false)
-if, --interactive-first run in interactive mode and wait for input right away (default: false)
-mli, --multiline-input allows you to write or paste multiple lines without ending each in '\'
--in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string
--in-prefix STRING string to prefix user inputs with (default: empty)
--in-suffix STRING string to suffix after user inputs with (default: empty)
--spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled)
sampling:
--samplers SAMPLERS samplers that will be used for generation in the order, separated by ';'
(default: top_k;tfs_z;typical_p;top_p;min_p;temperature)
--sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt)
--ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf)
--penalize-nl penalize newline tokens (default: false)
--temp N temperature (default: 0.8)
--top-k N top-k sampling (default: 40, 0 = disabled)
--top-p N top-p sampling (default: 0.9, 1.0 = disabled)
--min-p N min-p sampling (default: 0.1, 0.0 = disabled)
--tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled)
--typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled)
--repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size)
--repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
--presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled)
--frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled)
--dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled)
--dynatemp-exp N dynamic temperature exponent (default: 1.0)
--mirostat N use Mirostat sampling.
Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
--mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1)
--mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0)
-l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
--cfg-negative-prompt PROMPT
negative prompt to use for guidance (default: '')
--cfg-negative-prompt-file FNAME
negative prompt file to use for guidance
--cfg-scale N strength of guidance (default: 1.0, 1.0 = disable)
--chat-template JINJA_TEMPLATE
set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
grammar:
--grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '')
--grammar-file FNAME file to read grammar from
-j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead
embedding:
--pooling {none,mean,cls,last}
pooling type for embeddings, use model default if unspecified
--attention {causal,non-causal}
attention type for embeddings, use model default if unspecified
context hacking:
--rope-scaling {none,linear,yarn}
RoPE frequency scaling method, defaults to linear unless specified by the model
--rope-scale N RoPE context scaling factor, expands context by a factor of N
--rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
--rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N
--yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)
--yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
--yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
--yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0)
--yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0)
-gan, --grp-attn-n N group-attention factor (default: 1)
-gaw, --grp-attn-w N group-attention width (default: 512.0)
-dkvc, --dump-kv-cache verbose print of the KV cache
-nkvo, --no-kv-offload disable KV offload
-ctk, --cache-type-k TYPE KV cache data type for K (default: f16)
-ctv, --cache-type-v TYPE KV cache data type for V (default: f16)
perplexity:
--all-logits return logits for all tokens in the batch (default: false)
--hellaswag compute HellaSwag score over random tasks from datafile supplied with -f
--hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400)
--winogrande compute Winogrande score over random tasks from datafile supplied with -f
--winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0)
--multiple-choice compute multiple choice score over random tasks from datafile supplied with -f
--multiple-choice-tasks N
number of tasks to use when computing the multiple choice score (default: 0)
--kl-divergence computes KL-divergence to logits provided via --kl-divergence-base
--ppl-stride N stride for perplexity calculation (default: 0)
--ppl-output-type {0,1} output type for perplexity calculation (default: 0)
parallel:
-dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
-np, --parallel N number of parallel sequences to decode (default: 1)
-ns, --sequences N number of sequences to decode (default: 1)
-cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)
multi-modality:
--mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md
--image FILE path to an image file. use with multimodal models. Specify multiple times for batching
backend:
--rpc SERVERS comma separated list of RPC servers
--mlock force system to keep model in RAM rather than swapping or compressing
--no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)
--numa TYPE attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437
model:
--check-tensors check model tensor data for invalid values (default: false)
--override-kv KEY=TYPE:VALUE
advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false
--lora FNAME apply LoRA adapter (implies --no-mmap)
--lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)
--lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter
--control-vector FNAME add a control vector
note: this argument can be repeated to add multiple control vectors
--control-vector-scaled FNAME SCALE
add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors
--control-vector-layer-range START END
layer range to apply the control vector(s) to, start and end inclusive
-m, --model FNAME model path (default: models/$filename with filename from --hf-file
or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
-md, --model-draft FNAME draft model for speculative decoding (default: unused)
-mu, --model-url MODEL_URL model download url (default: unused)
-hfr, --hf-repo REPO Hugging Face model repository (default: unused)
-hff, --hf-file FILE Hugging Face model file (default: unused)
-hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable)
retrieval:
--context-file FNAME file to load context from (repeat to specify multiple files)
--chunk-size N minimum length of embedded text chunks (default: 64)
--chunk-separator STRING
separator between chunks (default: '\n')
passkey:
--junk N number of times to repeat the junk text (default: 250)
--pos N position of the passkey in the junk text (default: -1)
imatrix:
-o, --output FNAME output file (default: 'imatrix.dat')
--output-frequency N output the imatrix every N iterations (default: 10)
--save-frequency N save an imatrix copy every N iterations (default: 0)
--process-output collect data for the output tensor (default: false)
--no-ppl do not compute perplexity (default: true)
--chunk N start processing the input from chunk N (default: 0)
bench:
-pps is the prompt shared across parallel sequences (default: false)
-npp n0,n1,... number of prompt tokens
-ntg n0,n1,... number of text generation tokens
-npl n0,n1,... number of parallel prompts
embedding:
--embd-normalize normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
--embd-output-format empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
--embd-separator separator of embeddings (default \n) for example "<#sep#>"
server:
--host HOST ip address to listen (default: 127.0.0.1)
--port PORT port to listen (default: 8080)
--path PATH path to serve static files from (default: )
--embedding(s) enable embedding endpoint (default: disabled)
--api-key KEY API key to use for authentication (default: none)
--api-key-file FNAME path to file containing API keys (default: none)
--ssl-key-file FNAME path to a file containing a PEM-encoded SSL private key
--ssl-cert-file FNAME path to a file containing a PEM-encoded SSL certificate
--timeout N server read/write timeout in seconds (default: 600)
--threads-http N number of threads used to process HTTP requests (default: -1)
--system-prompt-file FNAME
set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications
--log-format {text,json}
log output format: json or text (default: json)
--metrics enable prometheus compatible metrics endpoint (default: disabled)
--no-slots disables slots monitoring endpoint (default: enabled)
--slot-save-path PATH path to save slot kv cache (default: disabled)
--chat-template JINJA_TEMPLATE
set custom jinja chat template (default: template taken from model's metadata)
only commonly used templates are accepted:
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
-sps, --slot-prompt-similarity SIMILARITY
how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
logging:
--simple-io use basic IO for better compatibility in subprocesses and limited consoles
-ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset)
--log-test Run simple logging test
--log-disable Disable trace logs
--log-enable Enable trace logs
--log-file FNAME Specify a log filename (without extension)
--log-new Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log"
--log-append Don't truncate the old log file.
cvector:
-o, --output FNAME output file (default: 'control_vector.gguf')
--positive-file FNAME positive prompts file, one prompt per line (default: 'examples/cvector-generator/positive.txt')
--negative-file FNAME negative prompts file, one prompt per line (default: 'examples/cvector-generator/negative.txt')
--pca-batch N batch size used for PCA. Larger batch runs faster, but uses more memory (default: 100)
--pca-iter N number of iterations used for PCA (default: 1000)
--method {pca,mean} dimensionality reduction method to be used (default: pca)
```
**If compiled with `LLAMA_SERVER_SSL=ON`**
- `--ssl-key-file FNAME`: path to a file containing a PEM-encoded SSL private key
- `--ssl-cert-file FNAME`: path to a file containing a PEM-encoded SSL certificate
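When the server is started with `--api-key` (or `--api-key-file`), every request must carry the key as a Bearer token. The following sketch shows an authenticated request; it assumes a server on `localhost:8080` and uses `YOUR-KEY` as a placeholder for the key you passed on the command line.

```python
# Sketch of an authenticated request against a server started with --api-key.
import requests

resp = requests.post(
    "http://localhost:8080/completion",
    headers={"Authorization": "Bearer YOUR-KEY"},  # key must match --api-key / --api-key-file
    json={"prompt": "Hello", "n_predict": 16},
)
resp.raise_for_status()
print(resp.json()["content"])
```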
## Build
@ -232,7 +444,7 @@ node index.js
`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
`stream`: Allows receiving each predicted token in real time instead of waiting for the completion to finish. To enable this, set to `true`.
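As a rough illustration of how these fields combine, the sketch below sends a streaming `/completion` request; it assumes a local server and that each streamed line is a `data: {...}` chunk whose JSON carries `content` and a `stop` flag.

```python
# Sketch of a streaming /completion request using n_predict, n_keep and stream.
import json
import requests

body = {
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 64,   # generate at most 64 tokens
    "n_keep": -1,      # keep the entire prompt if the context overflows
    "stream": True,
}
with requests.post("http://localhost:8080/completion", json=body, stream=True) as r:
    for line in r.iter_lines():
        if not line.startswith(b"data: "):
            continue
        chunk = json.loads(line[len(b"data: "):])
        print(chunk["content"], end="", flush=True)
        if chunk.get("stop"):
            break
```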

View file

@ -21,7 +21,7 @@ let generation_settings = null;
//
export async function* llama(prompt, params = {}, config = {}) {
let controller = config.controller;
const api_url = config.api_url || "";
const api_url = config.api_url?.replace(/\/+$/, '') || "";
if (!controller) {
controller = new AbortController();
@ -196,7 +196,7 @@ export const llamaComplete = async (params, controller, callback) => {
// Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async (config = {}) => {
if (!generation_settings) {
const api_url = config.api_url || "";
const api_url = config.api_url?.replace(/\/+$/, '') || "";
const props = await fetch(`${api_url}/props`).then(r => r.json());
generation_settings = props.default_generation_settings;
}
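For reference, the same `/props` lookup that `llamaModelInfo` performs can be reproduced from any HTTP client; the sketch below assumes a server on `localhost:8080` and only relies on the `default_generation_settings` field read by the code above.

```python
# Sketch: fetch server properties the same way llamaModelInfo does.
import requests

props = requests.get("http://localhost:8080/props").json()
print(props["default_generation_settings"])  # context size, default sampling params, ...
```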

View file

@ -14,10 +14,10 @@
<script type="module">
import {
html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
} from '/index.js';
} from './index.js';
import { llama } from '/completion.js';
import { SchemaConverter } from '/json-schema-to-grammar.mjs';
import { llama } from './completion.js';
import { SchemaConverter } from './json-schema-to-grammar.mjs';
import { promptFormats } from './prompt-formats.js';
import { systemPrompts } from './system-prompts.js'; // multilingual is wip
let selected_image = false;
@ -225,7 +225,7 @@
throw new Error("already running");
}
controller.value = new AbortController();
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
const data = chunk.data;
if (data.stop) {
while (

View file

@ -1,5 +1,4 @@
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
@ -132,12 +131,20 @@
align-items: stretch;
}
.right {
.message-controls {
display: flex;
flex-direction: row;
gap: 0.5em;
justify-content: flex-end;
}
.message-controls > div:nth-child(2) {
display: flex;
flex-direction: column;
gap: 0.5em;
}
.message-controls > div:nth-child(2) > div {
display: flex;
margin-left: auto;
gap: 0.5em;
}
fieldset {
border: none;
@ -276,6 +283,7 @@
import { llama } from './completion.js';
import { SchemaConverter } from './json-schema-to-grammar.mjs';
let selected_image = false;
var slot_id = -1;
@ -447,6 +455,9 @@
/* END: Support for storing prompt templates and parameters in browsers LocalStorage */
const tts = window.speechSynthesis;
const ttsVoice = signal(null)
const llamaStats = signal(null)
const controller = signal(null)
@ -479,7 +490,7 @@
throw new Error("already running");
}
controller.value = new AbortController();
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
const data = chunk.data;
if (data.stop) {
@ -596,8 +607,51 @@
});
}
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
const talkRecognition = SpeechRecognition ? new SpeechRecognition() : null;
function MessageInput() {
const message = useSignal("")
const message = useSignal("");
const talkActive = useSignal(false);
const sendOnTalk = useSignal(false);
const talkStop = (e) => {
if (e) e.preventDefault();
talkActive.value = false;
talkRecognition?.stop();
}
const talk = (e) => {
e.preventDefault();
if (talkRecognition)
talkRecognition.start();
else
alert("Speech recognition is not supported by this browser.");
}
if(talkRecognition) {
talkRecognition.onstart = () => {
talkActive.value = true;
}
talkRecognition.onresult = (e) => {
if (e.results.length > 0) {
message.value = e.results[0][0].transcript;
if (sendOnTalk.value) {
submit(e);
}
}
}
talkRecognition.onspeechend = () => {
talkStop();
}
}
const ttsVoices = useSignal(tts?.getVoices() || []);
const ttsVoiceDefault = computed(() => ttsVoices.value.find(v => v.default));
if (tts) {
tts.onvoiceschanged = () => {
ttsVoices.value = tts.getVoices();
}
}
const submit = (e) => {
stop(e);
@ -624,12 +678,46 @@
value="${message}"
/>
</div>
<div class="right">
<button type="submit" disabled=${generating.value}>Send</button>
<button onclick=${uploadImage}>Upload Image</button>
<div class="message-controls">
<div> </div>
<div>
<div>
<button type="submit" disabled=${generating.value || talkActive.value}>Send</button>
<button disabled=${generating.value || talkActive.value} onclick=${uploadImage}>Upload Image</button>
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
<button onclick=${reset}>Reset</button>
</div>
<div>
<a href="#" style="cursor: help;" title="Help" onclick=${e => {
e.preventDefault();
alert(`STT supported by your browser: ${SpeechRecognition ? 'Yes' : 'No'}\n` +
`(TTS and speech recognition are not provided by llama.cpp)\n` +
`Note: STT requires HTTPS to work.`);
}}>[?]</a>
<button disabled=${generating.value} onclick=${talkActive.value ? talkStop : talk}>${talkActive.value ? "Stop Talking" : "Talk"}</button>
<div>
<input type="checkbox" id="send-on-talk" name="send-on-talk" checked="${sendOnTalk}" onchange=${(e) => sendOnTalk.value = e.target.checked} />
<label for="send-on-talk" style="line-height: initial;">Send after talking</label>
</div>
</div>
<div>
<a href="#" style="cursor: help;" title="Help" onclick=${e => {
e.preventDefault();
alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`);
}}>[?]</a>
<label for="tts-voices" style="line-height: initial;">Bot Voice:</label>
<select id="tts-voices" name="tts-voices" onchange=${(e) => ttsVoice.value = e.target.value} style="max-width: 100px;">
<option value="" selected="${!ttsVoice.value}">None</option>
${[
...(ttsVoiceDefault.value ? [ttsVoiceDefault.value] : []),
...ttsVoices.value.filter(v => !v.default),
].map(
v => html`<option value="${v.name}" selected="${ttsVoice.value === v.name}">${v.name} (${v.lang}) ${v.default ? '(default)' : ''}</option>`
)}
</select>
</div>
</div>
</div>
</form>
`
}
@ -659,26 +747,86 @@
}
}, [messages])
const ttsChatLineActiveIx = useSignal(undefined);
const ttsChatLine = (e, ix, msg) => {
if (e) e.preventDefault();
if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return;
const ttsVoices = tts.getVoices();
const voice = ttsVoices.find(v => v.name === ttsVoice.value);
if (!voice) return;
if (ttsChatLineActiveIx.value !== undefined) {
tts.cancel();
if (ttsChatLineActiveIx.value === ix) {
ttsChatLineActiveIx.value = undefined;
return;
}
}
ttsChatLineActiveIx.value = ix;
let ttsUtter = new SpeechSynthesisUtterance(msg);
ttsUtter.voice = voice;
ttsUtter.onend = e => {
ttsChatLineActiveIx.value = undefined;
};
tts.speak(ttsUtter);
}
const isCompletionMode = session.value.type === 'completion'
// Try to play the last bot message
const lastCharChatLinesIxs = useSignal([]);
const lastCharChatLinesIxsOld = useSignal([]);
useEffect(() => {
if (
!isCompletionMode
&& lastCharChatLinesIxs.value.length !== lastCharChatLinesIxsOld.value.length
&& !generating.value
) {
const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1];
if (ix !== undefined) {
const msg = messages[ix];
ttsChatLine(null, ix, Array.isArray(msg) ? msg[1].map(m => m.content).join('') : msg);
}
lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value);
}
}, [generating.value]);
const chatLine = ([user, data], index) => {
let message
const isArrayMessage = Array.isArray(data)
if (params.value.n_probs > 0 && isArrayMessage) {
message = html`<${Probabilities} data=${data} />`
} else {
const isArrayMessage = Array.isArray(data);
const text = isArrayMessage ?
data.map(msg => msg.content).join('') :
data;
if (params.value.n_probs > 0 && isArrayMessage) {
message = html`<${Probabilities} data=${data} />`
} else {
message = isCompletionMode ?
text :
html`<${Markdownish} text=${template(text)} />`
}
const fromBot = user && user === '{{char}}';
if (fromBot && !lastCharChatLinesIxs.value.includes(index))
lastCharChatLinesIxs.value.push(index);
if (user) {
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
return html`
<div>
<p key=${index}><strong>${template(user)}:</strong> ${message}</p>
${
fromBot && ttsVoice.value
&& html`<button disabled=${generating.value} onclick=${e => ttsChatLine(e, index, text)} aria-label=${ttsChatLineActiveIx.value === index ? 'Pause' : 'Play'}>${ ttsChatLineActiveIx.value === index ? '⏸️' : '▶️' }</button>`
}
</div>
`;
} else {
return isCompletionMode ?
html`<span key=${index}>${message}</span>` :
html`<p key=${index}>${message}</p>`
html`<div><p key=${index}>${message}</p></div>`
}
};

View file

@ -737,6 +737,8 @@ struct server_context {
slot.ga_n = ga_n;
slot.ga_w = ga_w;
slot.sparams = params.sparams;
slot.reset();
slots.push_back(slot);
@ -884,7 +886,8 @@ struct server_context {
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
slot_params default_params;
llama_sampling_params default_sparams;
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
llama_sampling_params default_sparams = params.sparams;
auto & data = task.data;
if (data.count("__oaicompat") != 0) {
@ -1179,7 +1182,7 @@ struct server_context {
bool process_token(completion_token_output & result, server_slot & slot) {
// remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
slot.sampled = result.tok;
// search stop word and delete it
@ -2002,6 +2005,11 @@ struct server_context {
int32_t n_batch = llama_n_batch(ctx);
int32_t n_ubatch = llama_n_ubatch(ctx);
// track if this is an embedding or non-embedding batch
// if we've added sampled tokens above, we are in non-embedding mode
// -1: none, 0: non-embedding, 1: embedding
int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
// next, batch any pending prompts without exceeding n_batch
if (params.cont_batching || batch.n_tokens == 0) {
for (auto & slot : slots) {
@ -2172,6 +2180,14 @@ struct server_context {
}
}
// check that we are in the right batch_type, if not defer the slot
bool slot_type = slot.embedding ? 1 : 0;
if (batch_type == -1) {
batch_type = slot_type;
} else if (batch_type != slot_type) {
continue;
}
// keep only the common part
int p0 = (int) system_tokens.size() + slot.n_past;
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
@ -2273,6 +2289,9 @@ struct server_context {
{"n_tokens", batch.n_tokens},
});
// make sure we're in the right embedding mode
llama_set_embeddings(ctx, batch_type == 1);
// process the created batch of tokens
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
@ -2987,6 +3006,11 @@ int main(int argc, char ** argv) {
};
const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = json::parse(req.body);
@ -3082,6 +3106,11 @@ int main(int argc, char ** argv) {
};
const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params.embedding) {
res_error(res, format_error_response("This server does not support chat completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
@ -3154,6 +3183,11 @@ int main(int argc, char ** argv) {
};
const auto handle_infill = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params.embedding) {
res_error(res, format_error_response("This server does not support infill. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = json::parse(req.body);
@ -3240,13 +3274,8 @@ int main(int argc, char ** argv) {
return res.set_content(data.dump(), "application/json; charset=utf-8");
};
const auto handle_embeddings = [&params, &ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
if (!params.embedding) {
res.status = 501;
res.set_content("This server does not support embeddings. Start it with `--embeddings`", "text/plain; charset=utf-8");
return;
}
const json body = json::parse(req.body);
bool is_openai = false;

View file

@ -122,8 +122,26 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
for (size_t i = 0; i < messages.size(); ++i) {
const auto & curr_msg = messages[i];
std::string role = json_value(curr_msg, "role", std::string(""));
std::string content = json_value(curr_msg, "content", std::string(""));
std::string content;
if (curr_msg.contains("content")) {
if (curr_msg["content"].is_string()) {
content = curr_msg["content"].get<std::string>();
} else if (curr_msg["content"].is_array()) {
for (const auto & part : curr_msg["content"]) {
if (part.contains("text")) {
content += "\n" + part["text"].get<std::string>();
}
}
} else {
throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
}
} else {
throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
}
chat.push_back({role, content});
}

View file

@ -29,6 +29,7 @@ static void print_usage_information(const char * argv0, FILE * stream) {
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
fprintf(stream, " --stdin read prompt from standard input.\n");
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
fprintf(stream, " --no-parse-special do not parse control tokens.\n");
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
fprintf(stream, " --show-count print the total number of tokens.\n");
}
@ -195,6 +196,7 @@ int main(int raw_argc, char ** raw_argv) {
// variables where to put any arguments we see.
bool printing_ids = false;
bool no_bos = false;
bool no_parse_special = false;
bool disable_logging = false;
bool show_token_count = false;
const char * model_path = NULL;
@ -229,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) {
else if (arg == "--no-bos") {
no_bos = true;
}
else if (arg == "--no-parse-special") {
no_parse_special = true;
}
else if (arg == "-p" || arg == "--prompt") {
if (prompt_set) {
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
@ -359,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) {
const bool model_wants_add_bos = llama_should_add_bos_token(model);
const bool add_bos = model_wants_add_bos && !no_bos;
const bool parse_special = !no_parse_special;
std::vector<llama_token> tokens;
tokens = ::llama_tokenize(model, prompt, add_bos, true);
tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
if (printing_ids) {
printf("[");

View file

@ -1,5 +0,0 @@
set(TARGET llama-train-text-from-scratch)
add_executable(${TARGET} train-text-from-scratch.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,27 +0,0 @@
# train-text-from-scratch
Basic usage instructions:
```bash
# get training data
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
# train
./bin/llama-train-text-from-scratch \
--vocab-model ../models/ggml-vocab-llama.gguf \
--ctx 64 --embd 256 --head 8 --layer 16 \
--checkpoint-in chk-shakespeare-256x16-LATEST.gguf \
--checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \
--model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \
--train-data "shakespeare.txt" \
-t 6 -b 16 --seed 1 --adam-iter 256 \
--no-checkpointing
# predict
./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf
```
Output files will be saved every N iterations (config with `--save-every N`).
The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output.
To train GGUF models just pass them to `--checkpoint-in FN`.

View file

@ -1,499 +0,0 @@
#!/usr/bin/env python3
# train-text-from-scratch checkpoint --> gguf conversion
import argparse
import os
import struct
import sys
import numpy as np
from pathlib import Path
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py'))
import gguf
# gguf constants
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
LLM_KV_TRAINING_TYPE = "training.type"
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
class Tensor:
def __init__(self, dtype='f', ne=None):
if ne is None:
ne = []
self.dtype = dtype
self.ne = ne
self.nbytes = 0
if self.dtype == 'f':
if len(self.ne) == 0:
self.nbytes = 0
else:
self.nbytes = int(np.prod(self.ne)) * 4
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
def load(self, data, offset):
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
assert(nd == len(self.ne))
ne = []
for d in range(nd):
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
ne.append(n)
assert(tuple(ne) == tuple(self.ne))
if self.dtype == 'f':
assert(dtype == 0)
else:
raise ValueError(f"Unhandled data type '{self.dtype}'")
self.name = bytes(data[offset:offset+namelen]); offset += namelen
# 32-byte alignment
offset += (0 - offset) & 31
self.data = data[offset:offset+self.nbytes]
offset += self.nbytes
return offset
def max_storage_size(self):
result = 0
result += 4 # nd
result += 4 # namelen
result += 4 # dtype
result += len(self.ne)*8 # ne
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
result += 31 # 32-byte alignment
result += self.nbytes
return result
def save_gguf(self, gguf_writer, name):
gguf_writer.add_tensor(
name=name,
tensor=self.data,
raw_shape=np.array(list(reversed(self.ne))),
raw_dtype=gguf.GGMLQuantizationType.F32)
class OptimizationParamsV0:
def __init__(self):
pass
def load(self, data, offset):
self.type = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_threads = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.delta = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.print_forward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
self.adam_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_sched = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_decay = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_alpha = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_beta1 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_beta2 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_eps_f = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_eps_g = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_ftol = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_wolfe = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_min_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_max_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_linesearch = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
class OptimizationContext:
def __init__(self):
pass
def load(self, data, offset):
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
offset += 4
if self.version == 0:
params = OptimizationParamsV0()
offset = params.load(data, offset)
self.past = params.past
self.lbfgs_m = params.lbfgs_m
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
self.type = params.type
self.adam_m = Tensor('f', [self.nx])
self.adam_v = Tensor('f', [self.nx])
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_x = Tensor('f', [self.nx])
self.lbfgs_xp = Tensor('f', [self.nx])
self.lbfgs_g = Tensor('f', [self.nx])
self.lbfgs_gp = Tensor('f', [self.nx])
self.lbfgs_d = Tensor('f', [self.nx])
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
if self.type == 0:
# these tensors are stored, but we don't need their data
x = Tensor('f', [self.nx])
g = Tensor('f', [self.nx])
g2 = Tensor('f', [self.nx])
mh = Tensor('f', [self.nx])
vh = Tensor('f', [self.nx])
offset = x.load(data, offset)
offset = g.load(data, offset)
offset = g2.load(data, offset)
offset = self.adam_m.load(data, offset)
offset = self.adam_v.load(data, offset)
offset = mh.load(data, offset)
offset = vh.load(data, offset)
offset = self.adam_pf.load(data, offset)
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
elif self.type == 1:
offset = self.lbfgs_x.load(data, offset)
offset = self.lbfgs_xp.load(data, offset)
offset = self.lbfgs_g.load(data, offset)
offset = self.lbfgs_gp.load(data, offset)
offset = self.lbfgs_d.load(data, offset)
offset = self.lbfgs_pf.load(data, offset)
offset = self.lbfgs_lmal.load(data, offset)
offset = self.lbfgs_lmys.load(data, offset)
offset = self.lbfgs_lms.load(data, offset)
offset = self.lbfgs_lmy.load(data, offset)
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
else:
raise ValueError('Unknown optimizer type')
elif self.version == 1:
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
self.adam_m = Tensor('f', [self.nx])
self.adam_v = Tensor('f', [self.nx])
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_x = Tensor('f', [self.nx])
self.lbfgs_xp = Tensor('f', [self.nx])
self.lbfgs_g = Tensor('f', [self.nx])
self.lbfgs_gp = Tensor('f', [self.nx])
self.lbfgs_d = Tensor('f', [self.nx])
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
# forgot to save type in version 1:
# guess self.type from number of remaining bytes
size_type_0 = 12 + sum([t.max_storage_size() for t in
[self.adam_m, self.adam_v]
+([self.adam_pf] if (self.past > 0) else [])])
size_type_1 = 24 + sum([t.max_storage_size() for t in
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
self.lbfgs_lmal, self.lbfgs_lmys,
self.lbfgs_lms, self.lbfgs_lmy]
+([self.lbfgs_pf] if (self.past > 0) else [])])
# due to alignment padding the size might not be exact
# but the difference in size for both types is significant,
# so we can just use whichever is closest
remaining = len(data) - offset
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
self.type = 0
else:
self.type = 1
if self.type == 0:
offset = self.adam_m.load(data, offset)
offset = self.adam_v.load(data, offset)
offset = self.adam_pf.load(data,offset)
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
elif self.type == 1:
offset = self.lbfgs_x.load(data, offset)
offset = self.lbfgs_xp.load(data, offset)
offset = self.lbfgs_g.load(data, offset)
offset = self.lbfgs_gp.load(data, offset)
offset = self.lbfgs_d.load(data, offset)
offset = self.lbfgs_pf.load(data, offset)
offset = self.lbfgs_lmal.load(data, offset)
offset = self.lbfgs_lmys.load(data, offset)
offset = self.lbfgs_lms.load(data, offset)
offset = self.lbfgs_lmy.load(data, offset)
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
else:
raise ValueError('Invalid version of checkpoint file')
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
if self.type == 0:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
if self.past > 0:
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
elif self.type == 1:
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
if self.past > 0:
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
else:
raise ValueError('Unknown optimizer type')
class ModelParams:
def __init__(self):
pass
def load(self, data, offset):
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
return offset
def get_n_ff(self):
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
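# worked example with hypothetical values (not from a real checkpoint):
# n_embd=4096, n_mult=256 -> 2*(4*4096)//3 = 10922, rounded up to a
# multiple of n_mult: ((10922 + 256 - 1)//256)*256 = 11008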
def save_gguf(self, gguf_writer):
# self.n_vocab not saved
gguf_writer.add_embedding_length(self.n_embd)
gguf_writer.add_head_count(self.n_head)
gguf_writer.add_block_count(self.n_layer)
gguf_writer.add_rope_dimension_count(self.n_rot)
gguf_writer.add_feed_forward_length(self.get_n_ff())
def tensor_name(key, bid=None):
return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
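# e.g. tensor_name(gguf.MODEL_TENSOR.ATTN_Q, 0) is expected to give "blk.0.attn_q.weight"
# (assuming the standard gguf-py TENSOR_NAMES mapping)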
class Layer:
def __init__(self, params, bid):
self.bid = bid
self.att_norm = Tensor('f', [params.n_embd])
self.wq = Tensor('f', [params.n_embd, params.n_embd])
self.wk = Tensor('f', [params.n_embd, params.n_embd])
self.wv = Tensor('f', [params.n_embd, params.n_embd])
self.wo = Tensor('f', [params.n_embd, params.n_embd])
self.ffn_norm = Tensor('f', [params.n_embd])
self.w1 = Tensor('f', [params.n_embd, params.get_n_ff()])
self.w2 = Tensor('f', [params.get_n_ff(), params.n_embd])
self.w3 = Tensor('f', [params.n_embd, params.get_n_ff()])
def load(self, data, offset):
offset = self.att_norm.load(data, offset)
offset = self.wq.load(data, offset)
offset = self.wk.load(data, offset)
offset = self.wv.load(data, offset)
offset = self.wo.load(data, offset)
offset = self.ffn_norm.load(data, offset)
offset = self.w1.load(data, offset)
offset = self.w2.load(data, offset)
offset = self.w3.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
self.wq.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid))
self.wk.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid))
self.wv.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid))
self.wo.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid))
self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid))
self.w1.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid))
self.w2.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid))
self.w3.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid))
class Model:
def __init__(self):
self.params = ModelParams()
self.layers = []
def load(self, data, offset):
offset = self.params.load(data, offset)
self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
self.norm = Tensor('f', [self.params.n_embd])
self.output = Tensor('f', [self.params.n_embd, self.params.n_vocab])
offset = self.tok_embd.load(data, offset)
offset = self.norm.load(data, offset)
offset = self.output.load(data, offset)
self.layers.clear()
for bid in range(self.params.n_layer):
layer = Layer(self.params, bid)
offset = layer.load(data, offset)
self.layers.append(layer)
return offset
def save_gguf(self, gguf_writer):
self.params.save_gguf(gguf_writer)
self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
self.norm.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
self.output.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
for layer in self.layers:
layer.save_gguf(gguf_writer)
class Checkpoint:
def __init__(self):
self.model = Model()
self.opt_ctx = OptimizationContext()
def load(self, data, offset):
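# the magic is written as a little-endian uint32, so the raw bytes appear reversed on disk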
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
if magic != b'ggcp':
raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
if self.version != 0:
raise ValueError('Invalid version of checkpoint file')
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
offset = self.model.load(data, offset)
offset = self.opt_ctx.load(data, offset)
return offset
def save_gguf(self, gguf_writer):
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
gguf_writer.add_layer_norm_rms_eps(1e-5)
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
self.model.save_gguf(gguf_writer)
self.opt_ctx.save_gguf(gguf_writer)
def handle_args():
parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
parser.add_argument('--input', '-i', type = Path, help = 'Input train checkpoint filename', required=True)
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename', required=True)
return parser.parse_args()
def main():
cfg = handle_args()
data = np.memmap(cfg.input, mode = 'r')
chk = Checkpoint()
offset = 0
offset = chk.load(data, offset)
# we should have read all available data
assert(offset == len(data))
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
chk.save_gguf(gguf_writer)
print(" gguf: write header")
gguf_writer.write_header_to_file()
print(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
if __name__ == '__main__':
main()
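# Example invocation (hypothetical script and file names; the script converts a
# train-text-from-scratch checkpoint into a GGUF file):
#   python convert_train_checkpoint_to_gguf.py --input checkpoint.bin --output checkpoint.gguf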

File diff suppressed because it is too large

flake.lock generated
View file

@ -5,11 +5,11 @@
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1717285511,
"narHash": "sha256-iKzJcpdXih14qYVcZ9QC9XuZYnPc6T8YImb6dX166kw=",
"lastModified": 1719994518,
"narHash": "sha256-pQMhCCHyQGRzdfAkdJ4cIWiw+JNuWsTX7f0ZYSyz0VY=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "2a55567fcf15b1b1c7ed712a2c6fadaec7412ea8",
"rev": "9227223f6d922fee3c7b190b2cc238a99527bbb7",
"type": "github"
},
"original": {
@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1719506693,
"narHash": "sha256-C8e9S7RzshSdHB7L+v9I51af1gDM5unhJ2xO1ywxNH8=",
"lastModified": 1721379653,
"narHash": "sha256-8MUgifkJ7lkZs3u99UDZMB4kbOxvMEXQZ31FO3SopZ0=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "b2852eb9365c6de48ffb0dc2c9562591f652242a",
"rev": "1d9c2c9b3e71b9ee663d11c5d298727dace8d374",
"type": "github"
},
"original": {
@ -36,14 +36,14 @@
},
"nixpkgs-lib": {
"locked": {
"lastModified": 1717284937,
"narHash": "sha256-lIbdfCsf8LMFloheeE6N31+BMIeixqyQWbSr2vk79EQ=",
"lastModified": 1719876945,
"narHash": "sha256-Fm2rDDs86sHy0/1jxTOKB1118Q0O3Uc7EC0iXvXKpbI=",
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz"
"url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
},
"original": {
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz"
"url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
}
},
"root": {

ggml/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
src/ggml-vulkan-shaders.hpp
src/ggml-vulkan-shaders.cpp

View file

@ -104,7 +104,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
"ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF)
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF)
option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
@ -194,13 +194,19 @@ endif ()
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
# all public headers
set(GGML_PUBLIC_HEADERS
include/ggml.h
include/ggml-alloc.h
include/ggml-backend.h
"${GGML_HEADERS_CUDA}"
"${GGML_HEADERS_METAL}"
"${GGML_HEADERS_EXTRA}")
include/ggml-blas.h
include/ggml-cuda.h
include/ggml.h
include/ggml-kompute.h
include/ggml-metal.h
include/ggml-rpc.h
include/ggml-sycl.h
include/ggml-vulkan.h)
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL)

View file

@ -1,220 +0,0 @@
#!/usr/bin/env python
import logging
import argparse
import asyncio
import os
from tempfile import gettempdir
logger = logging.getLogger("ggml-vk-generate-shaders")
GLSLC = "glslc"
type_names = [
"f32",
"f16",
"q4_0",
"q4_1",
"q5_0",
"q5_1",
"q8_0",
"q2_k",
"q3_k",
"q4_k",
"q5_k",
"q6_k",
]
ASYNCIO_CONCURRENCY = 64
input_dir = "vulkan-shaders"
output_dir = gettempdir()
lock = asyncio.Lock()
shader_fnames = []
async def string_to_spv(name, in_fname, defines, fp16=True):
name = f"{name}{'_fp32' if not fp16 else ''}"
out_fname = os.path.join(output_dir, f"{name}.spv")
in_path = os.path.join(input_dir, in_fname)
cmd = [GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname]
cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
stdout, stderr = await proc.communicate()
stdout = stdout.decode()
error = stderr.decode()
if proc.returncode:
cmd = " ".join(cmd)
logger.error(f"cannot compile {name}\n\n{cmd}\n\n{error}")
return
async with lock:
shader_fnames.append((name, out_fname))
def matmul_shaders(tasks, fp16, matmul_id):
if fp16:
load_vec = "8"
aligned_b_type_f32 = "mat2x4"
aligned_b_type_f16 = "f16mat2x4"
else:
load_vec = "4"
aligned_b_type_f32 = "vec4"
aligned_b_type_f16 = "f16vec4"
base_dict = {"FLOAT_TYPE": "float" if not fp16 else "float16_t"}
shader_name = "matmul"
if matmul_id:
base_dict["MUL_MAT_ID"] = "1"
shader_name = "matmul_id"
if fp16:
base_dict["FLOAT16"] = "1"
# Shaders with f16 B_TYPE
tasks.append(string_to_spv(f"{shader_name}_f32_f16", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_f32_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_f16", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16))
for tname in type_names:
data_a_key = f"DATA_A_{tname.upper()}"
load_vec_a = load_vec if tname in ("f32", "f16") else "2"
tasks.append(string_to_spv(f"{shader_name}_{tname}_f32", "mul_mm.comp", base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_{tname}_f32_aligned", "mul_mm.comp", base_dict | {data_a_key: "2", "LOAD_VEC_A": load_vec_a, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f32, "D_TYPE": "float"}, fp16))
async def main():
logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V")
tasks = []
base_dict = {"FLOAT_TYPE": "float"}
for fp16 in (False, True):
# MUL_MAT
matmul_shaders(tasks, fp16, False)
# MUL_MAT_ID
matmul_shaders(tasks, fp16, True)
for tname in type_names:
# mul mat vec
data_a_key = f"DATA_A_{tname.upper()}"
shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp"
tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f32_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f16_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float16_t", "D_TYPE": "float"}))
tasks.append(string_to_spv(f"mul_mat_vec_id_{tname}_f32", shader, base_dict | {"MUL_MAT_ID": "1", data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))
# Dequant shaders
if tname != "f16":
tasks.append(string_to_spv(f"dequant_{tname}", f"dequant_{tname}.comp", base_dict | {data_a_key: "1", "D_TYPE": "float16_t"}))
# get_rows
if not tname.endswith("_k"):
shader = "get_rows.comp" if tname in ("f32", "f16") else "get_rows_quant.comp"
if tname == "f16":
tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
else:
tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv(f"get_rows_{tname}_f32", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float"}))
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
# Norms
tasks.append(string_to_spv("norm_f32", "norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rms_norm_f32", "rms_norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("cpy_f32_f32", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("cpy_f32_f16", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("cpy_f16_f16", "copy.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
tasks.append(string_to_spv("add_f32", "add.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}))
tasks.append(string_to_spv("mul_f32", "mul.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("div_f32", "div.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("scale_f32", "scale.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("sqr_f32", "square.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("clamp_f32", "clamp.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("gelu_f32", "gelu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("silu_f32", "silu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("relu_f32", "relu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("soft_max_f32", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("soft_max_f32_f16", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float16_t", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_norm_f32", "rope_norm.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_norm_f16", "rope_norm.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("rope_neox_f32", "rope_neox.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_neox_f16", "rope_neox.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("argsort_f32", "argsort.comp", {"A_TYPE": "float"}))
tasks.append(string_to_spv("sum_rows_f32", "sum_rows.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
# Helper to decorate tasks with semaphore acquisition.
async def withSemaphore(sem, task):
async with sem:
return await task
# Run tasks concurrently guarded by a concurrency limit.
sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))
with open("ggml-vulkan-shaders.hpp", "w") as f:
f.write("#include <cstdint>\n\n")
for name, path in sorted(shader_fnames):
with open(path, "rb") as spv:
counter = 0
newline_counter = 0
f.write(f"unsigned char {name}_data[] = {{\n")
for val in spv.read():
f.write(f"0x{val:02x},")
newline_counter += 1
counter += 1
if newline_counter >= 12:
newline_counter = 0
f.write("\n")
f.write("\n};\n")
f.write(f"const uint64_t {name}_len = {counter};\n\n")
os.remove(path)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator")
parser.add_argument("--glslc", help="Path to glslc")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
if args.glslc:
GLSLC = args.glslc
asyncio.run(main())

View file

@ -29,6 +29,7 @@ extern "C" {
enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
};
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
@ -42,6 +43,7 @@ extern "C" {
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
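// A minimal usage sketch (an assumption, not part of this diff): tag a scratch
// buffer as compute-only and read the usage back through the new getter.
//   ggml_backend_buffer_set_usage(buffer, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
//   enum ggml_backend_buffer_usage usage = ggml_backend_buffer_get_usage(buffer);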

ggml/include/ggml-cann.h Normal file
View file

@ -0,0 +1,125 @@
/*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#pragma once
#include "ggml-backend.h"
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Maximum number of CANN devices supported.
*/
#define GGML_CANN_MAX_DEVICES 16
/**
* @brief Initializes the CANN backend for a specified device.
*
* This function initializes the CANN backend for the given device.
* It verifies the device index, allocates a context, and creates a backend
* instance.
*
* @param device The index of the device to initialize.
* @return A pointer to the initialized backend instance, or nullptr on failure.
*/
GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device);
/**
* @brief Checks if a given backend is a CANN backend.
*
* This function verifies if the provided backend is a CANN backend by comparing
* its GUID with the CANN backend's GUID.
*
* @param backend The backend instance to check.
* @return True if the backend is a CANN backend, false otherwise.
*/
GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend);
/**
* @brief Retrieves the CANN buffer type for a specified device.
*
* This function initializes and returns the buffer type interface associated
* with the given device. It ensures thread-safe access using a mutex.
*
* @param device The device index for which to retrieve the buffer type.
* @return A pointer to the buffer type interface for the specified device, or
* nullptr if the device index is out of range.
*/
GGML_API GGML_CALL ggml_backend_buffer_type_t
ggml_backend_cann_buffer_type(int32_t device);
/**
* @brief Retrieves the number of CANN devices available.
*
* This function returns the number of CANN devices available based on
* information obtained from `ggml_cann_info()`.
*
* @return The number of CANN devices available.
*/
GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
/**
* @brief Retrieves the description of a specific CANN device.
*
* This function sets the specified device, retrieves the SoC name,
* and writes it into the provided description buffer.
*
* @param device The device index to retrieve the description for.
* @param description Pointer to a buffer where the description will be written.
* @param description_size Size of the description buffer.
*/
GGML_API GGML_CALL void ggml_backend_cann_get_device_description(
int32_t device, char* description, size_t description_size);
/**
* @brief Retrieves the memory information of a specific CANN device.
*
* This function sets the specified device, retrieves the free and total
* memory information of the specified type (ACL_HBM_MEM), and stores them
* in the provided pointers.
*
* @param device The device index to retrieve memory information for.
* @param free Pointer to a variable where the free memory size will be stored.
* @param total Pointer to a variable where the total memory size will be
* stored.
*/
GGML_API GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device,
size_t* free,
size_t* total);
/**
* @brief Set the logging callback for GGML.
*
* This function sets the logging callback and user data for logging.
*
* @param log_callback The logging callback to set.
* @param user_data User data to pass to the logging callback.
*/
GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
void* user_data);
#ifdef __cplusplus
}
#endif
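A minimal sketch (not part of the diff) of how the declarations above could be exercised to enumerate CANN devices; it assumes a CANN-enabled build and calls only functions declared in this header.

#include "ggml-cann.h"
#include <cstdio>

int main() {
    int32_t n_dev = ggml_backend_cann_get_device_count();
    for (int32_t dev = 0; dev < n_dev; dev++) {
        char desc[128];
        ggml_backend_cann_get_device_description(dev, desc, sizeof(desc));
        size_t free = 0, total = 0;
        ggml_backend_cann_get_device_memory(dev, &free, &total);
        // one line per device: index, SoC name, memory usage
        printf("CANN device %d: %s (%zu free / %zu total bytes)\n", dev, desc, free, total);
    }
    return 0;
}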

View file

@ -383,6 +383,9 @@ extern "C" {
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_COUNT,
};
@ -424,6 +427,9 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
};
// available tensor operations:
@ -708,7 +714,7 @@ extern "C" {
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type);
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
@ -747,6 +753,8 @@ extern "C" {
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
// use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void);
@ -2391,6 +2399,8 @@ extern "C" {
GGML_API int ggml_cpu_has_rpc (void);
GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void);
GGML_API int ggml_cpu_has_cann (void);
GGML_API int ggml_cpu_has_llamafile (void);
//
// Internal types and functions exposed for tests and benchmarks
@ -2404,20 +2414,31 @@ extern "C" {
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef struct {
const char * type_name;
int blck_size;
int64_t blck_size;
int64_t blck_size_interleave; // interleave elements in blocks
size_t type_size;
bool is_quantized;
ggml_to_float_t to_float;
ggml_from_float_t from_float;
ggml_from_float_t from_float_reference;
ggml_from_float_t from_float_ref;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
} ggml_type_traits_t;
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
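// A hypothetical caller-side sketch (not from this diff) of querying the extended
// traits, e.g. to check whether a type provides an interleaved GEMM kernel:
//   ggml_type_traits_t tt = ggml_internal_get_type_traits(GGML_TYPE_Q4_0_4_4);
//   if (tt.gemm != NULL) {
//       tt.gemm(n, s, bs, x, y, nr, nc);  // arguments as defined by ggml_gemm_t above
//   }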

View file

@ -238,12 +238,12 @@ if (GGML_BLAS)
endif()
if (GGML_LLAMAFILE)
message(STATUS "Using ggml SGEMM")
message(STATUS "Using llamafile")
add_compile_definitions(GGML_USE_LLAMAFILE)
set(GGML_HEADERS_LLAMAFILE sgemm.h)
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
set(GGML_HEADERS_LLAMAFILE llamafile/sgemm.h)
set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
endif()
if (GGML_CUDA)
@ -440,6 +440,10 @@ if (GGML_HIPBLAS)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
if (GGML_CUDA_FORCE_CUBLAS)
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
endif()
if (GGML_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
@ -463,15 +467,18 @@ if (GGML_SYCL)
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
endif()
if ( NOT DEFINED ENV{ONEAPI_ROOT})
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
if ( DEFINED ENV{ONEAPI_ROOT})
message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
elseif(SUPPORTS_SYCL)
message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
source /opt/intel/oneapi/setvars.sh")
else()
message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
endif()
#todo: AOT
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
message(STATUS "SYCL found")
#todo: AOT
list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)
@ -483,11 +490,9 @@ if (GGML_SYCL)
add_compile_definitions(GGML_SYCL_FORCE_MMQ)
endif()
add_compile_options(-I./) #include DPCT
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
else()
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
@ -500,14 +505,14 @@ if (GGML_SYCL)
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
if (WIN32)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
else()
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
if (GGML_SYCL_TARGET STREQUAL "INTEL")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
endif()
endif()
@ -527,14 +532,11 @@ if (GGML_RPC)
endif()
if (GGML_VULKAN)
find_package(Vulkan)
find_package(Vulkan COMPONENTS glslc REQUIRED)
if (Vulkan_FOUND)
message(STATUS "Vulkan found")
set(GGML_HEADERS_VULKAN ../include/ggml-vulkan.h)
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_VULKAN)
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
@ -563,7 +565,37 @@ if (GGML_VULKAN)
add_compile_definitions(GGML_VULKAN_RUN_TESTS)
endif()
add_subdirectory(vulkan-shaders)
set (_ggml_vk_genshaders_cmd vulkan-shaders-gen)
set (_ggml_vk_header ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
set (_ggml_vk_source ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
set (_ggml_vk_input_dir ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
add_custom_command(
OUTPUT ${_ggml_vk_header}
${_ggml_vk_source}
COMMAND ${_ggml_vk_genshaders_cmd}
--glslc ${Vulkan_GLSLC_EXECUTABLE}
--input-dir ${_ggml_vk_input_dir}
--output-dir ${_ggml_vk_output_dir}
--target-hpp ${_ggml_vk_header}
--target-cpp ${_ggml_vk_source}
--no-clean
DEPENDS ${_ggml_vk_shader_deps}
COMMENT "Generate vulkan shaders"
)
set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header})
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source})
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan)
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
else()
message(WARNING "Vulkan not found")
endif()
@ -739,6 +771,74 @@ if (GGML_CPU_HBM)
target_link_libraries(ggml PUBLIC memkind)
endif()
if (GGML_CANN)
if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
endif()
if (CANN_INSTALL_DIR)
# Only Support Linux.
if (GGML_CANN)
if (NOT UNIX)
set(GGML_CANN OFF)
message(WARNING "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_CANN")
endif()
endif()
# Supported platforms: x86-64, arm64
if (GGML_CANN)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
else()
set(GGML_CANN OFF)
message(WARNING "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_CANN")
endif()
endif()
# Set header and libs
if(GGML_CANN)
set(CANN_INCLUDE_DIRS
${CANN_INSTALL_DIR}/include
${CANN_INSTALL_DIR}/include/aclnn
${CANN_INSTALL_DIR}/acllib/include
)
# TODO: find libs
link_directories(
${CANN_INSTALL_DIR}/lib64
)
add_subdirectory(ggml-cann/kernels)
list(APPEND CANN_LIBRARIES
ascendcl
nnopbase
opapi
acl_op_compiler
ascendc_kernels
)
set(GGML_HEADERS_CANN "../include/ggml-cann.h")
file(GLOB GGML_SOURCES_CANN "ggml-cann/*.cpp")
list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp")
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} )
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
endif()
else()
set(GGML_CANN OFF)
message(WARNING "CANN: Can't find CANN_INSTALL_DIR, do you forget to source set_var.sh. Turning off GGML_CANN")
endif()
if(NOT GGML_CANN)
message(WARNING "CANN: GGML_CANN is turned OFF, see above for details.")
endif()
endif()
function(get_flags CCID CCVER)
set(C_FLAGS "")
set(CXX_FLAGS "")
@ -1153,6 +1253,8 @@ add_library(ggml
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
ggml-aarch64.c ggml-aarch64.h
)
if (EMSCRIPTEN)

ggml/src/ggml-aarch64.c Normal file

File diff suppressed because it is too large

ggml/src/ggml-aarch64.h Normal file
View file

@ -0,0 +1,39 @@
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#ifdef __cplusplus
}
#endif

View file

@ -776,6 +776,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
}
}

View file

@ -134,6 +134,10 @@ void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backe
}
}
enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage(ggml_backend_buffer_t buffer) {
return buffer->usage;
}
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
return buffer->buft;
}
@ -394,7 +398,7 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
// backend registry
#define GGML_REG_MAX_BACKENDS 16
#define GGML_REG_MAX_BACKENDS 64
struct ggml_backend_reg {
char name[128];
@ -445,6 +449,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
ggml_backend_kompute_reg_devices();
#endif
#ifdef GGML_USE_CANN
extern GGML_CALL int ggml_backend_cann_reg_devices(void);
ggml_backend_cann_reg_devices();
#endif
}
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {

View file

@ -8,11 +8,12 @@
# include <Accelerate/Accelerate.h>
#elif defined(GGML_BLAS_USE_MKL)
# include <mkl.h>
#elif defined(GGML_BLAS_USE_BLIS)
# include <blis.h>
#elif defined(GGML_BLAS_USE_NVPL)
# include <nvpl_blas.h>
#else
# include <cblas.h>
# ifdef BLIS_ENABLE_CBLAS
# include <blis.h>
# endif
#endif
struct ggml_backend_blas_context {
@ -140,10 +141,14 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
openblas_set_num_threads(ctx->n_threads);
#endif
#if defined(BLIS_ENABLE_CBLAS)
#if defined(GGML_BLAS_USE_BLIS)
bli_thread_set_num_threads(ctx->n_threads);
#endif
#if defined(GGML_BLAS_USE_NVPL)
nvpl_blas_set_num_threads(ctx->n_threads);
#endif
for (int64_t i13 = 0; i13 < ne13; i13++) {
for (int64_t i12 = 0; i12 < ne12; i12++) {
const int64_t i03 = i13/r3;

ggml/src/ggml-cann.cpp Normal file

File diff suppressed because it is too large

View file

@ -0,0 +1,168 @@
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveMacros: false
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortLambdasOnASingleLine: All
AllowShortIfStatementsOnASingleLine: WithoutElse
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DeriveLineEnding: true
DerivePointerAlignment: true
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<ext/.*\.h>'
Priority: 2
SortPriority: 0
- Regex: '^<.*\.h>'
Priority: 1
SortPriority: 0
- Regex: '^<.*'
Priority: 2
SortPriority: 0
- Regex: '.*'
Priority: 3
SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentCaseLabels: true
IndentGotoLabels: true
IndentPPDirectives: None
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
BasedOnStyle: google
- Language: TextProto
Delimiters:
- pb
- PB
- proto
- PROTO
EnclosingFunctions:
- EqualsProto
- EquivToProto
- PARSE_PARTIAL_TEXT_PROTO
- PARSE_TEST_PROTO
- PARSE_TEXT_PROTO
- ParseTextOrDie
- ParseTextProtoOrDie
CanonicalDelimiter: ''
BasedOnStyle: google
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInConditionalStatement: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: Auto
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
TabWidth: 8
UseCRLF: false
UseTab: Never
...

ggml/src/ggml-cann/Doxyfile Normal file

File diff suppressed because it is too large

View file

@ -0,0 +1,198 @@
/*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "acl_tensor.h"
#include <algorithm>
#include <cstring>
aclDataType ggml_cann_type_mapping(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return ACL_FLOAT;
case GGML_TYPE_F16:
return ACL_FLOAT16;
case GGML_TYPE_I8:
return ACL_INT8;
case GGML_TYPE_I16:
return ACL_INT16;
case GGML_TYPE_I32:
return ACL_INT32;
default:
return ACL_DT_UNDEFINED;
}
return ACL_DT_UNDEFINED;
}
aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
size_t* nb, int64_t dims, aclFormat format,
size_t offset) {
// If tensor is bcasted, up to GGML_MAX_DIMS additional dimensions will be
// added.
int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
int64_t acl_storage_len = 0;
if (ne == nullptr) {
acl_storage_len = ggml_nbytes(tensor);
for (int i = 0; i < GGML_MAX_DIMS; i++) {
acl_ne[i] = tensor->ne[i];
// The step size of acl is in elements.
acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor);
}
} else {
// With bcast
for (int i = 0; i < dims; i++) {
acl_storage_len += (ne[i] - 1) * nb[i];
acl_ne[i] = ne[i];
acl_stride[i] = nb[i] / ggml_element_size(tensor);
}
}
// Reverse ne and stride.
int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
std::reverse(acl_ne, acl_ne + final_dims);
std::reverse(acl_stride, acl_stride + final_dims);
aclTensor* acl_tensor = aclCreateTensor(
acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
tensor->data);
return acl_tensor;
}
bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) {
return true;
}
}
return false;
}
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
size_t type_size, int64_t* ne, size_t* nb,
int64_t dims, aclFormat format,
size_t offset) {
int64_t tmp_ne[GGML_MAX_DIMS * 2];
int64_t tmp_stride[GGML_MAX_DIMS * 2];
memcpy(tmp_ne, ne, dims * sizeof(int64_t));
for (int i = 0; i < dims; i++) {
tmp_stride[i] = nb[i] / type_size;
}
std::reverse(tmp_ne, tmp_ne + dims);
std::reverse(tmp_stride, tmp_stride + dims);
int64_t acl_storage_len = 0;
for (int i = 0; i < dims; i++) {
acl_storage_len += (ne[i] - 1) * nb[i];
}
aclTensor* acl_tensor =
aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
format, &acl_storage_len, 1, data_ptr);
return acl_tensor;
}
int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
const ggml_tensor* src1,
int64_t* bcast_src0_ne,
int64_t* bcast_src1_ne, size_t* bcast_src0_nb,
size_t* bcast_src1_nb) {
GGML_ASSERT(ggml_can_repeat(src1, src0));
int bcast_dim_cnt = 0;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
int64_t nr = src0->ne[i] / src1->ne[i];
bcast_src0_ne[bcast_dim_cnt] = src0->ne[i] / nr;
bcast_src1_ne[bcast_dim_cnt] = src1->ne[i];
bcast_src0_nb[bcast_dim_cnt] = src0->nb[i];
bcast_src1_nb[bcast_dim_cnt] = src1->nb[i];
bcast_dim_cnt++;
if (nr != 1) {
// Need to add an extra dim.
bcast_src0_ne[bcast_dim_cnt] = nr;
bcast_src1_ne[bcast_dim_cnt] = 1;
bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] *
bcast_src0_ne[bcast_dim_cnt - 1];
bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] *
bcast_src1_ne[bcast_dim_cnt - 1];
bcast_dim_cnt++;
}
}
return bcast_dim_cnt;
}
int64_t ggml_cann_get_mulmat_bcast_shape(
const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb) {
// input and dst should be in the same shape, except the first two dims.
GGML_ASSERT(input_ne[2] == dst_ne[2]);
GGML_ASSERT(input_ne[3] == dst_ne[3]);
int bcast_dim_cnt = 0;
// For mul_mat, a dimension needs to be added before the dimension that
// weight needs to be expanded to satisfy the bcast rule of matrix
// multiplication.
for (int i = 0; i < GGML_MAX_DIMS; i++) {
int64_t nr = input_ne[i] / weight_ne[i];
// Do not use bcast in the first two dimensions because we only support
// the bcast batch dimension. Just copy them.
if (i < 2 || nr == 1) {
bcast_input_ne[bcast_dim_cnt] = input_ne[i];
bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
bcast_dst_ne[bcast_dim_cnt] = dst_ne[i];
bcast_input_nb[bcast_dim_cnt] = input_nb[i];
bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
bcast_dim_cnt++;
} else {
// Need to add an extra dim.
bcast_input_ne[bcast_dim_cnt] = nr;
bcast_dst_ne[bcast_dim_cnt] = nr;
bcast_weight_ne[bcast_dim_cnt] = 1;
bcast_input_nb[bcast_dim_cnt] = input_nb[i];
bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
bcast_dim_cnt++;
bcast_input_ne[bcast_dim_cnt] = input_ne[i] / nr;
bcast_dst_ne[bcast_dim_cnt] = dst_ne[i] / nr;
bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
bcast_input_nb[bcast_dim_cnt] = bcast_input_nb[bcast_dim_cnt - 1] *
bcast_input_ne[bcast_dim_cnt - 1];
bcast_dst_nb[bcast_dim_cnt] = bcast_dst_nb[bcast_dim_cnt - 1] *
bcast_dst_ne[bcast_dim_cnt - 1];
bcast_weight_nb[bcast_dim_cnt] =
bcast_weight_nb[bcast_dim_cnt - 1] *
bcast_weight_ne[bcast_dim_cnt - 1];
bcast_dim_cnt++;
}
}
return bcast_dim_cnt;
}

View file

@ -0,0 +1,230 @@
/*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef CANN_ACL_TENSOR_H
#define CANN_ACL_TENSOR_H
#include <aclnn/aclnn_base.h>
#include "common.h"
/**
* @brief Maps a ggml_type to its corresponding aclDataType.
*
* @details This function takes a ggml_type as input and returns the corresponding
* aclDataType. It supports mapping for various ggml_types. If the input type
* does not match any of the predefined ggml_types, the function returns
* ACL_DT_UNDEFINED.
*
* @param type The ggml_type to be mapped.
* @return The corresponding aclDataType. If the input type is not recognized,
* ACL_DT_UNDEFINED is returned.
*/
aclDataType ggml_cann_type_mapping(ggml_type type);
/**
* @brief Creates an ACL tensor from a ggml_tensor with optional shape.
*
* @details This function creates an ACL tensor based on the properties of the
* provided ggml_tensor. It supports custom shape by adjusting dimensions
* and strides accordingly. If a custom shape is applied, additional
* dimensions and strides are calculated based on the provided parameters.
*
* @param tensor Pointer to the ggml_tensor to be converted to ACL tensor.
* @param ne Pointer to an array containing dimensions. Defaults to nullptr
* if no custom shape is applied.
* @param nb Pointer to an array containing strides. Defaults to nullptr
* if no custom shape is applied.
* @param dims Number of dimensions in the tensor. Defaults to 0 if no custom
* shape is applied.
* @param format ACL tensor format. Defaults to ACL_FORMAT_ND.
* @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
* @return Pointer to the created ACL tensor.
*/
aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = nullptr,
size_t* nb = nullptr, int64_t dims = 0,
aclFormat format = ACL_FORMAT_ND,
size_t offset = 0);
/**
* @brief Creates an ACL tensor from provided parameters.
*
* @details This function creates an ACL tensor using the provided data pointer,
* data type, dimensions, strides, format, offset, and additional parameters.
* It calculates necessary dimensions and strides based on the provided ne and nb
* arrays, adjusting them for the ACL tensor creation. The ACL storage length
* is also calculated based on the provided dimensions and strides.
*
* @param data_ptr Pointer to the data buffer for the ACL tensor.
* @param dtype ACL data type of the tensor.
* @param type_size Size of each element in the tensor data buffer.
* @param ne Pointer to an array containing tensor dimensions.
* @param nb Pointer to an array containing tensor strides.
* @param dims Number of dimensions of the tensor.
* @param format ACL tensor format. Defaults to ACL_FORMAT_ND.
* @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
* @return Pointer to the created ACL tensor.
*/
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
size_t type_size, int64_t* ne, size_t* nb,
int64_t dims, aclFormat format = ACL_FORMAT_ND,
size_t offset = 0);
/**
* @brief Checks if tensors require broadcasting based on their shapes.
*
* @details This function determines if two ggml_tensors need to be broadcasted for
* element-wise operations. Broadcasting is necessary if the shapes of the
* tensors are not identical and no dimension in either tensor equals 1.
*
* @param t0 Pointer to the first ggml_tensor.
* @param t1 Pointer to the second ggml_tensor.
* @return True if broadcasting is needed, False otherwise.
*
* @remarks This function iterates over the dimensions of t0 and t1. It checks if each
* dimension in t1 differs from t0's corresponding dimension and is not equal
* to 1. If such a dimension is found, broadcasting is required to align t1
* with t0 for element-wise operations.
*/
bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
/**
* @brief Computes broadcast shapes and strides for two ggml_tensors.
*
* @details This function calculates the broadcast shapes and strides for two ggml_tensors,
* following the broadcasting rules similar to numpy. It adjusts dimensions and
* strides to ensure compatibility for element-wise operations where one tensor
* can be broadcasted to match the shape of another tensor.
*
* @param src0 Pointer to the first ggml_tensor.
* @param src1 Pointer to the second ggml_tensor.
* @param bcast_ne_src0 Output array to store broadcasted dimensions for src0.
* @param bcast_ne_src1 Output array to store broadcasted dimensions for src1.
* @param bcast_nb_src0 Output array to store broadcasted strides for src0.
* @param bcast_nb_src1 Output array to store broadcasted strides for src1.
* @return Number of dimensions in the broadcasted shape.
*
* @pre ggml_can_repeat(src1, src0) must return true, indicating src1 can be broadcasted
* to match src0.
*
* @remarks This function iterates over the dimensions of src0 and src1, calculating the
* necessary broadcast dimensions and strides. If a dimension requires broadcasting
* (i.e., its size in src1 is smaller than in src0), an additional dimension is
* added with size calculated to match src0's dimension. This adjustment ensures
* that src1 can be element-wise broadcasted to src0's shape.
*
* How it works:
*
* \code
* if dim0 has padding.
* a -> (2, 2) padding = 2
* a: [[1, 2, *, *]
* [2, 3, *, *]]
* nb = (8, 4, 2)
*
* if a should bcast with b -> (2, 4)
* b' -> (2, 2, 2)
* b : [[1, 2, 3, 4, *, *]
* [5, 6, 7, 8, *, *]]
* nb = (12, 6, 1)
*
* after bcast:
* a' -> (2, 1, 2)
* a': [[[1, 2], *, *]
* [[2, 3], *, *]]
* nb = (8, 4, 2, 1)
*
* b' : [[[1, 2], [3, 4], *, *]
* [[5, 6], [7, 8], *, *]]
* nb = (12, 6, 2, 1)
* \endcode
*
 * dim1 in a is an inserted dimension, so an nb entry is added for dim1,
 * and all the other nb values shift to the next position in order.
*/
int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
size_t* bcast_nb_src0, size_t* bcast_nb_src1);
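/*
 * Usage sketch (illustrative only): calling ggml_cann_get_bcast_shape directly
 * and feeding the result to ggml_cann_create_tensor. `src0` and `src1` are
 * assumed to satisfy ggml_can_repeat(src1, src0); the BCAST_SHAPE/BCAST_PARAM
 * macros below wrap exactly this pattern.
 *
 * \code
 * int64_t bcast_ne0[GGML_MAX_DIMS * 2], bcast_ne1[GGML_MAX_DIMS * 2];
 * size_t  bcast_nb0[GGML_MAX_DIMS * 2], bcast_nb1[GGML_MAX_DIMS * 2];
 * int64_t n_dims = ggml_cann_get_bcast_shape(src0, src1, bcast_ne0, bcast_ne1,
 *                                            bcast_nb0, bcast_nb1);
 * aclTensor* acl_src0 = ggml_cann_create_tensor(src0, bcast_ne0, bcast_nb0, n_dims);
 * aclTensor* acl_src1 = ggml_cann_create_tensor(src1, bcast_ne1, bcast_nb1, n_dims);
 * \endcode
 */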
// Bcast macro to avoid duplicate code.
#define BCAST_SHAPE(src0, src1) \
int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2]; \
int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2]; \
size_t bcast_##src0##_nb[GGML_MAX_DIMS * 2]; \
size_t bcast_##src1##_nb[GGML_MAX_DIMS * 2]; \
int64_t bcast_dims = ggml_cann_get_bcast_shape( \
src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, bcast_##src0##_nb, \
bcast_##src1##_nb);
#define BCAST_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
/**
* @brief Calculates broadcast shapes for matrix multiplication.
*
* @details This function computes the broadcast shapes required for matrix multiplication
* based on the input, weight, and destination tensor shapes. It ensures that the
* dimensions of weight tensors are expanded appropriately to satisfy matrix
* multiplication broadcast rules.
*
* @param input_ne Array containing the dimensions of the input tensor.
* @param weight_ne Array containing the dimensions of the weight tensor.
* @param dst_ne Array containing the dimensions of the destination tensor.
* @param input_nb Array containing the strides of the input tensor.
* @param weight_nb Array containing the strides of the weight tensor.
* @param dst_nb Array containing the strides of the destination tensor.
* @param bcast_input_ne Output array for broadcasted input tensor dimensions.
* @param bcast_weight_ne Output array for broadcasted weight tensor dimensions.
* @param bcast_dst_ne Output array for broadcasted destination tensor dimensions.
* @param bcast_input_nb Output array for broadcasted input tensor strides.
* @param bcast_weight_nb Output array for broadcasted weight tensor strides.
* @param bcast_dst_nb Output array for broadcasted destination tensor strides.
* @return The number of dimensions in the broadcasted tensors.
*
* @remarks This function iterates over the tensor dimensions and calculates the broadcast
* shapes needed for matrix multiplication. It ensures that dimensions where
* weight tensor requires expansion are appropriately handled to conform with
* broadcasting rules.
 * @note Compared with ggml_cann_get_bcast_shape, mul_mat broadcasting needs to
 * add this new dimension before the broadcast dimension.
* @sa ggml_cann_get_bcast_shape
*/
int64_t ggml_cann_get_mulmat_bcast_shape(
const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb);
// Bcast macro to avoid duplicate code.
#define BCAST_MUL_MAT_SHAPE(input, weight, dst) \
int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2]; \
int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2]; \
int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2]; \
size_t bcast_##input##_nb[GGML_MAX_DIMS * 2]; \
size_t bcast_##weight##_nb[GGML_MAX_DIMS * 2]; \
size_t bcast_##dst##_nb[GGML_MAX_DIMS * 2]; \
int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape( \
input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, \
bcast_##input##_ne, bcast_##weight##_ne, bcast_##dst##_ne, \
bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);
#define BCAST_MUL_MAT_PARAM(tensor) \
bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
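/*
 * Usage sketch (illustrative only): the intended pairing of the mul_mat
 * broadcast macros with ggml_cann_create_tensor inside a matrix-multiplication
 * kernel. `input`, `weight` and `dst` are placeholder ggml_tensor pointers.
 *
 * \code
 * BCAST_MUL_MAT_SHAPE(input, weight, dst)
 * aclTensor* acl_input  = ggml_cann_create_tensor(input,  BCAST_MUL_MAT_PARAM(input));
 * aclTensor* acl_weight = ggml_cann_create_tensor(weight, BCAST_MUL_MAT_PARAM(weight));
 * aclTensor* acl_dst    = ggml_cann_create_tensor(dst,    BCAST_MUL_MAT_PARAM(dst));
 * // ... run the aclnn matmul on acl_input/acl_weight/acl_dst, then destroy them.
 * \endcode
 */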
#endif // CANN_ACL_TENSOR_H

File diff suppressed because it is too large

View file

@@ -0,0 +1,592 @@
#ifndef CANN_ACLNN_OPS
#define CANN_ACLNN_OPS
/**
 * @file   aclnn_ops
 * @brief  Declarations of the CANN (aclnn) implementations of ggml operators:
 *         element-wise ops, activations, normalizations, matrix multiplication
 *         and the related helper templates.
* @author hipudding <huafengchun@gmail.com>
* @author wangshuai09 <391746016@qq.com>
* @date July 15, 2024
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <aclnnop/aclnn_add.h>
#include <aclnnop/aclnn_arange.h>
#include <aclnnop/aclnn_argsort.h>
#include <aclnnop/aclnn_cat.h>
#include <aclnnop/aclnn_clamp.h>
#include <aclnnop/aclnn_div.h>
#include <aclnnop/aclnn_gelu.h>
#include <aclnnop/aclnn_hardsigmoid.h>
#include <aclnnop/aclnn_hardswish.h>
#include <aclnnop/aclnn_leaky_relu.h>
#include <aclnnop/aclnn_mul.h>
#include <aclnnop/aclnn_relu.h>
#include <aclnnop/aclnn_silu.h>
#include <aclnnop/aclnn_tanh.h>
#include "acl_tensor.h"
#include "common.h"
/**
* @brief Repeats a ggml tensor along each dimension to match the dimensions
* of another tensor.
*
* @details This function repeats the elements of a source ggml tensor along
* each dimension to create a destination tensor with the specified
* dimensions. The operation is performed using the ACL backend and
* executed asynchronously on the device.
*
* @param ctx The CANN context used for operations.
* @param dst The ggml tensor representing the destination, which op is
* GGML_OP_REPEAT and specifies the desired dimensions.
*/
void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Adds two ggml tensors using the CANN backend.
*
* @details This function performs an element-wise addition of two tensors. In
* case the tensors do not have the same shape, one or both tensors
* will be broadcasted to match the shape of the other before the
 * addition is performed. The formula for the operation is given by:
* \f[
* \text{dst} = \text{acl_src0} + \alpha \cdot \text{acl_src1}
* \f]
*
* @param ctx The CANN context used for operations.
* @param dst The ggml tensor representing the destination, result of the
* addition is stored at dst->data, and dst->op is `GGML_OP_ADD`
*/
void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Applies the Leaky ReLU activation function to a tensor using the CANN
* backend.
*
* @details This function computes the Leaky ReLU activation for each element of
* the input tensor. The Leaky ReLU function allows a small gradient
* when the unit is not active (i.e., when the input is negative). The
* Leaky ReLU function is defined as:
* \f[
* \text{dst} = \max(0, src) + \text{negativeSlope} \cdot \min(0,
* src)
* \f]
* `negativeSlope` is in dst->params.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the result of the Leaky ReLU
* activation is stored, which op is `GGML_OP_LEAKY_RELU`
*/
void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Concatenates multiple tensors along a specified dimension using the
* CANN backend.
*
* @param ctx The CANN context used for operations.
* @param tensorList A pointer to the list of tensors to be concatenated.
* @param dst The destination tensor where the result of the
* concatenation is stored. dst->op is `GGML_OP_CONCAT`.
* @param concat_dim The dimension along which the tensors are concatenated.
*
 * @attention tensorList length should be 2, and the dimension used for
 * concatenation defaults to 1.
*/
void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Generates a sequence of evenly spaced values within a specified
* interval for a ggml tensor using the CANN backend.
*
 * @details This function creates a sequence of numbers over a specified
 * interval, starting from `start`, ending before `stop`, and
* incrementing by `step`. The sequence is stored in the destination
* tensor `dst`.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the generated sequence will be stored.
 * `start`, `stop` and `step` are in dst->op_params and dst->op is
* `GGML_OP_ARANGE`.
*/
void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the square of the elements of a ggml tensor using the CANN
* backend.
* @details The function sets the second source tensor of the destination
* tensor `dst` to be equal to the first source tensor. This is
* effectively squaring the elements since the multiplication becomes
* `element * element`.
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the squared values will be stored
* which dst->op is `GGML_OP_SQR`.
*/
void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Applies a clamp operation to the elements of a ggml tensor using the
* CANN backend.
*
* @details This function clamps the elements of the input tensor `src` to a
* specified range defined by `min` and `max` values. The result is
* stored in the destination tensor `dst`. The operation is defined as:
* \f[
* y = \max(\min(x, max\_value), min\_value)
* \f]
* where `x` is an element of the input tensor, and `y` is the
* corresponding element in the output tensor.
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the clamped values will be stored.
* dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params.
*/
void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Scales the elements of a ggml tensor by a constant factor using the
* CANN backend.
*
* @details This function multiplies each element of the input tensor `src` by
* a scaling factor `scale`, storing the result in the destination
* tensor `dst`. The operation is defined as:
* \f[
* dst = src \times scale
* \f]
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the scaled values will be stored.
* dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params.
*/
void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Sorts the elements of a ggml tensor and returns the indices that
* would sort the tensor using the CANN backend.
*
* @details This function performs an argsort operation on the input tensor
* `src`. It sorts the elements of `src` in either ascending or
 * descending order, depending on the sort order (`GGML_SORT_ORDER_ASC` or
 * `GGML_SORT_ORDER_DESC`),
* and returns the indices that would sort the original tensor.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the sorted indices will be stored.
* dst->op is `GGML_OP_ARGSORT`.
*/
void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the Layer Normalization for a ggml tensor using the CANN
* backend.
*
* @details This function applies the Layer Normalization operation on the
* input tensor `src` and stores the result in the destination tensor
* `dst`. Layer Normalization normalizes the features at each sample in
* a mini-batch independently. It is commonly used in neural networks
* to normalize the activations of a layer by adjusting and scaling
* the outputs.
* The operation is defined as:
* \f[
* \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
* \f]
 * `Var` defaults to dst->ne[0]. `eps` is in dst->params.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the normalized values will be stored.
* @attention `Var` defaults to dst->ne[0].
*/
void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the Group Normalization for a ggml tensor using the CANN
* backend.
*
 * @details This function applies the Group Normalization operation on the input
* tensor `src` and stores the result in the destination tensor `dst`.
* Group Normalization divides the channels into groups and normalizes
* the features within each group across spatial locations.
* It is commonly used in convolutional neural networks to improve
* training stability and performance.
* The operation is defined as:
* \f[
* \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
* \f]
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the normalized values will be stored.
* `n_groups` is in dst->params, which split C channel to `n_groups`.
* dst->op is `GGML_OP_GROUP_NORM`.
*
* @attention eps defaults to 1e-6f.
*/
void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the accumulation of tensors using the CANN backend.
*
* @details This function performs an accumulation operation on two tensors.
* Depending on the `inplace` flag, it either updates the destination
* tensor `dst` in place by adding `alpha * src1` to it, or it creates
* a new tensor as the result of `src0 + alpha * src1` and stores it in
* `dst`.
* The operation is defined as:
* \f[
* dst = src0 + alpha \times src1
* \f]
 * if `inplace` is `true`, `src0` is equal to `dst`.
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the accumulated values will be stored.
* `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`.
*/
void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the sum of elements along the last dimension of a ggml tensor
* using the CANN backend.
*
* @details This function performs a reduction sum operation along the last
* dimension of the input tensor `src`. The result of the sum is stored
* in the destination tensor `dst`.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the reduced values will be stored
* dst->op is `GGML_OP_SUM_ROWS`.
*
* @attention `reduce_dims` defaults to 3, which means the last dimension.
*/
void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Upsamples a ggml tensor using nearest neighbor interpolation using
* the CANN backend.
*
* @details This function performs upsampling of the input tensor `src` using
* nearest neighbor interpolation. The upsampling is applied to the
* height and width dimensions (last two dimensions) of the tensor. The
* result is stored in the destination tensor `dst`, which must have
* the appropriate dimensions for the upsampled output.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the upsampled values will be stored.
* dst->op is `GGML_OP_UPSCALE`.
*/
void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
ggml_tensor* dst);
/**
* @brief Pads a ggml tensor to match the dimensions of the destination tensor
* using the CANN backend.
*
* @details This function pads the input tensor `src` so that it matches the
* dimensions of the destination tensor `dst`. The amount of padding
* is calculated based on the difference in sizes between `src` and
* `dst` along each dimension. The padded tensor is stored in `dst`.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor, which specifies the target dimensions for
* padding. dst->op is `GGML_OP_PAD`.
*/
void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Executes a 2D pooling operation on a ggml tensor using the CANN
* backend.
*
* @details This function dispatches the execution of a 2D pooling operation on
* the input tensor `dst`. The type of pooling (average or max) is
* determined by the `op` parameter, which is read from the operation
* parameters of `dst`. The function supports average pooling
* (`GGML_OP_POOL_AVG`) and max pooling (`GGML_OP_POOL_MAX`). If an
* invalid operation is encountered, the function asserts a failure.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor on which the pooling operation is to be
* performed. dst->op is `GGML_OP_POOL_2D`.
*/
void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Duplicates a ggml tensor using the CANN backend.
*
* @details This function duplicates the contents of the source tensor `src` to
* the destination tensor `dst`. The function supports various tensor
* types and configurations, including handling of extra data, type
* conversions, and special cases for contiguous and non-contiguous
* tensors.
*
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the duplicated data will be stored.
* dst->op is `GGML_OP_DUP`
*
 * @attention Only FP16/FP32 are supported. The case where src and dst have
 * different shapes and dst is non-contiguous is not supported.
 * @note This function needs to be simplified.
*/
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the Root Mean Square (RMS) normalization of a ggml tensor
* using the CANN backend.
*
* @details This function applies RMS normalization to the input tensor `src`
* and stores the result in the destination tensor `dst`. RMS
* normalization involves computing the root mean square of the input
* tensor along a specified dimension and then dividing each element of
* the tensor by this value, adjusted by a small epsilon value to
* prevent division by zero.
* The operation is defined as:
* \f[
* \text{RmsNorm}\left(x_i\right)=\frac{x_i}{\text{Rms}(\mathbf{x})} g_i,
* \quad \text { where } \text{Rms}(\mathbf{x})=\sqrt{\frac{1}{n} \sum_{i=1}^n x_i^2+e p s}
* \f]
* `eps` is in dst->op_params.
* @param ctx The CANN context used for operations.
* @param dst The destination tensor where the normalized values will be stored.
* dst->op is `GGML_OP_RMS_NORM`.
*/
void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Applies a diagonal mask to the tensor with a specified value.
*
* @details This function creates a mask tensor filled with ones, then applies
* an upper triangular and lower triangular operation to it based on
* the number of past elements specified. Afterward, it adds the masked
* tensor to the destination tensor in-place.
*
* @param ctx The backend CANN context used for operations.
* @param dst The destination tensor where the result will be stored. dst->op is
 * `GGML_OP_DIAG_MASK_INF` or `GGML_OP_DIAG_MASK_ZERO`.
* @param value The value to use for masking.
*/
void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value);
/**
* @brief Performs an image-to-column transformation on the input tensor.
*
* @details This function takes an input tensor and applies an image-to-column
* operation, converting spatial dimensions into column-like
* structures suitable for convolutional operations. It supports both
* half-precision (F16) and single-precision (F32) floating-point data
* types.
*
* @param ctx The backend CANN context for executing operations.
* @param dst The destination tensor that stores the result of the operation.
* dst->op is `GGML_OP_IM2COL`.
*/
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes time step embeddings using sine and cosine functions.
*
* @details This function calculates time step embeddings by applying sine and
* cosine transformations to a given input tensor, which is typically
* used in temporal models like diffusion models or transformers to
* encode time information effectively.
*
* @param ctx The backend CANN context for executing operations.
* @param dst The destination tensor where the result of the embedding operation
* will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`.
*/
void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst);
// @see ggml_cann_dup.
void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Computes the softmax activation with optional masking.
*
* @details This function computes the softmax activation over the input tensor,
* optionally applying a mask and scaling factor. It supports both FP16
* and FP32 data types and can handle masking by broadcasting the mask
* across rows if necessary.
* The function performs the following steps:
* 1. Multiplies the input tensor by a scale factor.
* 2. Optionally casts the mask tensor to FP32 if it is in FP16 format.
* 3. Broadcasts the mask tensor if its dimensions do not match the
* input tensor's dimensions.
* 4. Adds the mask to the scaled input tensor.
* 5. Applies the softmax activation function along the specified
* dimension.
*
* @param ctx The backend CANN context for executing operations.
* @param dst The destination tensor where the result will be stored. dst->op is
 * `GGML_OP_SOFT_MAX`.
*/
void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Extracts specific rows from a tensor based on indices.
*
* @details This function retrieves rows from a source tensor src0 according to
* the indices provided in another tensor src1 and stores the result in
* a destination tensor (\p dst). It supports different data types
* including F32, F16, Q4_0, and Q8_0.
*
* @param ctx The backend CANN context for executing operations.
* @param dst The destination tensor where the extracted rows will be stored.
* dst->op is `GGML_OP_GET_ROWS`.
*/
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Executes matrix multiplication for the given tensor.
*
* @details This function performs matrix multiplication on the source tensors
* associated with the destination tensor. It supports matrix
* multiplication F32, F16, and Q8_0.
*
* @param ctx The backend CANN context for executing operations.
* @param dst The destination tensor for storing the result of the matrix
* multiplication. dst->op is `GGML_OP_MUL_MAT`.
*/
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
/**
* @brief Applies Rotary Positional Embedding (RoPE) to the input tensor.
*
* @details This function implements the RoPE mechanism, which is a method to
* encode positional information into sequence data, particularly
* useful in transformer models. It supports both F32 and F16 data
* types.
*
* @param ctx The backend CANN context for executing operations.
* @param dst The destination tensor where the RoPE-transformed data will be
* stored. dst->op is `GGML_OP_ROPE`.
*
* @note The function currently does not support cases where the n_dims is less
* than the input tensor's first dimension.
* @note The function currently does not support cases where the freq_factors is
* not NULL.
 * @note The function currently does not support cases where the ext_factor is
 * not equal to 0.
 * @note The function currently does not support cases where the freq_scale is
 * not equal to 1.
*/
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
aclTensor*, uint64_t*, aclOpExecutor**),
aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
void ggml_cann_mul_div(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src0 = dst->src[0];
ggml_tensor* src1 = dst->src[1];
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
aclTensor* acl_src0;
aclTensor* acl_src1;
aclTensor* acl_dst;
// Need bcast
if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
BCAST_SHAPE(src0, src1)
acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
} else {
acl_src0 = ggml_cann_create_tensor(src0);
acl_src1 = ggml_cann_create_tensor(src1);
acl_dst = ggml_cann_create_tensor(dst);
}
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
ACL_CHECK(getWorkspaceSize(acl_src0, acl_src1, acl_dst, &workspaceSize,
&executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
aclrtStream main_stream = ctx.stream();
ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
ACL_CHECK(aclDestroyTensor(acl_src0));
ACL_CHECK(aclDestroyTensor(acl_src1));
ACL_CHECK(aclDestroyTensor(acl_dst));
}
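// Illustrative instantiations (a sketch, not upstream code): the element-wise
// multiply and divide ops are expected to reuse this template with the matching
// aclnn entry points, assuming their signatures in aclnn_mul.h / aclnn_div.h
// follow the usual GetWorkspaceSize/execute pattern:
//   ggml_cann_mul_div<aclnnMulGetWorkspaceSize, aclnnMul>(ctx, dst);
//   ggml_cann_mul_div<aclnnDivGetWorkspaceSize, aclnnDiv>(ctx, dst);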
// Activation functions template.
template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
aclOpExecutor**),
aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
const aclrtStream)>
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];
GGML_ASSERT(src->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
aclTensor* acl_src = ggml_cann_create_tensor(src);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
aclrtStream main_stream = ctx.stream();
ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
ACL_CHECK(aclDestroyTensor(acl_src));
ACL_CHECK(aclDestroyTensor(acl_dst));
}
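// Illustrative instantiations (a sketch, not upstream code): unary activations
// such as ReLU or SiLU can plug into this template, assuming the signatures
// declared in aclnn_relu.h / aclnn_silu.h:
//   ggml_cann_activation<aclnnReluGetWorkspaceSize, aclnnRelu>(ctx, dst);
//   ggml_cann_activation<aclnnSiluGetWorkspaceSize, aclnnSilu>(ctx, dst);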
// Activation functions template for const aclTensors.
template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
uint64_t*, aclOpExecutor**),
aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
const aclrtStream)>
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];
GGML_ASSERT(src->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
aclTensor* acl_src = ggml_cann_create_tensor(src);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
aclrtStream main_stream = ctx.stream();
ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
ACL_CHECK(aclDestroyTensor(acl_src));
ACL_CHECK(aclDestroyTensor(acl_dst));
}
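// Illustrative instantiation (a sketch, not upstream code): this overload
// targets aclnn kernels whose workspace query takes a const output tensor;
// assuming GELU is declared that way in aclnn_gelu.h, it would be used as:
//   ggml_cann_activation<aclnnGeluGetWorkspaceSize, aclnnGelu>(ctx, dst);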
#endif // CANN_ACLNN_OPS

ggml/src/ggml-cann/common.h Normal file
View file

@@ -0,0 +1,282 @@
/*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef CANN_COMMON_H
#define CANN_COMMON_H
#include <acl/acl.h>
#include <cstdio>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "../include/ggml-cann.h"
#include "../include/ggml.h"
#define MATRIX_ROW_PADDING 512
#define GGML_CANN_MAX_STREAMS 8
/**
* @brief Handles CANN-related errors by printing an error message and
* terminating the program.
* @param stmt The statement that caused the error.
* @param func The function in which the error occurred.
* @param file The file in which the error occurred.
* @param line The line number at which the error occurred.
* @param msg The error message.
*/
[[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
const char* file, int line, const char* msg);
/**
* @brief Checks the result of a CANN function call and invokes the error
* handler if the call fails.
* @param stmt The CANN function call to check.
* @param success The success code that indicates the call was successful.
* @param error_fn The function to call to retrieve the error message.
*/
#define ACL_CHECK_GEN(stmt, success, error_fn) \
do { \
int err_code = (stmt); \
if (err_code != (success)) { \
ggml_cann_error(#stmt, __func__, __FILE__, __LINE__, error_fn()); \
} \
} while (0);
#define ACL_CHECK(stmt) ACL_CHECK_GEN(stmt, 0, aclGetRecentErrMsg)
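// Usage sketch (illustrative only): wrap any ACL runtime call whose success
// code is 0, e.g.
//   ACL_CHECK(aclrtSetDevice(device));
//   ACL_CHECK(aclrtSynchronizeStream(stream));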
/**
* @brief Contains information about CANN devices.
*/
struct ggml_cann_device_info {
/**
* @brief Number of CANN devices available.
*/
int32_t device_count;
/**
* @brief Information about a single CANN device.
*/
struct cann_device_info {
int cc; /**< Compute capability. */
size_t smpb; /**< Maximum shared memory per block. */
bool vmm; /**< Virtual memory support. */
size_t vmm_granularity; /**< Granularity of virtual memory. */
size_t total_vram; /**< Total video RAM available on the device. */
};
cann_device_info devices[GGML_CANN_MAX_DEVICES] =
{}; /**< Array of CANN device information. */
};
const ggml_cann_device_info& ggml_cann_info();
void ggml_cann_set_device(int32_t device);
int32_t ggml_cann_get_device();
/**
* @brief Abstract base class for memory pools used by CANN.
*/
struct ggml_cann_pool {
/**
* @brief Virtual destructor for the memory pool.
*/
virtual ~ggml_cann_pool() = default;
/**
* @brief Allocates memory from the pool.
*
* @param size The size of the memory block to allocate.
* @param actual_size Pointer to a variable where the actual allocated size
* will be stored.
* @return Pointer to the allocated memory block.
*/
virtual void* alloc(size_t size, size_t* actual_size) = 0;
/**
* @brief Frees a previously allocated memory block.
*
* @param ptr Pointer to the memory block to free.
* @param size Size of the memory block to free.
 * @note Note that all CANN operators run asynchronously. Make sure the memory
 * is still available until the operator that uses it has finished.
*/
virtual void free(void* ptr, size_t size) = 0;
};
/**
* @brief RAII wrapper for managing memory allocations from a CANN memory pool.
*/
struct ggml_cann_pool_alloc {
ggml_cann_pool* pool = nullptr; /**< Pointer to the memory pool. */
void* ptr = nullptr; /**< Pointer to the allocated memory block. */
size_t actual_size = 0; /**< Actual size of the allocated memory block. */
/**
* @brief Default constructor.
*/
ggml_cann_pool_alloc() = default;
/**
* @brief Constructor that initializes the memory pool.
* @param pool Reference to the memory pool.
*/
explicit ggml_cann_pool_alloc(ggml_cann_pool& pool) : pool(&pool) {}
/**
* @brief Constructor that initializes the memory pool and allocates memory.
* @param pool Reference to the memory pool.
* @param size Size of the memory block to allocate.
*/
ggml_cann_pool_alloc(ggml_cann_pool& pool, size_t size) : pool(&pool) {
alloc(size);
}
/**
* @brief Destructor that frees the allocated memory block.
*/
~ggml_cann_pool_alloc() {
if (ptr != nullptr) {
pool->free(ptr, actual_size);
}
}
/**
* @brief Allocates memory from the pool.
* @param size Size of the memory block to allocate.
* @return Pointer to the allocated memory block.
*/
void* alloc(size_t size) {
GGML_ASSERT(pool != nullptr);
GGML_ASSERT(ptr == nullptr);
ptr = pool->alloc(size, &this->actual_size);
return ptr;
}
/**
* @brief Allocates memory from a specific memory pool.
* @param pool Reference to the memory pool.
* @param size Size of the memory block to allocate.
* @return Pointer to the allocated memory block.
*/
void* alloc(ggml_cann_pool& pool, size_t size) {
this->pool = &pool;
return alloc(size);
}
/**
* @brief Gets the pointer to the allocated memory block.
* @return Pointer to the allocated memory block.
*/
void* get() { return ptr; }
// Deleted copy constructor
ggml_cann_pool_alloc(const ggml_cann_pool_alloc&) = delete;
// Deleted move constructor
ggml_cann_pool_alloc(ggml_cann_pool_alloc&&) = delete;
// Deleted copy assignment operator
ggml_cann_pool_alloc& operator=(const ggml_cann_pool_alloc&) = delete;
// Deleted move assignment operator
ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
};
/**
* @brief Context for managing CANN backend operations.
*/
struct ggml_backend_cann_context {
int32_t device; /**< Device ID. */
std::string name; /**< Name of the device. */
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
{nullptr}}; /**< Array of streams for the device. */
/**
* @brief Constructor for initializing the context with a given device.
* @param device Device ID.
*/
explicit ggml_backend_cann_context(int device)
: device(device), name("CANN" + std::to_string(device)) {}
/**
* @brief Destructor for cleaning up resources.
*/
~ggml_backend_cann_context() {
if (copy_event != nullptr) {
ACL_CHECK(aclrtDestroyEvent(copy_event));
}
for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) {
if (streams[i] != nullptr) {
ACL_CHECK(aclrtDestroyStream(streams[i]));
}
}
}
/**
* @brief Get or create a stream for a given index.
* @param stream Index of the stream.
* @return The stream corresponding to the given index.
*/
aclrtStream stream(int stream) {
if (streams[stream] == nullptr) {
ggml_cann_set_device(device);
ACL_CHECK(aclrtCreateStream(&streams[stream]));
}
return streams[stream];
}
/**
* @brief Get or create the default stream (index 0).
* @return The default stream.
*/
aclrtStream stream() { return stream(0); }
// TODO: each stream should have a memory pool.
std::unique_ptr<ggml_cann_pool>
mem_pool; /**< Memory pool for the device. */
/**
* @brief Create a new memory pool for a given device.
* @param device Device ID.
* @return A unique pointer to the new memory pool.
*/
static std::unique_ptr<ggml_cann_pool> new_pool_for_device(int device);
/**
* @brief Get or create the memory pool for the context.
* @return Reference to the memory pool.
*/
ggml_cann_pool& pool() {
if (mem_pool == nullptr) {
mem_pool = new_pool_for_device(device);
}
return *mem_pool;
}
};
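/*
 * Usage sketch (illustrative only): a context is created per device; streams
 * and the memory pool are created lazily on first use.
 *
 * \code
 * ggml_backend_cann_context ctx(0);            // device 0
 * aclrtStream s = ctx.stream();                // creates the default stream on demand
 * ggml_cann_pool_alloc buf(ctx.pool(), 1024);  // allocates from the lazily created pool
 * \endcode
 */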
#endif // CANN_COMMON_H

Some files were not shown because too many files have changed in this diff