Merge commit '978ba3d8' into tokenizer-codepoint-categs

Commit 8bd3749c26: 152 changed files with 9036 additions and 11397 deletions.
@@ -14,7 +14,9 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
 echo "GGML_SYCL_F16 is set" && \
 export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
 fi && \
-cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+echo "Building with static libs" && \
+cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
+${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
 cmake --build build --config Release --target llama-cli

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
@@ -14,6 +14,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
 echo "GGML_SYCL_F16 is set" && \
 export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
 fi && \
+echo "Building with dynamic libs" && \
 cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
 cmake --build build --config Release --target llama-server
@@ -10,7 +10,6 @@
 "llama-embedding"
 "llama-server"
 "llama-quantize"
-"llama-train-text-from-scratch"
 ];
 mkApp = name: {
 type = "app";
@@ -126,16 +126,9 @@ let
 ++ optionals useMetalKit [ MetalKit ];

 cudaBuildInputs = with cudaPackages; [
-cuda_cccl.dev # <nv/target>
-# A temporary hack for reducing the closure size, remove once cudaPackages
-# have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
-cuda_cudart.dev
-cuda_cudart.lib
-cuda_cudart.static
-libcublas.dev
-libcublas.lib
-libcublas.static
+cuda_cudart
+cuda_cccl # <nv/target>
+libcublas
 ];

 rocmBuildInputs = with rocmPackages; [
@@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
 ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
 ./llama-cli "$@"
-elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-./llama-finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
 echo "Converting PTH to GGML..."
 for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -36,8 +34,6 @@ else
 echo " ex: --outtype f16 \"/models/7B/\" "
 echo " --quantize (-q): Optimize with quantization process ggml"
 echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-echo " --finetune (-f): Run finetune command to create a lora finetune of the model"
-echo " See documentation for finetune for command-line parameters"
 echo " --all-in-one (-a): Execute --convert & --quantize"
 echo " ex: \"/models/\" 7B"
 echo " --server (-s): Run a model on the server"
.github/workflows/build.yml (vendored, 1 change)

@@ -860,6 +860,7 @@ jobs:
 mkdir build
 cd build
 cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
+cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
 cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

 - name: Determine tag name
.gitignore (vendored, 1 change)

@@ -50,6 +50,7 @@ build*
 !docs/build.md
 /libllama.so
 /llama-*
+/vulkan-shaders-gen
 android-ndk-*
 arm_neon.h
 cmake-build-*
@@ -139,7 +139,8 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
 # determining _precisely_ which defines are necessary for the llama-config
 # package.
 #
-get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS)
+get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
+get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
 get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
 set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
@@ -1,12 +1,17 @@
-# Pull requests
+# Pull requests (for contributors)

-- Always squash-merge the PR before merging
-- Use the following format for your final commit: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
 - Test your changes:
 - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
 - Execute [the full CI locally on your machine](ci/README.md) before publishing
 - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
-- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience
+- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
+- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
+
+# Pull requests (for collaborators)
+
+- Squash-merge PRs
+- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
+- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules

 # Coding guidelines
Makefile (156 changes)

@@ -11,7 +11,6 @@ BUILD_TARGETS = \
 llama-embedding \
 llama-eval-callback \
 llama-export-lora \
-llama-finetune \
 llama-gbnf-validator \
 llama-gguf \
 llama-gguf-hash \
@@ -37,7 +36,6 @@ BUILD_TARGETS = \
 llama-simple \
 llama-speculative \
 llama-tokenize \
-llama-train-text-from-scratch \
 llama-vdot \
 llama-cvector-generator \
 tests/test-c.o
@@ -64,13 +62,13 @@ TEST_TARGETS = \
 tests/test-tokenizer-1-spm

 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
-LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
 simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
-retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
+retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm

 # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
 # We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
-LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server

 # Deprecation aliases
 ifdef LLAMA_CUBLAS
@@ -327,9 +325,9 @@ ifdef LLAMA_DEBUG
 endif
 else
 MK_CPPFLAGS += -DNDEBUG
-MK_CFLAGS += -O3
-MK_CXXFLAGS += -O3
-MK_NVCCFLAGS += -O3
+MK_CFLAGS += -O3 -g
+MK_CXXFLAGS += -O3 -g
+MK_NVCCFLAGS += -O3 -g
 endif

 ifdef LLAMA_SANITIZE_THREAD
@@ -530,10 +528,21 @@ ifndef GGML_NO_ACCELERATE
 endif
 endif # GGML_NO_ACCELERATE

+ifdef GGML_MUSA
+CC := clang
+CXX := clang++
+GGML_CUDA := 1
+MK_CPPFLAGS += -DGGML_USE_MUSA
+endif
+
 ifndef GGML_NO_OPENMP
 MK_CPPFLAGS += -DGGML_USE_OPENMP
 MK_CFLAGS += -fopenmp
 MK_CXXFLAGS += -fopenmp
+ifdef GGML_MUSA
+MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
+MK_LDFLAGS += -L/usr/lib/llvm-10/lib
+endif # GGML_MUSA
 endif # GGML_NO_OPENMP

 ifdef GGML_OPENBLAS
@@ -584,15 +593,27 @@ else
 endif # GGML_CUDA_FA_ALL_QUANTS

 ifdef GGML_CUDA
-ifneq ('', '$(wildcard /opt/cuda)')
-CUDA_PATH ?= /opt/cuda
-else
-CUDA_PATH ?= /usr/local/cuda
-endif
+ifdef GGML_MUSA
+ifneq ('', '$(wildcard /opt/musa)')
+CUDA_PATH ?= /opt/musa
+else
+CUDA_PATH ?= /usr/local/musa
+endif

-MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
-MK_NVCCFLAGS += -use_fast_math
+MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
+MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
+MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
+else
+ifneq ('', '$(wildcard /opt/cuda)')
+CUDA_PATH ?= /opt/cuda
+else
+CUDA_PATH ?= /usr/local/cuda
+endif
+
+MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+MK_NVCCFLAGS += -use_fast_math
+endif # GGML_MUSA

 OBJ_GGML += ggml/src/ggml-cuda.o
 OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
@@ -602,9 +623,11 @@ ifdef LLAMA_FATAL_WARNINGS
 MK_NVCCFLAGS += -Werror all-warnings
 endif # LLAMA_FATAL_WARNINGS

+ifndef GGML_MUSA
 ifndef JETSON_EOL_MODULE_DETECT
 MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
+endif # GGML_MUSA

 ifdef LLAMA_DEBUG
 MK_NVCCFLAGS += -lineinfo
@@ -617,8 +640,12 @@ endif # GGML_CUDA_DEBUG
 ifdef GGML_CUDA_NVCC
 NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
 else
-NVCC = $(CCACHE) nvcc
-endif #GGML_CUDA_NVCC
+ifdef GGML_MUSA
+NVCC = $(CCACHE) mcc
+else
+NVCC = $(CCACHE) nvcc
+endif # GGML_MUSA
+endif # GGML_CUDA_NVCC

 ifdef CUDA_DOCKER_ARCH
 MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
@@ -689,9 +716,15 @@ define NVCC_COMPILE
 $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
 else
+ifdef GGML_MUSA
+define NVCC_COMPILE
+$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
+endef # NVCC_COMPILE
+else
 define NVCC_COMPILE
 $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
+endif # GGML_MUSA
 endif # JETSON_EOL_MODULE_DETECT

 ggml/src/ggml-cuda/%.o: \
@@ -876,6 +909,9 @@ OBJ_GGML += \

 OBJ_LLAMA = \
 src/llama.o \
+src/llama-vocab.o \
+src/llama-grammar.o \
+src/llama-sampling.o \
 src/unicode.o \
 src/unicode-data.o
@@ -943,6 +979,7 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef GGML_CUDA
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifndef GGML_MUSA
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)

 ifndef CUDA_DOCKER_ARCH
@@ -952,6 +989,7 @@ endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH

 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
+endif # GGML_MUSA
 endif # GGML_CUDA
 $(info )
@@ -1055,6 +1093,10 @@ src/unicode-data.o: \

 src/llama.o: \
 src/llama.cpp \
+src/llama-impl.h \
+src/llama-vocab.h \
+src/llama-grammar.h \
+src/llama-sampling.h \
 src/unicode.h \
 include/llama.h \
 ggml/include/ggml-cuda.h \
@@ -1064,6 +1106,29 @@ src/llama.o: \
 ggml/include/ggml-backend.h
 $(CXX) $(CXXFLAGS) -c $< -o $@

+src/llama-vocab.o: \
+src/llama-vocab.cpp \
+src/llama-vocab.h \
+src/llama-impl.h \
+include/llama.h
+$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/llama-grammar.o: \
+src/llama-grammar.cpp \
+src/llama-grammar.h \
+src/llama-impl.h \
+src/llama-vocab.h \
+src/llama-sampling.h \
+include/llama.h
+$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/llama-sampling.o: \
+src/llama-sampling.cpp \
+src/llama-sampling.h \
+src/llama-impl.h \
+include/llama.h
+$(CXX) $(CXXFLAGS) -c $< -o $@
+
 $(LIB_LLAMA): \
 $(OBJ_LLAMA) \
 $(LIB_GGML)
@@ -1266,11 +1331,6 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \
-$(OBJ_ALL)
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
 $(OBJ_GGML) $(OBJ_LLAMA)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1286,13 +1346,8 @@ llama-baby-llama: examples/baby-llama/baby-llama.cpp \
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-llama-finetune: examples/finetune/finetune.cpp \
-$(OBJ_ALL)
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 llama-export-lora: examples/export-lora/export-lora.cpp \
-$(OBJ_GGML) common/log.h
+$(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1439,7 +1494,7 @@ run-benchmark-matmult: llama-benchmark-matmult
 .PHONY: run-benchmark-matmult swift

 tests/test-llama-grammar: tests/test-llama-grammar.cpp \
-$(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o
+$(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1548,56 +1603,45 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
 #
 # Mark legacy binary targets as .PHONY so that they are always checked.
-.PHONY: main quantize perplexity embedding server finetune
+.PHONY: main quantize perplexity embedding server

+# Define the object file target
+examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
+$(CXX) $(CXXFLAGS) -c $< -o $@
+
 # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
 # Eventually we will want to remove these target from building all the time.
-main: examples/deprecation-warning/deprecation-warning.cpp
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+main: examples/deprecation-warning/deprecation-warning.o
+$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 @echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."

-server: examples/deprecation-warning/deprecation-warning.cpp
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+server: examples/deprecation-warning/deprecation-warning.o
+$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 @echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."

-quantize: examples/deprecation-warning/deprecation-warning.cpp
+quantize: examples/deprecation-warning/deprecation-warning.o
 ifneq (,$(wildcard quantize))
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 @echo "#########"
 @echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
 @echo " Remove the 'quantize' binary to remove this warning."
 @echo "#########"
 endif

-perplexity: examples/deprecation-warning/deprecation-warning.cpp
+perplexity: examples/deprecation-warning/deprecation-warning.o
 ifneq (,$(wildcard perplexity))
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 @echo "#########"
 @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
 @echo " Remove the 'perplexity' binary to remove this warning."
 @echo "#########"
 endif

-embedding: examples/deprecation-warning/deprecation-warning.cpp
+embedding: examples/deprecation-warning/deprecation-warning.o
 ifneq (,$(wildcard embedding))
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 @echo "#########"
 @echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
 @echo " Remove the 'embedding' binary to remove this warning."
 @echo "#########"
 endif
-
-finetune: examples/deprecation-warning/deprecation-warning.cpp
-ifneq (,$(wildcard finetune))
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-@echo "#########"
-@echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
-@echo " Remove the 'finetune' binary to remove this warning."
-@echo "#########"
-endif
@@ -4,6 +4,9 @@ import PackageDescription

 var sources = [
 "src/llama.cpp",
+"src/llama-vocab.cpp",
+"src/llama-grammar.cpp",
+"src/llama-sampling.cpp",
 "src/unicode.cpp",
 "src/unicode-data.cpp",
 "ggml/src/ggml.c",
@@ -138,6 +138,7 @@ Typically finetunes of the base models below are supported as well.

 Unless otherwise noted these projects are open-source with permissive licensing:

+- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT)
 - [iohub/collama](https://github.com/iohub/coLLaMA)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
 - [nat/openplayground](https://github.com/nat/openplayground)
@@ -181,6 +182,9 @@ Unless otherwise noted these projects are open-source with permissive licensing:

 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp

+**Games:**
+- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
+
 ## Demo

 <details>
@@ -405,6 +409,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [BLAS](./docs/build.md#blas-build) | All |
 | [BLIS](./docs/backend/BLIS.md) | All |
 | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
 | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](./docs/build.md#vulkan) | GPU |
@@ -694,11 +694,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
 return true;
 }
-if (arg == "--lora-base") {
-CHECK_ARG
-params.lora_base = argv[i];
-return true;
-}
 if (arg == "--control-vector") {
 CHECK_ARG
 params.control_vectors.push_back({ 1.0f, argv[i], });
@@ -1274,6 +1269,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 CHECK_ARG
 params.out_file = argv[i];
 params.cvector_outfile = argv[i];
+params.lora_outfile = argv[i];
 return true;
 }
 if (arg == "-ofreq" || arg == "--output-frequency") {
@@ -1328,6 +1324,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 else { invalid_param = true; }
 return true;
 }
+if (arg == "--no-warmup") {
+params.warmup = false;
+return true;
+}
 #ifndef LOG_DISABLE_LOGS
 // Parse args for logging parameters
 if (log_param_single_parse(argv[i])) {
@@ -1450,6 +1450,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
 options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
 options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
 options.push_back({ "server infill",
 " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });

@@ -1583,9 +1584,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
 "advanced option to override model metadata by key. may be specified multiple times.\n"
 "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
-options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
-options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
-options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
+options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" });
+options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
 options.push_back({ "*", " --control-vector FNAME", "add a control vector\n"
 "note: this argument can be repeated to add multiple control vectors" });
 options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
@@ -1634,7 +1634,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
 options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
 options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
-options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
 options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
 options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
 options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
@@ -1676,6 +1676,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations });
 options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" });

+options.push_back({ "export-lora" });
+options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
+options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
+options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
+options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
+options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
+
 printf("usage: %s [options]\n", argv[0]);

 for (const auto & o : options) {
@@ -2721,7 +2728,7 @@ std::string llama_chat_format_single(const struct llama_model * model,
 const llama_chat_msg & new_msg,
 bool add_ass) {
 std::ostringstream ss;
-auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
+auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
 std::vector<llama_chat_msg> chat_new(past_msg);
 // if the past_msg ends with a newline, we must preserve it in the formatted version
 if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -3166,7 +3173,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
 }
 fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
 }
-fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
 fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
 fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
 fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
@@ -128,7 +128,6 @@ struct gpt_params {

 // TODO: avoid tuple, use struct
 std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
-std::string lora_base = ""; // base model path for the lora adapter

 std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

@@ -255,6 +254,8 @@ struct gpt_params {
 std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

 bool spm_infill = false; // suffix/prefix/middle pattern for infill
+
+std::string lora_outfile = "ggml-lora-merged-f16.gguf";
 };

 void gpt_params_handle_hf_token(gpt_params & params);
@@ -330,7 +330,7 @@ static llama_token llama_sampling_sample_impl(
 llama_token_data_array single_token_data_array = { &single_token_data, 1, false };

 // Apply grammar constraints to the single token
-llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
+llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);

 // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
 bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
@@ -421,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl(

 // apply grammar checks before sampling logic
 if (apply_grammar && ctx_sampling->grammar != NULL) {
-llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
+llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
 }

 return cur_p;
@@ -455,6 +455,6 @@ void llama_sampling_accept(
 ctx_sampling->prev.push_back(id);

 if (ctx_sampling->grammar != NULL && apply_grammar) {
-llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
+llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
 }
 }
@@ -48,7 +48,7 @@ class Model:

 dir_model: Path
 ftype: gguf.LlamaFileType
-fname_out: Path | None
+fname_out: Path
 is_big_endian: bool
 endianess: gguf.GGUFEndian
 use_temp_file: bool
@@ -62,11 +62,12 @@ class Model:
 gguf_writer: gguf.GGUFWriter
 model_name: str | None
 metadata_override: Path | None
+dir_model_card: Path

 # subclasses should define this!
 model_arch: gguf.MODEL_ARCH

-def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path | None, is_big_endian: bool = False,
+def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
 use_temp_file: bool = False, eager: bool = False,
 metadata_override: Path | None = None, model_name: str | None = None,
 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
@@ -90,6 +91,7 @@ class Model:
 self.tensor_names = None
 self.metadata_override = metadata_override
 self.model_name = model_name
+self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py

 # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
 if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -237,6 +239,10 @@ class Model:
 self.gguf_writer.add_expert_used_count(n_experts_used)
 logger.info(f"gguf: experts used count = {n_experts_used}")

+if (head_dim := self.hparams.get("head_dim")) is not None:
+self.gguf_writer.add_key_length(head_dim)
+self.gguf_writer.add_value_length(head_dim)
+
 self.gguf_writer.add_file_type(self.ftype)
 logger.info(f"gguf: file type = {self.ftype}")
@@ -310,7 +316,7 @@ class Model:
 if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
 if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
 data = gguf.quantize_bf16(data)
-assert data.dtype == np.int16
+assert data.dtype == np.uint16
 data_qtype = gguf.GGMLQuantizationType.BF16

 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
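Aside: the corrected dtype above follows from how bf16 is stored. A bf16 value is the upper 16 bits of an IEEE 754 float32 bit pattern, so a packed bf16 buffer is naturally a buffer of unsigned 16-bit words. The sketch below is a minimal truncating illustration only; the actual `gguf.quantize_bf16` implementation may round rather than truncate:

```python
import numpy as np

# bf16 keeps the sign, exponent, and top 7 mantissa bits of an f32,
# i.e. the upper 16 bits of the 32-bit pattern, stored as uint16.
x = np.array([1.0, -2.5, 3.14159], dtype=np.float32)
bf16 = (x.view(np.uint32) >> 16).astype(np.uint16)  # truncate, no rounding
assert bf16.dtype == np.uint16
```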
@@ -345,7 +351,7 @@ class Model:

 total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count()

-self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model, self.model_name, total_params)
+self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)

 # Fallback to model directory name if metadata name is still missing
 if self.metadata.name is None:
@@ -359,27 +365,22 @@ class Model:
 output_type: str = self.ftype.name.partition("_")[2]

 # Filename Output
-# Note: `not is_dir()` is used because `.is_file()` will not detect
-# file template strings as it doesn't actually exist as a file
-if self.fname_out is not None and not self.fname_out.is_dir():
-# Output path is a custom defined templated filename
-
-# Process templated file name with the output ftype, useful with the "auto" ftype
-self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
-else:
+if self.fname_out.is_dir():
 # Generate default filename based on model specification and available metadata
 if not vocab_only:
 fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None)
 else:
 fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab")

-# Check if preferred output directory path was provided
-if self.fname_out is not None and self.fname_out.is_dir():
-# output path is a directory
-self.fname_out = self.fname_out / f"{fname_default}.gguf"
-else:
-# output in the same directory as the model by default
-self.fname_out = self.dir_model / f"{fname_default}.gguf"
+# Use the default filename
+self.fname_out = self.fname_out / f"{fname_default}.gguf"
+else:
+# Output path is a custom defined templated filename
+# Note: `not is_dir()` is used because `.is_file()` will not detect
+# file template strings as it doesn't actually exist as a file
+
+# Process templated file name with the output ftype, useful with the "auto" ftype
+self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)

 self.set_type()
@@ -593,6 +594,15 @@ class Model:
 if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
 # ref: https://huggingface.co/core42/jais-13b
 res = "jais"
+if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
+# ref: https://huggingface.co/WisdomShell/CodeShell-7B
+res = "codeshell"
+if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e":
+# ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407
+res = "tekken"
+if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
+# ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
+res = "smollm"

 if res is None:
 logger.warning("\n")
@@ -733,7 +743,7 @@ class Model:
 added_tokens_json = json.load(f)
 for key in added_tokens_json:
 token_id = added_tokens_json[key]
-if (token_id >= vocab_size):
+if token_id >= vocab_size:
 logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
 continue

@@ -750,7 +760,8 @@ class Model:
 token_id = int(token_id)
 token: str = token_data["content"]
 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-assert tokens[token_id] == token.encode("utf-8")
+if tokens[token_id] != token.encode("utf-8"):
+logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
 if token_data.get("special") or self.does_token_look_special(token):
 toktypes[token_id] = SentencePieceTokenTypes.CONTROL
 else:
@@ -1309,6 +1320,7 @@ class RefactModel(Model):
 special_vocab._set_special_token("prefix", 1)
 special_vocab._set_special_token("suffix", 3)
 special_vocab._set_special_token("middle", 2)
+special_vocab.chat_template = None # do not add it twice
 special_vocab.add_to_gguf(self.gguf_writer)

 def set_gguf_parameters(self):
@@ -1479,7 +1491,12 @@ class LlamaModel(Model):
 super().set_gguf_parameters()
 hparams = self.hparams
 self.gguf_writer.add_vocab_size(hparams["vocab_size"])
-self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+if "head_dim" in hparams:
+rope_dim = hparams["head_dim"]
+else:
+rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+self.gguf_writer.add_rope_dimension_count(rope_dim)

 if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
 if self.hparams["rope_scaling"].get("type") == "linear":
@@ -1553,6 +1570,34 @@ class LlamaModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
     def prepare_tensors(self):
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+
         super().prepare_tensors()
 
         if self._experts is not None:
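The rope-factor loop added above implements the Llama-3.1-style frequency smoothing. As a compact restatement of that Python (no symbols beyond the hparams the code reads; wavelength is λ = 2π/f), the value appended to `rope_factors` for each RoPE frequency is:

```latex
% lambda_low  = old_context_len / low_freq_factor
% lambda_high = old_context_len / high_freq_factor
r(\lambda) =
\begin{cases}
1 & \lambda < \lambda_{\text{high}} \\
\text{factor} & \lambda > \lambda_{\text{low}} \\
\left(\dfrac{1 - \mu}{\text{factor}} + \mu\right)^{-1} & \text{otherwise}
\end{cases}
\qquad
\mu = \frac{\text{old\_context\_len}/\lambda - \text{low\_freq\_factor}}
           {\text{high\_freq\_factor} - \text{low\_freq\_factor}}
```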
@@ -1994,7 +2039,7 @@ class Phi3MiniModel(Model):
 
             for key in added_tokens_json:
                 token_id = added_tokens_json[key]
-                if (token_id >= vocab_size):
+                if token_id >= vocab_size:
                     logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                     continue
 
@@ -2011,7 +2056,8 @@ class Phi3MiniModel(Model):
                 token_id = int(token_id)
                 token = foken_data["content"].encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert tokens[token_id] == token
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2027,7 +2073,8 @@ class Phi3MiniModel(Model):
                 token_id = int(foken_data["id"])
                 token = foken_data["content"].encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert tokens[token_id] == token
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2065,10 +2112,11 @@ class Phi3MiniModel(Model):
         self.gguf_writer.add_rope_dimension_count(rope_dims)
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
         self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"]))
 
         # write rope scaling for long context (128k) model
         rope_scaling = self.find_hparam(['rope_scaling'], True)
-        if (rope_scaling is None):
+        if rope_scaling is None:
             return
 
         scale = max_pos_embds / orig_max_pos_embds
@@ -2266,7 +2314,8 @@ class InternLM2Model(Model):
                     chat_eos_token_id = token_id
                 token = token.encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert(tokens[token_id] == token)
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2285,7 +2334,8 @@ class InternLM2Model(Model):
                     chat_eos_token_id = token_id
                 token = token.encode("utf-8")
                 if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                    assert(tokens[token_id] == token)
+                    if tokens[token_id] != token:
+                        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                 tokens[token_id] = token
                 scores[token_id] = -1000.0
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2471,6 +2521,7 @@ class GemmaModel(Model):
         special_vocab._set_special_token("middle", 68)
         special_vocab._set_special_token("fsep", 70)
         special_vocab._set_special_token("eot", 107)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)
 
         self.gguf_writer.add_add_space_prefix(False)
@@ -2712,7 +2763,7 @@ class JinaBertV2Model(BertModel):
 
             yield name, data
 
-    def set_vocab(self, *args, **kwargs):
+    def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
             tokenizer_class = json.load(f)['tokenizer_class']
@@ -2860,7 +2911,7 @@ class ArcticModel(Model):
             added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
             for token_id, token_json in added_tokens_decoder.items():
                 token_id = int(token_id)
-                if (token_id >= vocab_size):
+                if token_id >= vocab_size:
                     logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                     continue
 
@@ -3109,7 +3160,7 @@ class T5Model(Model):
             added_tokens_json = json.load(f)
             for key in added_tokens_json:
                 token_id = added_tokens_json[key]
-                if (token_id >= vocab_size):
+                if token_id >= vocab_size:
                     logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
                     continue
 
@@ -3624,10 +3675,10 @@ def main() -> None:
         logger.error("Error: Cannot use temp file when splitting")
         sys.exit(1)
 
-    fname_out = None
-
     if args.outfile is not None:
         fname_out = args.outfile
+    else:
+        fname_out = dir_model
 
     logger.info(f"Loading model: {dir_model.name}")
 
@@ -3658,7 +3709,6 @@ def main() -> None:
     else:
         logger.info("Exporting model...")
        model_instance.write()
-        assert model_instance.fname_out is not None
         out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
         logger.info(f"Model successfully exported to {out_path}")
 
@@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum):
 
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
-chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
 
 if len(sys.argv) == 2:
     token = sys.argv[1]
@@ -91,6 +91,9 @@ models = [
     {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
     {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
     {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
+    {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
+    {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
+    {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
 ]
 
 
@@ -99,8 +102,8 @@ def download_file_with_auth(url, token, save_path):
     response = sess.get(url, headers=headers)
     response.raise_for_status()
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as f:
-        f.write(response.content)
+    with open(save_path, 'wb') as downloaded_file:
+        downloaded_file.write(response.content)
     logger.info(f"File {save_path} downloaded successfully")
 
 
@@ -159,7 +162,7 @@ for model in models:
         logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
         continue  # Skip to the next model if the tokenizer can't be loaded
 
-    chktok = tokenizer.encode(chktxt)
+    chktok = tokenizer.encode(CHK_TXT)
     chkhsh = sha256(str(chktok).encode()).hexdigest()
 
     logger.info(f"model: {name}")
@@ -191,7 +194,7 @@ src_func = f"""
 # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
 # use in llama.cpp to implement the same pre-tokenizer
 
-chktxt = {repr(chktxt)}
+chktxt = {repr(CHK_TXT)}
 
 chktok = tokenizer.encode(chktxt)
 chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -287,7 +290,7 @@ tests = [
     "333333333",
     "Cửa Việt", # llama-bpe fails on this
     " discards",
-    chktxt,
+    CHK_TXT,
 ]
 
 # write the tests to ./models/ggml-vocab-{name}.gguf.inp
@@ -132,6 +132,10 @@ class Tensor:
 
 
 class GGMLModel:
+
+    file_format: GGMLFormat
+    format_version: int
+
     def __init__(self):
         self.hyperparameters = None
         self.vocab = None
@@ -290,7 +294,7 @@ class GGMLToGGUF:
         if self.vocab_override is not None:
             vo = self.vocab_override
             logger.info('* Adding vocab item(s)')
-            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+            for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                 tokens.append(vbytes)
                 scores.append(score)
                 toktypes.append(ttype)
@@ -290,7 +290,7 @@ if __name__ == '__main__':
         fname_out = args.outfile
     else:
         # output in the same directory as the model by default
-        fname_out = dir_lora / 'ggml-lora-{ftype}.gguf'
+        fname_out = dir_lora
 
     if os.path.exists(input_model):
         # lazy import load_file only if lora is in safetensors format.
@@ -304,12 +304,6 @@ if __name__ == '__main__':
     # load base model
     logger.info(f"Loading base model: {dir_base_model.name}")
     hparams = Model.load_hparams(dir_base_model)
-
-    with open(lora_config, "r") as f:
-        lparams: dict[str, Any] = json.load(f)
-
-    alpha: float = lparams["lora_alpha"]
-
     with torch.inference_mode():
         try:
             model_class = Model.from_model_architecture(hparams["architectures"][0])
@@ -320,12 +314,21 @@ if __name__ == '__main__':
         class LoraModel(model_class):
             model_arch = model_class.model_arch
 
+            lora_alpha: float
+
+            def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs):
+
+                super().__init__(*args, **kwargs)
+
+                self.dir_model_card = dir_lora_model
+                self.lora_alpha = float(lora_alpha)
+
             def set_type(self):
                 self.gguf_writer.add_type(gguf.GGUFType.ADAPTER)
                 self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
 
             def set_gguf_parameters(self):
-                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha))
+                self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
                 super().set_gguf_parameters()
 
             def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
@@ -368,6 +371,11 @@ if __name__ == '__main__':
                     yield (dest_name + ".lora_a", lora_a)
                     yield (dest_name + ".lora_b", lora_b)
 
+        with open(lora_config, "r") as f:
+            lparams: dict[str, Any] = json.load(f)
+
+        alpha: float = lparams["lora_alpha"]
+
         model_instance = LoraModel(
             dir_base_model,
             ftype,
@@ -376,6 +384,8 @@ if __name__ == '__main__':
             use_temp_file=False,
             eager=args.no_lazy,
             dry_run=args.dry_run,
+            dir_lora_model=dir_lora,
+            lora_alpha=alpha,
         )
 
         logger.info("Exporting model...")
@@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
 ```sh
 ./build/bin/llama-ls-sycl-device
 ```
-A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
+This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
 ```
-found 6 SYCL devices:
+found 2 SYCL devices:
 
 | | | |Compute |Max compute|Max work|Max sub| |
 |ID| Device Type| Name|capability|units |group |group |Global mem size|
 |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
 | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
 | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
-| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
-| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
-| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
-| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
 ```
 
-| Attribute | Note |
-|------------------------|-------------------------------------------------------------|
-| compute capability 1.3 | Level-zero driver/runtime, recommended |
-| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
 
 4. Launch inference
 
 There are two device selection modes:
 
 - Single device: Use one device target specified by the user.
-- Multiple devices: Automatically select the devices with the same largest Max compute-units.
+- Multiple devices: Automatically choose the devices with the same backend.
 
+In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
+
 | Device selection | Parameter |
 |------------------|----------------------------------------|
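The added text above points to `ONEAPI_DEVICE_SELECTOR` for choosing a non-default SYCL backend or a single device. A minimal illustration of what that can look like (the model path, prompt and `-ngl` value below are placeholders, not taken from this diff):

```bash
# list devices exposed by the OpenCL backend instead of the default level_zero
ONEAPI_DEVICE_SELECTOR="opencl:*" ./build/bin/llama-ls-sycl-device

# run on a single Level Zero GPU (device 0); model path and flags are placeholders
ONEAPI_DEVICE_SELECTOR="level_zero:0" ./build/bin/llama-cli \
    -m models/llama-2-7b.Q4_0.gguf -p "Hello" -ngl 33
```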
@@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
 build\bin\ls-sycl-device.exe
 ```
 
-The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
+This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
 ```
-found 6 SYCL devices:
+found 2 SYCL devices:
 | | | |Compute |Max compute|Max work|Max sub| |
 |ID| Device Type| Name|capability|units |group |group |Global mem size|
 |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
 | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
 | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
-| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
-| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
-| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
-| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
-
 ```
 
-| Attribute | Note |
-|------------------------|-----------------------------------------------------------|
-| compute capability 1.3 | Level-zero running time, recommended |
-| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
-
 
 4. Launch inference
 
 There are two device selection modes:
 
-- Single device: Use one device assigned by user.
-- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
+- Single device: Use one device assigned by user. Default device id is 0.
+- Multiple devices: Automatically choose the devices with the same backend.
 
+In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
+
 | Device selection | Parameter |
 |------------------|----------------------------------------|
@@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options.
     make
     ```
 
-  - On Windows:
+  - On Windows (x86/x64 only, arm64 requires cmake):
 
     1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
     2. Extract `w64devkit` on your pc.
@@ -60,6 +60,17 @@ In order to build llama.cpp you have four different options.
     cmake -B build -G "Xcode"
     cmake --build build --config Debug
     ```
+- Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers:
+  - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...):
+    - Tab Workload: Desktop-development with C++
+    - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang)
+  - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test
+  - For Windows on ARM (arm64, WoA) build with:
+    ```bash
+    cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
+    cmake --build build-arm64-windows-llvm-release
+    ```
+  Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
 
 - Using `gmake` (FreeBSD):
 
@@ -167,7 +178,11 @@ For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](ht
     cmake --build build --config Release
     ```
 
-The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
+The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
+
+The following compilation options are also available to tweak performance:
 
 | Option | Legal values | Default | Description |
 |-------------------------------|------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------|
@@ -181,6 +196,19 @@ The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/c
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
 | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
 
+### MUSA
+
+- Using `make`:
+  ```bash
+  make GGML_MUSA=1
+  ```
+- Using `CMake`:
+
+  ```bash
+  cmake -B build -DGGML_MUSA=ON
+  cmake --build build --config Release
+  ```
+
 ### hipBLAS
 
 This provides BLAS acceleration on HIP-supported AMD GPUs.
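Putting the two environment variables documented above together, a typical Linux invocation could look like the following sketch (the model path and `-ngl` value are placeholders, not part of the diff):

```bash
# pin the run to GPU 0 and allow spilling to system RAM instead of aborting on OOM
export GGML_CUDA_ENABLE_UNIFIED_MEMORY=1
CUDA_VISIBLE_DEVICES=0 ./llama-cli -m models/7B/ggml-model-q4_0.gguf -ngl 99 -p "Hello"
```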
@@ -21,7 +21,6 @@ else()
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
     add_subdirectory(export-lora)
-    add_subdirectory(finetune)
     add_subdirectory(gbnf-validator)
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
@@ -53,5 +52,4 @@ else()
     add_subdirectory(simple)
     add_subdirectory(speculative)
     add_subdirectory(tokenize)
-    add_subdirectory(train-text-from-scratch)
 endif()
@@ -1,7 +1,6 @@
 #include "ggml.h"
 #include "train.h"
 
-#include <vector>
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
@@ -69,7 +69,7 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
 
     // ensure enough sequences are available
-    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
+    ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
@@ -13,7 +13,6 @@ Please update all scripts and workflows to use the new binary names.
 | server | llama-server |
 | llama-bench | llama-bench |
 | embedding | llama-embedding |
-| finetune | llama-finetune |
 | quantize | llama-quantize |
 | tokenize | llama-tokenize |
 | export-lora | llama-export-lora |
@@ -45,7 +44,6 @@ Please update all scripts and workflows to use the new binary names.
 | save-load-state | llama-save-load-state |
 | simple | llama-simple |
 | speculative | llama-speculative |
-| train-text-from-scratch | llama-train-text-from-scratch |
 | vdot | llama-vdot |
 | tests/test-c.o | tests/test-c.o |
 
@@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
     } else if (type == GGML_TYPE_I8) {
         v = (float) *(int8_t *) &data[i];
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
     printf("%12.4f", v);
     sum += v;
@@ -6,12 +6,11 @@ Apply LORA adapters to base model and export the resulting model.
 usage: llama-export-lora [options]
 
 options:
-  -h, --help                          show this help message and exit
-  -m FNAME, --model-base FNAME        model path from which to load base model (default '')
-  -o FNAME, --model-out FNAME         path to save exported model (default '')
-  -l FNAME, --lora FNAME              apply LoRA adapter
-  -s FNAME S, --lora-scaled FNAME S   apply LoRA adapter with user defined scaling S
-  -t N, --threads N                   number of threads to use during computation (default: 4)
+  -m, --model                         model path from which to load base model (default '')
+  --lora FNAME                        path to LoRA adapter (can be repeated to use multiple adapters)
+  --lora-scaled FNAME S               path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)
+  -t, --threads N                     number of threads to use during computation (default: 4)
+  -o, --output FNAME                  output file (default: 'ggml-lora-merged-f16.gguf')
 ```
 
 For example:
@@ -20,7 +19,15 @@ For example:
 ./bin/llama-export-lora \
     -m open-llama-3b-v2-q8_0.gguf \
     -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-    -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
+    --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
 ```
 
-Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
+Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
+
+```bash
+./bin/llama-export-lora \
+    -m your_base_model.gguf \
+    -o your_merged_model.gguf \
+    --lora-scaled lora_task_A.gguf 0.5 \
+    --lora-scaled lora_task_B.gguf 0.5
+```
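For context on the `S` value taken by `--lora-scaled`: in the rewritten `llama-export-lora` shown in the C++ changes that follow, each adapter's low-rank update is scaled by `S * alpha / rank` when the adapter carries a non-zero `alpha`, and by `S` alone otherwise, before being added to the base weight. Up to transposition conventions, the merge that code performs can be sketched as:

```latex
% W: base weight; (A_i, B_i): LoRA pair of adapter i with rank r_i and lora_alpha alpha_i;
% S_i: the user-supplied --lora-scaled factor (1.0 for a plain --lora)
W' = W + \sum_i s_i \, B_i A_i,
\qquad
s_i = \begin{cases}
S_i \cdot \alpha_i / r_i & \alpha_i \neq 0 \\
S_i & \alpha_i = 0
\end{cases}
```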
@ -1,465 +1,420 @@
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
|
|
||||||
|
#include <map>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
struct lora_info {
|
static bool g_verbose = false;
|
||||||
std::string filename;
|
|
||||||
|
static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
|
||||||
|
int id = gguf_find_key(ctx_gguf, key.c_str());
|
||||||
|
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
|
||||||
|
}
|
||||||
|
|
||||||
|
static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
|
||||||
|
int id = gguf_find_key(ctx_gguf, key.c_str());
|
||||||
|
return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void zeros(std::ofstream & file, size_t n) {
|
||||||
|
char zero = 0;
|
||||||
|
for (size_t i = 0; i < n; ++i) {
|
||||||
|
file.write(&zero, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string ggml_ne_string(const ggml_tensor * t) {
|
||||||
|
std::string str;
|
||||||
|
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
|
||||||
|
str += std::to_string(t->ne[i]);
|
||||||
|
if (i + 1 < GGML_MAX_DIMS) {
|
||||||
|
str += ", ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
|
||||||
|
struct gguf_init_params params = {
|
||||||
|
/*.no_alloc = */ true,
|
||||||
|
/*.ctx = */ ctx_ggml,
|
||||||
|
};
|
||||||
|
struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
|
||||||
|
if (!ctx_gguf) {
|
||||||
|
throw std::runtime_error("failed to load input GGUF from " + fname);
|
||||||
|
}
|
||||||
|
return ctx_gguf;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||||
|
std::string result;
|
||||||
|
for (size_t pos = 0; ; pos += search.length()) {
|
||||||
|
auto new_pos = s.find(search, pos);
|
||||||
|
if (new_pos == std::string::npos) {
|
||||||
|
result += s.substr(pos, s.size() - pos);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
result += s.substr(pos, new_pos - pos) + replace;
|
||||||
|
pos = new_pos;
|
||||||
|
}
|
||||||
|
s = std::move(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct file_input {
|
||||||
|
struct ggml_context * ctx_meta = nullptr;
|
||||||
|
struct gguf_context * ctx_gguf = nullptr;
|
||||||
|
std::ifstream f_in;
|
||||||
|
std::map<std::string, ggml_tensor *> tensors;
|
||||||
|
float alpha;
|
||||||
float scale;
|
float scale;
|
||||||
|
|
||||||
|
file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
|
||||||
|
if (!f_in.is_open()) {
|
||||||
|
throw std::runtime_error("failed to open input gguf from " + fname);
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx_gguf = load_gguf(fname, &ctx_meta);
|
||||||
|
alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
|
||||||
|
printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
|
||||||
|
|
||||||
|
for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
|
||||||
|
std::string name(cur->name);
|
||||||
|
tensors[name] = cur;
|
||||||
|
if (g_verbose) {
|
||||||
|
printf("%s: %s\n", __func__, cur->name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor * get_tensor(std::string name) {
|
||||||
|
if (tensors.find(name) == tensors.end()) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
return tensors[name];
|
||||||
|
}
|
||||||
|
|
||||||
|
void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
|
||||||
|
if (tensors.find(name) == tensors.end()) {
|
||||||
|
throw std::runtime_error("cannot find tensor with name: " + name);
|
||||||
|
}
|
||||||
|
auto len = ggml_nbytes(tensors[name]);
|
||||||
|
if (buf.size() < len) {
|
||||||
|
buf.resize(len);
|
||||||
|
}
|
||||||
|
auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
|
||||||
|
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
|
||||||
|
f_in.seekg(offset);
|
||||||
|
f_in.read((char* )buf.data(), len);
|
||||||
|
}
|
||||||
|
|
||||||
|
~file_input() {
|
||||||
|
gguf_free(ctx_gguf);
|
||||||
|
ggml_free(ctx_meta);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct export_lora_params {
|
struct lora_merge_ctx {
|
||||||
std::string fn_model_base;
|
// input base model + adapters
|
||||||
std::string fn_model_out;
|
file_input base_model;
|
||||||
std::vector<struct lora_info> lora;
|
std::vector<std::unique_ptr<file_input>> adapters;
|
||||||
|
|
||||||
|
// for computing merged tensor
|
||||||
int n_threads;
|
int n_threads;
|
||||||
};
|
ggml_backend_t backend = nullptr;
|
||||||
|
ggml_gallocr_t allocr = nullptr;
|
||||||
|
std::vector<uint8_t> read_buf;
|
||||||
|
|
||||||
struct lora_data {
|
// output file
|
||||||
struct lora_info info;
|
struct gguf_context * ctx_out;
|
||||||
std::vector<uint8_t> data;
|
struct ggml_context * ctx_out_ggml;
|
||||||
struct ggml_context * ctx;
|
std::ofstream fout;
|
||||||
|
|
||||||
uint32_t lora_r;
|
lora_merge_ctx(
|
||||||
uint32_t lora_alpha;
|
std::string & base_fname,
|
||||||
};
|
std::vector<std::tuple<std::string, float>> & lora_files,
|
||||||
|
std::string & outfile,
|
||||||
|
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
|
||||||
|
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||||
|
|
||||||
struct llama_file {
|
if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
|
||||||
// use FILE * so we don't have to re-open the file to mmap
|
throw std::runtime_error("split model is not yet supported");
|
||||||
FILE * fp;
|
}
|
||||||
size_t size;
|
|
||||||
|
|
||||||
llama_file(const char * fname, const char * mode) {
|
for (auto lora_inp : lora_files) {
|
||||||
fp = std::fopen(fname, mode);
|
auto fname = std::get<0>(lora_inp);
|
||||||
if (fp == NULL) {
|
auto scale = std::get<1>(lora_inp);
|
||||||
size = 0;
|
std::unique_ptr<file_input> adapter(new file_input(fname, scale));
|
||||||
|
check_metadata_lora(adapter.get());
|
||||||
|
adapters.push_back(std::move(adapter));
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx_out = gguf_init_empty();
|
||||||
|
struct ggml_init_params params = {
|
||||||
|
/*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
ctx_out_ggml = ggml_init(params);
|
||||||
|
backend = ggml_backend_cpu_init();
|
||||||
|
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
|
||||||
|
}
|
||||||
|
|
||||||
|
void check_metadata_lora(file_input * adapter) {
|
||||||
|
auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
|
||||||
|
if (general_type != "adapter") {
|
||||||
|
throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
|
||||||
|
if (adapter_type != "lora") {
|
||||||
|
throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
|
||||||
|
auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture");
|
||||||
|
if (general_arch_base != general_arch_lora) {
|
||||||
|
throw std::runtime_error("model arch and LoRA arch mismatch");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_type get_out_tensor_type(struct ggml_tensor * t) {
|
||||||
|
if (t->type == GGML_TYPE_F32) {
|
||||||
|
return GGML_TYPE_F32;
|
||||||
} else {
|
} else {
|
||||||
seek(0, SEEK_END);
|
return GGML_TYPE_F16;
|
||||||
size = tell();
|
|
||||||
seek(0, SEEK_SET);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t tell() const {
|
void run_merge() {
|
||||||
#ifdef _WIN32
|
// prepare metadata
|
||||||
__int64 ret = _ftelli64(fp);
|
gguf_set_kv(ctx_out, base_model.ctx_gguf);
|
||||||
#else
|
// output is forced to f16 for now
|
||||||
long ret = std::ftell(fp);
|
gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
|
||||||
#endif
|
|
||||||
GGML_ASSERT(ret != -1); // this really shouldn't fail
|
|
||||||
return (size_t) ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
void seek(size_t offset, int whence) {
|
// check if all lora adapters have the same tensors
|
||||||
#ifdef _WIN32
|
// TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
|
||||||
int ret = _fseeki64(fp, (__int64) offset, whence);
|
static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
|
||||||
#else
|
if (adapters.size() > 1) {
|
||||||
int ret = std::fseek(fp, (long) offset, whence);
|
for (size_t i = 1; i < adapters.size(); ++i) {
|
||||||
#endif
|
if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
|
||||||
GGML_ASSERT(ret == 0); // same
|
throw std::runtime_error(err_no_subset_adapter);
|
||||||
}
|
}
|
||||||
|
for (auto & it : adapters[i]->tensors) {
|
||||||
void read_raw(void * ptr, size_t size) {
|
if (adapters[0]->get_tensor(it.first) == nullptr) {
|
||||||
if (size == 0) {
|
throw std::runtime_error(err_no_subset_adapter);
|
||||||
return;
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
errno = 0;
|
|
||||||
std::size_t ret = std::fread(ptr, size, 1, fp);
|
// mapping base tensor to out tensor (same shape with base, but different type)
|
||||||
if (ferror(fp)) {
|
// if out_tensor == nullptr, we only copy it
|
||||||
die_fmt("read error: %s", strerror(errno));
|
std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
|
||||||
|
for (auto & it : base_model.tensors) {
|
||||||
|
bool t_a = true;
|
||||||
|
bool t_b = true;
|
||||||
|
for (auto & adapter : adapters) {
|
||||||
|
t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
|
||||||
|
t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
|
||||||
|
}
|
||||||
|
auto base_tensor = it.second;
|
||||||
|
if (!t_a && !t_b) {
|
||||||
|
// only copy
|
||||||
|
struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
|
||||||
|
ggml_set_name(cpy_tensor, base_tensor->name);
|
||||||
|
base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
|
||||||
|
gguf_add_tensor(ctx_out, cpy_tensor);
|
||||||
|
} else if (t_a && t_b) {
|
||||||
|
// need merging
|
||||||
|
struct ggml_tensor * out_tensor = ggml_new_tensor(
|
||||||
|
ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
|
||||||
|
ggml_set_name(out_tensor, base_tensor->name);
|
||||||
|
base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
|
||||||
|
gguf_add_tensor(ctx_out, out_tensor);
|
||||||
|
} else {
|
||||||
|
throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (ret != 1) {
|
|
||||||
die("unexpectedly reached end of file");
|
// placeholder for the meta data
|
||||||
|
{
|
||||||
|
size_t meta_size = gguf_get_meta_size(ctx_out);
|
||||||
|
zeros(fout, meta_size);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
std::uint32_t read_u32() {
|
// process base model tensors
|
||||||
std::uint32_t ret;
|
size_t n_merged = 0;
|
||||||
read_raw(&ret, sizeof(ret));
|
for (auto & it : base_to_out_tensors) {
|
||||||
return ret;
|
if (it.second != nullptr) {
|
||||||
}
|
merge_tensor(it.first, it.second);
|
||||||
|
n_merged++;
|
||||||
std::string read_string(std::uint32_t len) {
|
} else {
|
||||||
std::vector<char> chars(len);
|
copy_tensor(it.first);
|
||||||
read_raw(chars.data(), len);
|
}
|
||||||
return std::string(chars.data(), len);
|
|
||||||
}
|
|
||||||
|
|
||||||
void write_raw(const void * ptr, size_t size) {
|
|
||||||
if (size == 0) {
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
errno = 0;
|
|
||||||
size_t ret = std::fwrite(ptr, size, 1, fp);
|
// write output metadata
|
||||||
if (ret != 1) {
|
{
|
||||||
die_fmt("write error: %s", strerror(errno));
|
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
|
||||||
|
gguf_get_meta_data(ctx_out, data.data());
|
||||||
|
fout.seekp(0);
|
||||||
|
fout.write((const char *)data.data(), data.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
|
||||||
|
printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
void write_u32(std::uint32_t val) {
|
void copy_tensor(struct ggml_tensor * base) {
|
||||||
write_raw(&val, sizeof(val));
|
printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
|
||||||
|
size_t len = ggml_nbytes(base);
|
||||||
|
base_model.read_tensor_data(base->name, read_buf);
|
||||||
|
fout.write((char* )read_buf.data(), len);
|
||||||
|
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool eof() {
|
void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
|
||||||
return tell() >= size;
|
std::string name_base(base->name);
|
||||||
}
|
std::string name_lora_a = name_base + ".lora_a";
|
||||||
|
std::string name_lora_b = name_base + ".lora_b";
|
||||||
|
|
||||||
~llama_file() {
|
printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
|
||||||
if (fp) {
|
|
||||||
std::fclose(fp);
|
// context for input tensor
|
||||||
|
std::vector<struct ggml_tensor *> inp_a(adapters.size());
|
||||||
|
std::vector<struct ggml_tensor *> inp_b(adapters.size());
|
||||||
|
struct ggml_init_params params {
|
||||||
|
/*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
|
||||||
|
/*.mem_buffer =*/ NULL,
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
struct ggml_context * ctx = ggml_init(params);
|
||||||
|
|
||||||
|
// alloc tensors
|
||||||
|
struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
|
||||||
|
for (size_t i = 0; i < adapters.size(); ++i) {
|
||||||
|
auto t_a = adapters[i]->get_tensor(name_lora_a);
|
||||||
|
auto t_b = adapters[i]->get_tensor(name_lora_b);
|
||||||
|
inp_a[i] = ggml_dup_tensor(ctx, t_a);
|
||||||
|
inp_b[i] = ggml_dup_tensor(ctx, t_b);
|
||||||
}
|
}
|
||||||
|
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
|
||||||
|
|
||||||
|
// load base tensor to backend buffer
|
||||||
|
base_model.read_tensor_data(name_base, read_buf);
|
||||||
|
if (base->type != GGML_TYPE_F32) {
|
||||||
|
// optionally dequantize it
|
||||||
|
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
|
||||||
|
auto nels = ggml_nelements(inp_base);
|
||||||
|
ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
|
||||||
|
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
|
||||||
|
qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
|
||||||
|
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
|
||||||
|
} else {
|
||||||
|
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
|
||||||
|
}
|
||||||
|
|
||||||
|
// load lora tensors to backend buffer
|
||||||
|
for (size_t i = 0; i < adapters.size(); ++i) {
|
||||||
|
adapters[i]->read_tensor_data(name_lora_a, read_buf);
|
||||||
|
ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
|
||||||
|
adapters[i]->read_tensor_data(name_lora_b, read_buf);
|
||||||
|
ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// build graph
|
||||||
|
struct ggml_cgraph * gf;
|
||||||
|
{
|
||||||
|
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
|
||||||
|
static std::vector<uint8_t> buf(buf_size);
|
||||||
|
struct ggml_init_params params0 = {
|
||||||
|
/*.mem_size =*/ buf_size,
|
||||||
|
/*.mem_buffer =*/ buf.data(),
|
||||||
|
/*.no_alloc =*/ true,
|
||||||
|
};
|
||||||
|
struct ggml_context * ctx0 = ggml_init(params0);
|
||||||
|
gf = ggml_new_graph(ctx0);
|
||||||
|
struct ggml_tensor * cur = inp_base;
|
||||||
|
for (size_t i = 0; i < adapters.size(); ++i) {
|
||||||
|
struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
|
||||||
|
struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
|
||||||
|
// scale
|
||||||
|
const float alpha = adapters[i]->alpha;
|
||||||
|
const float rank = (float) inp_b[i]->ne[0];
|
||||||
|
const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
|
||||||
|
delta = ggml_scale(ctx0, delta, scale);
|
||||||
|
cur = ggml_add(ctx0, delta, cur);
|
||||||
|
printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
|
||||||
|
printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
|
||||||
|
}
|
||||||
|
cur = ggml_cast(ctx0, cur, out->type);
|
||||||
|
printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type));
|
||||||
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
ggml_free(ctx0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// compute
|
||||||
|
{
|
||||||
|
ggml_gallocr_alloc_graph(allocr, gf);
|
||||||
|
ggml_backend_cpu_set_n_threads(backend, n_threads);
|
||||||
|
ggml_backend_graph_compute(backend, gf);
|
||||||
|
}
|
||||||
|
|
||||||
|
// write data to output file
|
||||||
|
{
|
||||||
|
auto result = gf->nodes[gf->n_nodes - 1];
|
||||||
|
size_t len = ggml_nbytes(result);
|
||||||
|
if (read_buf.size() < len) {
|
||||||
|
read_buf.resize(len);
|
||||||
|
}
|
||||||
|
ggml_backend_tensor_get(result, read_buf.data(), 0, len);
|
||||||
|
fout.write((char* )read_buf.data(), len);
|
||||||
|
zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_free(ctx);
|
||||||
|
ggml_backend_buffer_free(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
~lora_merge_ctx() {
|
||||||
|
ggml_gallocr_free(allocr);
|
||||||
|
ggml_backend_free(backend);
|
||||||
|
gguf_free(ctx_out);
|
||||||
|
ggml_free(ctx_out_ggml);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
-static struct export_lora_params get_default_export_lora_params() {
-    struct export_lora_params result;
-    result.fn_model_base = "";
-    result.fn_model_out = "";
-    result.n_threads = GGML_DEFAULT_N_THREADS;
-    return result;
-}
-
-static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help                         show this help message and exit\n");
-    fprintf(stderr, "  -m FNAME, --model-base FNAME       model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
-    fprintf(stderr, "  -o FNAME, --model-out FNAME        path to save exported model (default '%s')\n", params->fn_model_out.c_str());
-    fprintf(stderr, "  -l FNAME, --lora FNAME             apply LoRA adapter\n");
-    fprintf(stderr, "  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S\n");
-    fprintf(stderr, "  -t N, --threads N                  number of threads to use during computation (default: %d)\n", params->n_threads);
-}
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    printf("\nexample usage:\n");
+    printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
+    printf("\nNOTE: output model is F16\n");
+}
-
-static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
-    bool invalid_param = false;
-    std::string arg;
-    struct export_lora_params default_params = get_default_export_lora_params();
-    const std::string arg_prefix = "--";
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        if (arg == "-m" || arg == "--model-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_base = argv[i];
-        } else if (arg == "-o" || arg == "--model-out") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->fn_model_out = argv[i];
-        } else if (arg == "-l" || arg == "--lora") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            struct lora_info lora;
-            lora.filename = argv[i];
-            lora.scale = 1.0f;
-            params->lora.push_back(lora);
-        } else if (arg == "-s" || arg == "--lora-scaled") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            struct lora_info lora;
-            lora.filename = argv[i];
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            lora.scale = std::stof(argv[i]);
-            params->lora.push_back(lora);
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params->n_threads = std::stoi(argv[i]);
-            if (params->n_threads <= 0) {
-                params->n_threads = std::thread::hardware_concurrency();
-            }
-        } else if (arg == "-h" || arg == "--help") {
-            export_lora_print_usage(argc, argv, &default_params);
-            exit(0);
-        } else {
-            fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
-            export_lora_print_usage(argc, argv, &default_params);
-            exit(1);
-        }
-    }
-
-    if (params->fn_model_base == default_params.fn_model_base) {
-        fprintf(stderr, "error: please specify a filename for model-base.\n");
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    if (params->fn_model_out == default_params.fn_model_out) {
-        fprintf(stderr, "error: please specify a filename for model-out.\n");
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
-        export_lora_print_usage(argc, argv, &default_params);
-        exit(1);
-    }
-    return true;
-}
-
-static void free_lora(struct lora_data * lora) {
-    if (lora->ctx != NULL) {
-        ggml_free(lora->ctx);
-    }
-    delete lora;
-}
-
-static struct lora_data * load_lora(struct lora_info * info) {
-    struct lora_data * result = new struct lora_data;
-    result->info = *info;
-    result->ctx = NULL;
-    result->lora_r = 1;
-    result->lora_alpha = 1;
-
-    struct llama_file file(info->filename.c_str(), "rb");
-    if (file.fp == NULL) {
-        fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
-            info->filename.c_str());
-        free_lora(result);
-        return NULL;
-    }
-
-    struct ggml_init_params params_ggml;
-    params_ggml.mem_size   = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
-    params_ggml.mem_buffer = NULL;
-    params_ggml.no_alloc   = true;
-    result->ctx = ggml_init(params_ggml);
-
-    uint32_t magic = file.read_u32();
-    if (magic != LLAMA_FILE_MAGIC_GGLA) {
-        die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
-    }
-    uint32_t version = file.read_u32();
-    if (version != 1) {
-        die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
-    }
-    result->lora_r     = file.read_u32();
-    result->lora_alpha = file.read_u32();
-    // read tensor infos from file
-    std::vector<char> name_buf;
-    std::vector<struct ggml_tensor *> tensors;
-    std::vector<size_t> tensors_offset;
-    size_t total_nbytes_pad = 0;
-    while(!file.eof()) {
-        int64_t ne[4]    = {1,1,1,1};
-        uint32_t n_dims  = file.read_u32();
-        uint32_t namelen = file.read_u32();
-        uint32_t type    = file.read_u32();
-        for (uint32_t k = 0; k < n_dims; ++k) {
-            ne[k] = (int64_t)file.read_u32();
-        }
-        name_buf.clear();
-        name_buf.resize(namelen + 1, '\0');
-        file.read_raw(name_buf.data(), namelen);
-        file.seek((0-file.tell()) & 31, SEEK_CUR);
-        size_t offset = file.tell();
-        struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
-        ggml_set_name(tensor, name_buf.data());
-        size_t nbytes     = ggml_nbytes(tensor);
-        size_t nbytes_pad = ggml_nbytes_pad(tensor);
-        total_nbytes_pad += nbytes_pad;
-        tensors.push_back(tensor);
-        tensors_offset.push_back(offset);
-        file.seek(nbytes, SEEK_CUR);
-    }
-    // read tensor data
-    result->data.resize(total_nbytes_pad);
-    size_t data_offset = 0;
-    for (size_t i = 0; i < tensors.size(); ++i) {
-        struct ggml_tensor * tensor = tensors[i];
-        size_t offset     = tensors_offset[i];
-        size_t nbytes     = ggml_nbytes(tensor);
-        size_t nbytes_pad = ggml_nbytes_pad(tensor);
-        file.seek(offset, SEEK_SET);
-        tensor->data = result->data.data() + data_offset;
-        file.read_raw(tensor->data, nbytes);
-        data_offset += nbytes_pad;
-    }
-    return result;
-}
-
-static struct ggml_cgraph * build_graph_lora(
-    struct ggml_context * ctx,
-    struct ggml_tensor * tensor,
-    struct ggml_tensor * lora_a,
-    struct ggml_tensor * lora_b,
-    float scaling
-) {
-    struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
-    if (scaling != 1.0f) {
-        ab = ggml_scale(ctx, ab, scaling);
-    }
-    struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
-
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand (gf, res);
-    return gf;
-}
-
-static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
-    if (lora->ctx == NULL) {
-        return false;
-    }
-    std::string name   = ggml_get_name(tensor);
-    std::string name_a = name + std::string(".loraA");
-    std::string name_b = name + std::string(".loraB");
-    struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
-    struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
-    if (lora_a == NULL || lora_b == NULL) {
-        return false;
-    }
-
-    float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
-
-    struct ggml_init_params params;
-    params.mem_size   = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
-    params.mem_buffer = NULL;
-    params.no_alloc   = true;
-    struct ggml_context * ctx = NULL;
-    struct ggml_gallocr * alloc = NULL;
-    struct ggml_cgraph * gf = NULL;
-
-    ctx   = ggml_init(params);
-    alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
-
-    ggml_gallocr_alloc_graph(alloc, gf);
-
-    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
-    static std::vector<uint8_t> data_work;
-    data_work.resize(cplan.work_size);
-    cplan.work_data = data_work.data();
-
-    ggml_graph_compute(gf, &cplan);
-
-    ggml_gallocr_free(alloc);
-    ggml_free(ctx);
-    return true;
-}
-
-static void export_lora(struct export_lora_params * params) {
-    // load all loras
-    std::vector<struct lora_data *> loras;
-    for (size_t i = 0; i < params->lora.size(); ++i) {
-        struct lora_data * lora = load_lora(&params->lora[i]);
-        if (lora != NULL) {
-            loras.push_back(lora);
-        }
-    }
-    if (loras.size() == 0) {
-        fprintf(stderr, "warning: no lora adapters will be applied.\n");
-    }
-
-    // open input file
-    struct llama_file fin(params->fn_model_base.c_str(), "rb");
-    if (!fin.fp) {
-        die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
-    }
-
-    // open base model gguf, read tensors without their data
-    struct ggml_context * ctx_in;
-    struct gguf_init_params params_gguf;
-    params_gguf.no_alloc = true;
-    params_gguf.ctx      = &ctx_in;
-    struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
-
-    // create new gguf
-    struct gguf_context * gguf_out = gguf_init_empty();
-
-    // copy meta data from base model: kv and tensors
-    gguf_set_kv(gguf_out, gguf_in);
-    int n_tensors = gguf_get_n_tensors(gguf_in);
-    for (int i=0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(gguf_in, i);
-        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
-        gguf_add_tensor(gguf_out, tensor);
-    }
-
-    // create output file
-    struct llama_file fout(params->fn_model_out.c_str(), "wb");
-    if (!fout.fp) {
-        die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
-    }
-
-    // write gguf meta data
-    std::vector<uint8_t> meta;
-    meta.resize(gguf_get_meta_size(gguf_out));
-    gguf_get_meta_data(gguf_out, meta.data());
-    fout.write_raw(meta.data(), meta.size());
-
-    std::vector<uint8_t> data;
-    std::vector<uint8_t> padding;
-    for (int i=0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(gguf_in, i);
-        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
-
-        // read tensor data
-        data.resize(ggml_nbytes(tensor));
-        tensor->data = data.data();
-        size_t offset = gguf_get_tensor_offset(gguf_in, i);
-        fin.seek(offset + meta.size(), SEEK_SET);
-        fin.read_raw(data.data(), data.size());
-
-        // apply all loras
-        for (size_t k = 0; k < loras.size(); ++k) {
-            apply_lora(tensor, loras[k], params->n_threads);
-        }
-
-        // write tensor data + padding
-        padding.clear();
-        padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
-
-        GGML_ASSERT(fout.tell() == offset + meta.size());
-        // fout.seek(offset + meta.size(), SEEK_SET);
-        fout.write_raw(data.data(), data.size());
-        fout.write_raw(padding.data(), padding.size());
-
-        if (i % 2 == 0) {
-            printf(".");
-        }
-    }
-    printf("\n");
-
-    // close gguf
-    gguf_free(gguf_out);
-    gguf_free(gguf_in);
-
-    // free loras
-    for (size_t i = 0; i < loras.size(); ++i) {
-        free_lora(loras[i]);
-    }
-}
-
-int main(int argc, char ** argv) {
-    struct export_lora_params params = get_default_export_lora_params();
-
-    if (!export_lora_params_parse(argc, argv, &params)) {
-        return 1;
-    }
-
-    export_lora(&params);
-
-    return 0;
-}
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
+        return 1;
+    }
+
+    g_verbose = (params.verbosity == 1);
+    try {
+        lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
+        ctx.run_merge();
+    } catch (const std::exception & err) {
+        fprintf(stderr, "%s\n", err.what());
+        exit(EXIT_FAILURE);
+    }
+
+    printf("done, output file is %s\n", params.lora_outfile.c_str());
+
+    return 0;
+}
@ -1,5 +0,0 @@
set(TARGET llama-finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
@ -1,90 +0,0 @@
# finetune

Basic usage instructions:

```bash
# get training data
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt

# finetune LORA adapter
./bin/llama-finetune \
        --model-base open-llama-3b-v2-q8_0.gguf \
        --checkpoint-in  chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
        --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
        --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
        --train-data "shakespeare.txt" \
        --save-every 10 \
        --threads 6 --adam-iter 30 --batch 4 --ctx 64 \
        --use-checkpointing

# predict
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```

**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
So in above example after 10 iterations these files will be written:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin

After 10 more iterations:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin

Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter.

llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`.
These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above.

In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together.

For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:

```bash
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
  --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
  --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```

You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.

For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:

```bash
./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \
  --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
  --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```

The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.

Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.

The default LORA rank can be specified with `--lora-r N`.
The LORA rank can be configured for each model tensor type separately with these command line options (a sketch of how the resulting scaled adapters combine is shown after this section):

```bash
  --lora-r N           LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
  --rank-att-norm N    LORA rank for attention norm tensor (default 1)
  --rank-ffn-norm N    LORA rank for feed-forward norm tensor (default 1)
  --rank-out-norm N    LORA rank for output norm tensor (default 1)
  --rank-tok-embd N    LORA rank for token embeddings tensor (default 4)
  --rank-out N         LORA rank for output tensor (default 4)
  --rank-wq N          LORA rank for wq tensor (default 4)
  --rank-wk N          LORA rank for wk tensor (default 4)
  --rank-wv N          LORA rank for wv tensor (default 4)
  --rank-wo N          LORA rank for wo tensor (default 4)
  --rank-ffn_gate N    LORA rank for ffn_gate tensor (default 4)
  --rank-ffn_down N    LORA rank for ffn_down tensor (default 4)
  --rank-ffn_up N      LORA rank for ffn_up tensor (default 4)
```

The LORA rank of 'norm' tensors should always be 1.

To see all available options use `llama-finetune --help`.
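The effective strength of an adapter is roughly `scale * lora_alpha / lora_r` (this is the scaling used by `apply_lora` in the removed export-lora code above), and mixed adapters simply add their low-rank updates onto the base weights. A minimal NumPy sketch of that combination, with toy shapes and made-up adapter values purely for illustration (not the actual llama.cpp code path):

```python
import numpy as np

def merge_loras(W, adapters):
    """Return W + sum_i scale_i * (alpha_i / r_i) * (B_i @ A_i)."""
    W_merged = W.copy()
    for ad in adapters:
        scaling = ad["scale"] * ad["alpha"] / ad["r"]   # mirrors scale * lora_alpha / lora_r
        W_merged += scaling * (ad["B"] @ ad["A"])
    return W_merged

rng = np.random.default_rng(0)
n_out, n_in, r = 8, 8, 2
W = rng.normal(size=(n_out, n_in)).astype(np.float32)
# two hypothetical adapters, applied at 40% and 80% strength as in the example above
shakespeare = {"A": rng.normal(size=(r, n_in)), "B": rng.normal(size=(n_out, r)), "alpha": 4, "r": r, "scale": 0.4}
bible       = {"A": rng.normal(size=(r, n_in)), "B": rng.normal(size=(n_out, r)), "alpha": 4, "r": r, "scale": 0.8}
print(merge_loras(W, [shakespeare, bible]).shape)  # (8, 8)
```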
@ -1,487 +0,0 @@
#!/usr/bin/env python3
# finetune checkpoint --> gguf conversion

import argparse
import gguf
import struct
import numpy as np
from pathlib import Path

# gguf constants
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"

LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"

LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"

LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
LLM_KV_TRAINING_TYPE = "training.type"
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"

LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"
LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"
LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"
LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"
LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"
LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"
LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"
LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"
LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"
LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"
LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"

class Tensor:
    def __init__(self, dtype='f', ne=None):
        if ne is None:
            ne = []
        self.dtype = dtype
        self.ne = ne
        self.nbytes = 0
        if self.dtype == 'f':
            if len(self.ne) == 0:
                self.nbytes = 0
            else:
                self.nbytes = int(np.prod(self.ne)) * 4
        else:
            raise ValueError(f"Unhandled data type '{self.dtype}'")

    def load(self, data, offset):
        nd      = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        dtype   = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4

        assert(nd == len(self.ne))
        ne = []
        for d in range(nd):
            n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
            ne.append(n)

        if tuple(ne) != tuple(self.ne):
            raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")

        if self.dtype == 'f':
            assert(dtype == 0)
        else:
            raise ValueError(f"Unhandled data type '{self.dtype}'")

        self.name = bytes(data[offset:offset+namelen]); offset += namelen
        # 32-byte alignment
        offset += (0 - offset) & 31
        self.data = data[offset:offset+self.nbytes]
        offset += self.nbytes
        return offset

    def max_storage_size(self):
        result = 0
        result += 4 # nd
        result += 4 # namelen
        result += 4 # dtype
        result += len(self.ne)*8 # ne
        result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
        result += 31 # 32-byte alignment
        result += self.nbytes
        return result

    def save_gguf(self, gguf_writer, name):
        gguf_writer.add_tensor(
            name=name,
            tensor=self.data,
            raw_shape=np.array(list(reversed(self.ne))),
            raw_dtype=gguf.GGMLQuantizationType.F32)

class OptimizationContext:
    def __init__(self):
        pass

    def load(self, data, offset):
        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
        offset += 4

        if self.version != 1:
            raise ValueError('Invalid version of optimization context in checkpoint file')

        self.past    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.nx      = struct.unpack('N',  bytes(data[offset:offset + 8]))[0]; offset += 8
        self.iter    = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4

        self.adam_m  = Tensor('f', [self.nx])
        self.adam_v  = Tensor('f', [self.nx])
        self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])

        self.lbfgs_x    = Tensor('f', [self.nx])
        self.lbfgs_xp   = Tensor('f', [self.nx])
        self.lbfgs_g    = Tensor('f', [self.nx])
        self.lbfgs_gp   = Tensor('f', [self.nx])
        self.lbfgs_d    = Tensor('f', [self.nx])
        self.lbfgs_pf   = Tensor('f', [self.past] if self.past > 0 else [])
        self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
        self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
        self.lbfgs_lms  = Tensor('f', [self.nx, self.lbfgs_m])
        self.lbfgs_lmy  = Tensor('f', [self.nx, self.lbfgs_m])

        # forgot to save type in version 1:
        # guess self.type from number of remaining bytes
        size_type_0 = 12 + sum([t.max_storage_size() for t in
                                [self.adam_m, self.adam_v]
                                +([self.adam_pf] if (self.past > 0) else [])])
        size_type_1 = 24 + sum([t.max_storage_size() for t in
                                [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
                                 self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
                                 self.lbfgs_lmal, self.lbfgs_lmys,
                                 self.lbfgs_lms, self.lbfgs_lmy]
                                +([self.lbfgs_pf] if (self.past > 0) else [])])
        # due to alignment padding the size might not by exact
        # but the difference in size for both types is significant,
        # so we can just use whichever is closest
        remaining = len(data) - offset
        if abs(remaining - size_type_0) < abs(remaining - size_type_1):
            self.type = 0
        else:
            self.type = 1

        if self.type == 0:
            offset = self.adam_m.load(data, offset)
            offset = self.adam_v.load(data, offset)
            offset = self.adam_pf.load(data,offset)

            self.adam_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.adam_fx_prev          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4

        elif self.type == 1:
            offset = self.lbfgs_x.load(data, offset)
            offset = self.lbfgs_xp.load(data, offset)
            offset = self.lbfgs_g.load(data, offset)
            offset = self.lbfgs_gp.load(data, offset)
            offset = self.lbfgs_d.load(data, offset)
            offset = self.lbfgs_pf.load(data, offset)
            offset = self.lbfgs_lmal.load(data, offset)
            offset = self.lbfgs_lmys.load(data, offset)
            offset = self.lbfgs_lms.load(data, offset)
            offset = self.lbfgs_lmy.load(data, offset)

            self.lbfgs_fx_best          = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.lbfgs_step             = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.lbfgs_j                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.lbfgs_k                = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.lbfgs_end              = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4

        else:
            raise ValueError(f"Invalid optimizer type '{self.type}'")

        return offset

    def save_gguf(self, gguf_writer):
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
        gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
        gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)

        if self.type == 0:
            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)

            self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
            self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
            if self.past > 0:
                self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)

        elif self.type == 1:
            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)

            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
            if self.past > 0:
                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
        else:
            raise ValueError('Unknown optimizer type')

class LoraParams:
    def __init__(self):
        pass

    def load(self, data, offset):
        self.n_rank_attention_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_wq             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_wk             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_wv             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_wo             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_ffn_norm       = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_w1             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_w2             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_w3             = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_tok_embeddings = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_norm           = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rank_output         = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        return offset

    def save_gguf(self, gguf_writer):
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, self.n_rank_tok_embeddings)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT, self.n_rank_output)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, self.n_rank_attention_norm)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q, self.n_rank_wq)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K, self.n_rank_wk)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V, self.n_rank_wv)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, self.n_rank_wo)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM, self.n_rank_ffn_norm)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE, self.n_rank_w1)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, self.n_rank_w2)
        gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP, self.n_rank_w3)

class ModelParams:
    def __init__(self, n_ff = None):
        self.n_ff = n_ff

    def load(self, data, offset):
        self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_embd  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_mult  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_head  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rot   = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        return offset

    def get_n_ff(self):
        if self.n_ff is None:
            # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
            return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
        else:
            return self.n_ff

    def save_gguf(self, gguf_writer):
        # self.n_vocab not saved
        gguf_writer.add_embedding_length(self.n_embd)
        gguf_writer.add_head_count(self.n_head)
        gguf_writer.add_block_count(self.n_layer)
        gguf_writer.add_rope_dimension_count(self.n_rot)
        gguf_writer.add_feed_forward_length(self.get_n_ff())

def tensor_name(key, bid=None, suffix=".weight"):
    return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix

class Layer:
    def __init__(self, params, lora_params, bid):
        self.bid = bid
        self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
        self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
        self.wq_a       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
        self.wq_b       = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
        self.wk_a       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
        self.wk_b       = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
        self.wv_a       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
        self.wv_b       = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
        self.wo_a       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
        self.wo_b       = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
        self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
        self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
        self.w1_a       = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
        self.w1_b       = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
        self.w2_a       = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
        self.w2_b       = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
        self.w3_a       = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
        self.w3_b       = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])

    def load(self, data, offset):
        offset = self.att_norm_a.load(data, offset)
        offset = self.att_norm_b.load(data, offset)
        offset = self.wq_a.load(data, offset)
        offset = self.wq_b.load(data, offset)
        offset = self.wk_a.load(data, offset)
        offset = self.wk_b.load(data, offset)
        offset = self.wv_a.load(data, offset)
        offset = self.wv_b.load(data, offset)
        offset = self.wo_a.load(data, offset)
        offset = self.wo_b.load(data, offset)
        offset = self.ffn_norm_a.load(data, offset)
        offset = self.ffn_norm_b.load(data, offset)
        offset = self.w1_a.load(data, offset)
        offset = self.w1_b.load(data, offset)
        offset = self.w2_a.load(data, offset)
        offset = self.w2_b.load(data, offset)
        offset = self.w3_a.load(data, offset)
        offset = self.w3_b.load(data, offset)
        return offset

    def save_gguf(self, gguf_writer):
        self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
        self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
        self.wq_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_a"))
        self.wq_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q,    self.bid, ".weight.lora_b"))
        self.wk_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_a"))
        self.wk_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K,    self.bid, ".weight.lora_b"))
        self.wv_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_a"))
        self.wv_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V,    self.bid, ".weight.lora_b"))
        self.wo_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_a"))
        self.wo_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT,  self.bid, ".weight.lora_b"))
        self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_a"))
        self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM,  self.bid, ".weight.lora_b"))
        self.w1_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_a"))
        self.w1_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE,  self.bid, ".weight.lora_b"))
        self.w2_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_a"))
        self.w2_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN,  self.bid, ".weight.lora_b"))
        self.w3_a.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_a"))
        self.w3_b.save_gguf      (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP,    self.bid, ".weight.lora_b"))

class LoraModel:
    def __init__(self, n_ff = None):
        self.params = ModelParams(n_ff = n_ff)
        self.lora_params = LoraParams()
        self.layers = []

    def load(self, data, offset):
        offset = self.params.load(data, offset)
        offset = self.lora_params.load(data, offset)

        self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
        self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
        self.norm_a     = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
        self.norm_b     = Tensor('f', [self.lora_params.n_rank_norm, 1])
        self.output_a   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
        self.output_b   = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])

        offset = self.tok_embd_a.load(data, offset)
        offset = self.tok_embd_b.load(data, offset)
        offset = self.norm_a.load(data, offset)
        offset = self.norm_b.load(data, offset)
        offset = self.output_a.load(data, offset)
        offset = self.output_b.load(data, offset)

        self.layers.clear()
        for bid in range(self.params.n_layer):
            layer = Layer(self.params, self.lora_params, bid)
            offset = layer.load(data, offset)
            self.layers.append(layer)

        return offset

    def save_gguf(self, gguf_writer):
        self.params.save_gguf(gguf_writer)
        self.lora_params.save_gguf(gguf_writer)

        self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_a"))
        self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD,  suffix=".weight.lora_b"))
        self.norm_a.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
        self.norm_b.save_gguf    (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
        self.output_a.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_a"))
        self.output_b.save_gguf  (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT,      suffix=".weight.lora_b"))

        for layer in self.layers:
            layer.save_gguf(gguf_writer)

class LoraCheckpoint:
    def __init__(self, n_ff = None):
        self.model = LoraModel(n_ff = n_ff)
        self.opt_ctx = OptimizationContext()

    def load(self, data, offset):
        magic = bytes(reversed(data[offset:offset + 4])); offset += 4
        if magic != b'ggcl':
            raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")

        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        if self.version != 0:
            raise ValueError('Invalid version of checkpoint file')

        self.train_its     = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.train_tokens  = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4

        offset = self.model.load(data, offset)
        offset = self.opt_ctx.load(data, offset)

        return offset

    def save_gguf(self, gguf_writer):
        gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
        gguf_writer.add_layer_norm_rms_eps(1e-5)
        gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
        gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
        gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
        gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
        gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
        self.model.save_gguf(gguf_writer)
        self.opt_ctx.save_gguf(gguf_writer)

def handle_args():
    parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
    parser.add_argument('--input',  '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
    parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
    parser.add_argument('--ff', type = int, help = "Feedforward size, if not provided compute from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
    return parser.parse_args()

def main():
    cfg = handle_args()
    print(cfg)
    data = np.memmap(cfg.input, mode = 'r')
    chk = LoraCheckpoint(n_ff = cfg.ff)
    offset = 0
    offset = chk.load(data, offset)
    # we should have read all available data
    assert(offset == len(data))

    gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
    chk.save_gguf(gguf_writer)
    print("    gguf: write header")
    gguf_writer.write_header_to_file()
    print("    gguf: write metadata")
    gguf_writer.write_kv_data_to_file()
    print("    gguf: write tensors")
    gguf_writer.write_tensors_to_file()
    gguf_writer.close()

if __name__ == '__main__':
    main()

File diff suppressed because it is too large
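The checkpoint reader above repeatedly relies on the bit trick `offset += (0 - offset) & 31` to skip padding. A tiny illustrative sketch (plain Python, no dependencies, not part of the converter) showing that the expression rounds an offset up to the next multiple of 32:

```python
def align32(offset: int) -> int:
    # (0 - offset) & 31 is the number of padding bytes needed to reach
    # the next multiple of 32 (0 if the offset is already aligned).
    return offset + ((0 - offset) & 31)

for off in (0, 1, 31, 32, 33, 100):
    print(off, "->", align32(off))
# 0 -> 0, 1 -> 32, 31 -> 32, 32 -> 32, 33 -> 64, 100 -> 128
```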
@ -1,34 +0,0 @@
#!/bin/bash
cd `dirname $0`
cd ../..

EXE="./llama-finetune"

if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi

# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing.

while getopts "dg" opt; do
  case $opt in
    d)
      DEBUGGER="gdb --args"
      ;;
    g)
      EXE="./build/bin/Release/finetune"
      GPUARG="--gpu-layers 25"
      ;;
  esac
done

$DEBUGGER $EXE \
    --model-base $MODEL \
    $GPUARG \
    --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
    --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
    --lora-out lora-ol3b-shakespeare-ITERATION.bin \
    --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
    --save-every 10 \
    --threads 10 --adam-iter 30 --batch 4 --ctx 64 \
    --use-checkpointing
@ -16,20 +16,25 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
     auto decoded = decode_utf8(input_str, {});
     const auto & code_points = decoded.first;

+    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
+          llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
+
     size_t pos = 0;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-        auto prev_stacks = grammar->stacks;
-        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
-        if (grammar->stacks.empty()) {
+        const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
+
+        llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
+
+        if (cur_stacks.empty()) {
             error_pos = pos;
             error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
-            grammar->stacks = prev_stacks;
+            cur_stacks = prev_stacks;
             return false;
         }
         ++pos;
     }

-    for (const auto & stack : grammar->stacks) {
+    for (const auto & stack : cur_stacks) {
         if (stack.empty()) {
             return true;
         }
@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) {
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);

+    if (!ctx) {
+        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+
     printf("%s: version:      %d\n",  __func__, gguf_get_version(ctx));
     printf("%s: alignment:   %zu\n",  __func__, gguf_get_alignment(ctx));
     printf("%s: data offset: %zu\n",  __func__, gguf_get_data_offset(ctx));
@ -1,6 +1,6 @@
 # llama.cpp/examples/imatrix

-Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models.
+Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
 More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861

 ## Usage
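As a rough intuition only (this is a simplification, not the exact statistic the imatrix collector accumulates), a per-weight importance score can be approximated from the squared activations that flow into each matrix multiplication over a calibration set. A hypothetical NumPy sketch:

```python
import numpy as np

def toy_importance(activations: np.ndarray) -> np.ndarray:
    """activations: (n_tokens, n_cols) inputs seen by one matmul.
    Returns a per-column importance proxy: the mean squared activation."""
    return np.mean(activations.astype(np.float64) ** 2, axis=0)

# hypothetical calibration activations for one tensor
X = np.random.default_rng(1).normal(size=(4096, 11008))
imp = toy_importance(X)
print(imp.shape)  # (11008,) -- one value per input column of the matmul
```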
@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
         }
         else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
-            exit(1); //GGML_ASSERT(false);
+            exit(1); //GGML_ABORT("fatal error");
         }
         if (m_params.verbosity > 1) {
             printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
         }
         else if (e.values.size() != (size_t)src1->ne[0]) {
             fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
-            exit(1); //GGML_ASSERT(false);
+            exit(1); //GGML_ABORT("fatal error");
         }
         ++e.ncall;
         if (m_params.verbosity > 1) {
@ -150,7 +150,7 @@ static const char * output_format_str(output_formats format) {
         case JSON:     return "json";
         case MARKDOWN: return "md";
         case SQL:      return "sql";
-        default: GGML_ASSERT(!"invalid output format");
+        default: GGML_ABORT("invalid output format");
     }
 }
@ -176,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) {
         case LLAMA_SPLIT_MODE_NONE:  return "none";
         case LLAMA_SPLIT_MODE_LAYER: return "layer";
         case LLAMA_SPLIT_MODE_ROW:   return "row";
-        default: GGML_ASSERT(!"invalid split mode");
+        default: GGML_ABORT("invalid split mode");
     }
 }
@ -1326,7 +1326,7 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
         case SQL:
             return std::unique_ptr<printer>(new sql_printer());
     }
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
 }

 int main(int argc, char ** argv) {
@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
-        return env->NewStringUTF("");
+        return nullptr;
     }

     auto new_token_chars = llama_token_to_piece(context, new_token_id);
@ -26,11 +26,12 @@ actor LlamaContext {
     private var context: OpaquePointer
     private var batch: llama_batch
     private var tokens_list: [llama_token]
+    var is_done: Bool = false

     /// This variable is used to store temporarily invalid cchars
     private var temporary_invalid_cchars: [CChar]

-    var n_len: Int32 = 64
+    var n_len: Int32 = 1024
     var n_cur: Int32 = 0

     var n_decode: Int32 = 0
@ -160,6 +161,7 @@ actor LlamaContext {
         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")
+            is_done = true
             let new_token_str = String(cString: temporary_invalid_cchars + [0])
             temporary_invalid_cchars.removeAll()
             return new_token_str
@ -132,7 +132,7 @@ class LlamaState: ObservableObject {
         messageLog += "\(text)"

         Task.detached {
-            while await llamaContext.n_cur < llamaContext.n_len {
+            while await !llamaContext.is_done {
                 let result = await llamaContext.completion_loop()
                 await MainActor.run {
                     self.messageLog += "\(result)"
@ -869,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = peg_0;
         }
         else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
@@ -124,6 +124,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
     auto formatted = llama_chat_format_single(
         model, g_params->chat_template, chat_msgs, new_msg, role == "user");
     chat_msgs.push_back({role, content});
+    LOG("formatted: %s\n", formatted.c_str());
     return formatted;
 }
 
examples/pydantic_models_to_grammar_examples.py — 343 changes (Normal file → Executable file)
@@ -1,8 +1,15 @@
-# Function calling example using pydantic models.
+#!/usr/bin/env python3
+
+"""Function calling example using pydantic models."""
 
 from __future__ import annotations
 
+import argparse
 import datetime
 import json
+import logging
+import textwrap
+import sys
 from enum import Enum
 from typing import Optional, Union
 
@@ -12,30 +19,54 @@ from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert
     create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
 
 
-# Function to get completion on the llama.cpp server with grammar.
-def create_completion(prompt, grammar):
+def create_completion(host, prompt, gbnf_grammar):
+    """Calls the /completion API on llama-server.
+
+    See
+    https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
+    """
+    print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}")
     headers = {"Content-Type": "application/json"}
-    data = {"prompt": prompt, "grammar": grammar}
-    response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data)
-    data = response.json()
-
+    data = {"prompt": prompt, "grammar": gbnf_grammar}
+    result = requests.post(f"http://{host}/completion", headers=headers, json=data).json()
     assert data.get("error") is None, data
-
-    print(data["content"])
-    return data["content"]
+    logging.info("Result: %s", result)
+    content = result["content"]
+    print(f" Model: {result['model']}")
+    print(f" Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), ' ')}")
+    return content
 
 
 # A function for the agent to send a message to the user.
 class SendMessageToUser(BaseModel):
-    """
-    Send a message to the User.
-    """
+    """Send a message to the User."""
     chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.")
     message: str = Field(..., description="Message you want to send to the user.")
 
     def run(self):
-        print(self.message)
+        print(f"SendMessageToUser: {self.message}")
+
+
+def example_rce(host):
+    """Minimal test case where the LLM call an arbitrary python function."""
+    print("- example_rce")
+    tools = [SendMessageToUser]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+        pydantic_model_list=tools, outer_object_name="function",
+        outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
+    system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
+    user_message = "What is 42 * 42?"
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    tools_map = {tool.__name__:tool for tool in tools}
+    # This finds "SendMessageToUser":
+    tool = tools_map.get(json_data["function"])
+    if not tool:
+        print(f"Error: unknown tool {json_data['function']}")
+        return 1
+    tool(**json_data["function_parameters"]).run()
+    return 0
 
 
 # Enum for the calculator tool.
@@ -46,11 +77,11 @@ class MathOperation(Enum):
     DIVIDE = "divide"
 
 
-# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
+# Simple pydantic calculator tool for the agent that can add, subtract,
+# multiply, and divide. Docstring and description of fields will be used in
+# system prompt.
 class Calculator(BaseModel):
-    """
-    Perform a math operation on two numbers.
-    """
+    """Perform a math operation on two numbers."""
     number_one: Union[int, float] = Field(..., description="First number.")
     operation: MathOperation = Field(..., description="Math operation to perform.")
     number_two: Union[int, float] = Field(..., description="Second number.")
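Editorial aside, not part of the commit: the Calculator tool above can be exercised directly, with no model or server in the loop. The sketch below assumes the examples directory is importable and that MathOperation has a member whose value is "multiply", which the expected output later in this file implies; the removed inline comment in this hunk notes the product should be 1764.

```python
# Hypothetical standalone check of the Calculator tool (editorial sketch).
# Assumes examples/ is on sys.path so the rewritten script can be imported.
from pydantic_models_to_grammar_examples import Calculator, MathOperation

calc = Calculator(
    number_one=42,
    operation=MathOperation("multiply"),  # construct the enum member by value
    number_two=42,
)
print(calc.run())  # expected to print 1764, per the removed comment in this hunk
```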
@@ -68,55 +99,61 @@ class Calculator(BaseModel):
         raise ValueError("Unknown operation.")
 
 
-# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM.
-# pydantic_model_list is the list of pydanitc models
-# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated
-# outer_object_content is the name of outer object content.
-# model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
-# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
-gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
-    pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function",
-    outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
-
-print(gbnf_grammar)
-print(documentation)
-
-system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
-
-user_message = "What is 42 * 42?"
-prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant"
-
-text = create_completion(prompt=prompt, grammar=gbnf_grammar)
-# This should output something like this:
-# {
-#   "function": "calculator",
-#   "function_parameters": {
-#     "number_one": 42,
-#     "operation": "multiply",
-#     "number_two": 42
-#   }
-# }
-function_dictionary = json.loads(text)
-if function_dictionary["function"] == "calculator":
-    function_parameters = {**function_dictionary["function_parameters"]}
-
-    print(Calculator(**function_parameters).run())
-    # This should output: 1764
+def example_calculator(host):
+    """Have the LLM ask to get a calculation done.
+
+    Here the grammar gets generated by passing the available function models to
+    generate_gbnf_grammar_and_documentation function. This also generates a
+    documentation usable by the LLM.
+
+    pydantic_model_list is the list of pydantic models outer_object_name is an
+    optional name for an outer object around the actual model object. Like a
+    "function" object with "function_parameters" which contains the actual model
+    object. If None, no outer object will be generated outer_object_content is
+    the name of outer object content.
+
+    model_prefix is the optional prefix for models in the documentation. (Default="Output Model")
+    fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields")
+    """
+    print("- example_calculator")
+    tools = [SendMessageToUser, Calculator]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+        pydantic_model_list=tools, outer_object_name="function",
+        outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters")
+    system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation
+    user_message1 = "What is 42 * 42?"
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    expected = {
+        "function": "Calculator",
+        "function_parameters": {
+            "number_one": 42,
+            "operation": "multiply",
+            "number_two": 42
+        }
+    }
+    if json_data != expected:
+        print(" Result is not as expected!")
+    tools_map = {tool.__name__:tool for tool in tools}
+    # This finds "Calculator":
+    tool = tools_map.get(json_data["function"])
+    if not tool:
+        print(f"Error: unknown tool {json_data['function']}")
+        return 1
+    result = tool(**json_data["function_parameters"]).run()
+    print(f" Call {json_data['function']} gave result {result}")
+    return 0
 
 
-# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text.
 class Category(Enum):
-    """
-    The category of the book.
-    """
+    """The category of the book."""
     Fiction = "Fiction"
     NonFiction = "Non-Fiction"
 
 
 class Book(BaseModel):
-    """
-    Represents an entry about a book.
-    """
+    """Represents an entry about a book."""
     title: str = Field(..., description="Title of the book.")
     author: str = Field(..., description="Author of the book.")
     published_year: Optional[int] = Field(..., description="Publishing year of the book.")
@@ -125,33 +162,42 @@ class Book(BaseModel):
     summary: str = Field(..., description="Summary of the book.")
 
 
-# We need no additional parameters other than our list of pydantic models.
-gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book])
-
-system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
-
-text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
-prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
-
-text = create_completion(prompt=prompt, grammar=gbnf_grammar)
-
-json_data = json.loads(text)
-
-print(Book(**json_data))
-# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
+def example_struct(host):
+    """A example structured output based on pydantic models.
+
+    The LLM will create an entry for a Book database out of an unstructured
+    text. We need no additional parameters other than our list of pydantic
+    models.
+    """
+    print("- example_struct")
+    tools = [Book]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools)
+    system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
+    text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    # In this case, there's no function nor function_parameters.
+    # Here the result will vary based on the LLM used.
+    keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
+    if keys != sorted(json_data.keys()):
+        print(f"Unexpected result: {sorted(json_data.keys())}")
+        return 1
+    book = Book(**json_data)
+    print(f" As a Book object: %s" % book)
+    return 0
 
 
 def get_current_datetime(output_format: Optional[str] = None):
-    """
-    Get the current date and time in the given format.
+    """Get the current date and time in the given format.
+
     Args:
         output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
     """
-    if output_format is None:
-        output_format = '%Y-%m-%d %H:%M:%S'
-    return datetime.datetime.now().strftime(output_format)
+    return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S")
 
 
-# Example function to get the weather
+# Example function to get the weather.
 def get_current_weather(location, unit):
     """Get the current weather in a given location"""
     if "London" in location:
@@ -160,68 +206,107 @@ def get_current_weather(location, unit):
         return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
     elif "North Pole" in location:
         return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
-    else:
-        return json.dumps({"location": location, "temperature": "unknown"})
+    return json.dumps({"location": location, "temperature": "unknown"})
 
 
-# Here is a function definition in OpenAI style
-current_weather_tool = {
-    "type": "function",
-    "function": {
-        "name": "get_current_weather",
-        "description": "Get the current weather in a given location",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "location": {
-                    "type": "string",
-                    "description": "The city and state, e.g. San Francisco, CA",
-                },
-                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
-            },
-            "required": ["location"],
-        },
-    },
-}
-
-# Convert OpenAI function definition into pydantic model
-current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
-# Add the actual function to a pydantic model
-current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
-
-# Convert normal Python function to a pydantic model
-current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
-
-tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
-
-gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
-    pydantic_model_list=tool_list, outer_object_name="function",
-    outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
-
-system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
-
-text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
-prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
-
-text = create_completion(prompt=prompt, grammar=gbnf_grammar)
-
-json_data = json.loads(text)
-
-print(json_data)
-# Should output something like this:
-# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
-
-for call in json_data:
-    if call["function"] == "Calculator":
-        print(Calculator(**call["params"]).run())
-    elif call["function"] == "get_current_datetime":
-        print(current_datetime_model(**call["params"]).run())  # pyright: ignore[reportAttributeAccessIssue]
-    elif call["function"] == "get_current_weather":
-        print(current_weather_tool_model(**call["params"]).run())  # pyright: ignore[reportAttributeAccessIssue]
-# Should output something like this:
-# 2024-01-14 13:36:06
-# {"location": "London", "temperature": "42", "unit": "celsius"}
-# 1764
+def example_concurrent(host):
+    """An example for parallel function calling with a Python function, a pydantic
+    function model and an OpenAI like function definition.
+    """
+    print("- example_concurrent")
+    # Function definition in OpenAI style.
+    current_weather_tool = {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                },
+                "required": ["location"],
+            },
+        },
+    }
+    # Convert OpenAI function definition into pydantic model.
+    current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
+    # Add the actual function to a pydantic model.
+    current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
+
+    # Convert normal Python function to a pydantic model.
+    current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
+
+    tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+        pydantic_model_list=tools, outer_object_name="function",
+        outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
+    system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
+    text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    expected = [
+        {
+            "function": "get_current_datetime",
+            "params": {
+                "output_format": "%Y-%m-%d %H:%M:%S"
+            }
+        },
+        {
+            "function": "get_current_weather",
+            "params": {
+                "location": "London",
+                "unit": "celsius"
+            }
+        },
+        {
+            "function": "Calculator",
+            "params": {
+                "number_one": 42,
+                "operation": "multiply",
+                "number_two": 42
+            }
+        }
+    ]
+    res = 0
+    if json_data != expected:
+        print(" Result is not as expected!")
+        print(" This can happen on highly quantized models")
+        res = 1
+    tools_map = {tool.__name__:tool for tool in tools}
+    for call in json_data:
+        tool = tools_map.get(call["function"])
+        if not tool:
+            print(f"Error: unknown tool {call['function']}")
+            return 1
+        result = tool(**call["params"]).run()
+        print(f" Call {call['function']} returned {result}")
+    # Should output something like this:
+    # Call get_current_datetime returned 2024-07-15 09:50:38
+    # Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"}
+    # Call Calculator returned 1764
+    return res
+
+
+def main():
+    parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
+    parser.add_argument("--host", default="localhost:8080", help="llama.cpp server")
+    parser.add_argument("-v", "--verbose", action="store_true", help="enables logging")
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR)
+    ret = 0
+    # Comment out below to only run the example you want.
+    ret = ret or example_rce(args.host)
+    ret = ret or example_calculator(args.host)
+    ret = ret or example_struct(args.host)
+    ret = ret or example_concurrent(args.host)
+    return ret
+
+
+if __name__ == "__main__":
+    sys.exit(main())
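Editorial usage note, not part of the commit: with this rewrite the script is an executable driven by main() and an optional --host flag, so it can also be imported and one example run at a time. A minimal sketch, assuming it is executed from the repository root with requests and pydantic installed and a llama-server already listening on the default localhost:8080:

```python
# Editorial sketch: run a single example from the rewritten script programmatically.
import sys

sys.path.insert(0, "examples")  # assumption: executed from the llama.cpp repo root
import pydantic_models_to_grammar_examples as examples

rc = examples.example_calculator("localhost:8080")  # default host used by main()
sys.exit(rc)  # 0 on success, 1 when the returned tool call cannot be resolved
```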
@@ -47,7 +47,7 @@ int main(int argc, char ** argv) {
     // save state (rng, logits, embedding and kv_cache) to file
     {
         std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
-        const size_t written = llama_state_get_data(ctx, state_mem.data());
+        const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size());
 
         FILE *fp_write = fopen("dump_state.bin", "wb");
         fwrite(state_mem.data(), 1, written, fp_write);
@@ -99,13 +99,16 @@ int main(int argc, char ** argv) {
 
     // load state (rng, logits, embedding and kv_cache) from file
     {
-        std::vector<uint8_t> state_mem(llama_state_get_size(ctx2));
+        std::vector<uint8_t> state_mem;
 
         FILE * fp_read = fopen("dump_state.bin", "rb");
+        fseek(fp_read, 0, SEEK_END);
+        state_mem.resize(ftell(fp_read));
+        fseek(fp_read, 0, SEEK_SET);
         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
         fclose(fp_read);
 
-        if (read != llama_state_set_data(ctx2, state_mem.data())) {
+        if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
             llama_free(ctx2);
             llama_free_model(model);
@@ -159,13 +162,16 @@ int main(int argc, char ** argv) {
 
     // load state (rng, logits, embedding and kv_cache) from file
     {
-        std::vector<uint8_t> state_mem(llama_state_get_size(ctx3));
+        std::vector<uint8_t> state_mem;
 
        FILE * fp_read = fopen("dump_state.bin", "rb");
+        fseek(fp_read, 0, SEEK_END);
+        state_mem.resize(ftell(fp_read));
+        fseek(fp_read, 0, SEEK_SET);
         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
         fclose(fp_read);
 
-        if (read != llama_state_set_data(ctx3, state_mem.data())) {
+        if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
             llama_free(ctx3);
             llama_free_model(model);
@@ -182,7 +188,7 @@ int main(int argc, char ** argv) {
     {
         // save kv of seq 0
         std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
-        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0);
+        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0);
         if (ncopy != seq_store.size()) {
             fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
             llama_free(ctx3);
@@ -196,7 +202,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s : kv cache cleared\n", __func__);
 
     // restore kv into seq 1
-    const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1);
+    const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1);
     if (nset != seq_store.size()) {
         fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
         llama_free(ctx3);
@@ -5,7 +5,7 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/
 Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 
 **Features:**
- * LLM inference of F16 and quantum models on GPU and CPU
+ * LLM inference of F16 and quantized models on GPU and CPU
  * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
  * Parallel decoding with multi-user support
  * Continuous batching
|
||||||
--host HOST ip address to listen (default: 127.0.0.1)
|
--host HOST ip address to listen (default: 127.0.0.1)
|
||||||
--port PORT port to listen (default: 8080)
|
--port PORT port to listen (default: 8080)
|
||||||
--path PATH path to serve static files from (default: )
|
--path PATH path to serve static files from (default: )
|
||||||
--embedding(s) enable embedding endpoint (default: disabled)
|
--embedding(s) restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
|
||||||
--api-key KEY API key to use for authentication (default: none)
|
--api-key KEY API key to use for authentication (default: none)
|
||||||
--api-key-file FNAME path to file containing API keys (default: none)
|
--api-key-file FNAME path to file containing API keys (default: none)
|
||||||
--ssl-key-file FNAME path to file a PEM-encoded SSL private key
|
--ssl-key-file FNAME path to file a PEM-encoded SSL private key
|
||||||
|
@@ -444,7 +444,7 @@ node index.js
 
 `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
 
-`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
+`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
 By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
 
 `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
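Editorial illustration of the parameters documented above, not part of the commit: a /completion request might combine them as follows. It assumes a llama-server instance listening on localhost:8080 and mirrors the request/response shape used by the pydantic example elsewhere in this commit.

```python
# Editorial sketch of a /completion request using n_predict, n_keep and stream.
import requests

resp = requests.post(
    "http://localhost:8080/completion",  # assumption: local llama-server
    json={
        "prompt": "Building a website can be done in 10 simple steps:",
        "n_predict": 64,   # stop after at most 64 generated tokens (-1 = infinity)
        "n_keep": -1,      # keep all prompt tokens (excluding BOS) on context overflow
        "stream": False,   # single JSON response instead of streamed tokens
    },
)
data = resp.json()
assert data.get("error") is None, data
print(data["content"])
```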
@@ -225,7 +225,7 @@
       throw new Error("already running");
     }
     controller.value = new AbortController();
-    for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
+    for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
       const data = chunk.data;
       if (data.stop) {
         while (
@@ -1,5 +1,4 @@
 <html>
-
 <head>
   <meta charset="UTF-8">
   <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
@@ -132,12 +131,20 @@
       align-items: stretch;
     }
 
-    .right {
+    .message-controls {
       display: flex;
-      flex-direction: row;
-      gap: 0.5em;
       justify-content: flex-end;
     }
+    .message-controls > div:nth-child(2) {
+      display: flex;
+      flex-direction: column;
+      gap: 0.5em;
+    }
+    .message-controls > div:nth-child(2) > div {
+      display: flex;
+      margin-left: auto;
+      gap: 0.5em;
+    }
 
     fieldset {
       border: none;
@@ -276,6 +283,7 @@
 
       import { llama } from './completion.js';
       import { SchemaConverter } from './json-schema-to-grammar.mjs';
+
       let selected_image = false;
       var slot_id = -1;
 
@@ -447,6 +455,9 @@
 
     /* END: Support for storing prompt templates and parameters in browsers LocalStorage */
 
+    const tts = window.speechSynthesis;
+    const ttsVoice = signal(null)
+
     const llamaStats = signal(null)
     const controller = signal(null)
 
@@ -479,7 +490,7 @@
         throw new Error("already running");
       }
       controller.value = new AbortController();
-      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: URL.parse('.', document.baseURI).href })) {
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: new URL('.', document.baseURI).href })) {
         const data = chunk.data;
 
         if (data.stop) {
@@ -596,8 +607,51 @@
       });
     }
 
+    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
+    const talkRecognition = SpeechRecognition ? new SpeechRecognition() : null;
     function MessageInput() {
-      const message = useSignal("")
+      const message = useSignal("");
+
+      const talkActive = useSignal(false);
+      const sendOnTalk = useSignal(false);
+      const talkStop = (e) => {
+        if (e) e.preventDefault();
+
+        talkActive.value = false;
+        talkRecognition?.stop();
+      }
+      const talk = (e) => {
+        e.preventDefault();
+
+        if (talkRecognition)
+          talkRecognition.start();
+        else
+          alert("Speech recognition is not supported by this browser.");
+      }
+      if(talkRecognition) {
+        talkRecognition.onstart = () => {
+          talkActive.value = true;
+        }
+        talkRecognition.onresult = (e) => {
+          if (event.results.length > 0) {
+            message.value = event.results[0][0].transcript;
+            if (sendOnTalk.value) {
+              submit(e);
+            }
+          }
+        }
+        talkRecognition.onspeechend = () => {
+          talkStop();
+        }
+      }
+
+      const ttsVoices = useSignal(tts?.getVoices() || []);
+      const ttsVoiceDefault = computed(() => ttsVoices.value.find(v => v.default));
+      if (tts) {
+        tts.onvoiceschanged = () => {
+          ttsVoices.value = tts.getVoices();
+        }
+      }
 
       const submit = (e) => {
         stop(e);
@@ -624,11 +678,45 @@
             value="${message}"
           />
         </div>
-        <div class="right">
-          <button type="submit" disabled=${generating.value}>Send</button>
-          <button onclick=${uploadImage}>Upload Image</button>
-          <button onclick=${stop} disabled=${!generating.value}>Stop</button>
-          <button onclick=${reset}>Reset</button>
+        <div class="message-controls">
+          <div> </div>
+          <div>
+            <div>
+              <button type="submit" disabled=${generating.value || talkActive.value}>Send</button>
+              <button disabled=${generating.value || talkActive.value} onclick=${uploadImage}>Upload Image</button>
+              <button onclick=${stop} disabled=${!generating.value}>Stop</button>
+              <button onclick=${reset}>Reset</button>
+            </div>
+            <div>
+              <a href="#" style="cursor: help;" title="Help" onclick=${e => {
+                e.preventDefault();
+                alert(`STT supported by your browser: ${SpeechRecognition ? 'Yes' : 'No'}\n` +
+                  `(TTS and speech recognition are not provided by llama.cpp)\n` +
+                  `Note: STT requires HTTPS to work.`);
+              }}>[?]</a>
+              <button disabled=${generating.value} onclick=${talkActive.value ? talkStop : talk}>${talkActive.value ? "Stop Talking" : "Talk"}</button>
+              <div>
+                <input type="checkbox" id="send-on-talk" name="send-on-talk" checked="${sendOnTalk}" onchange=${(e) => sendOnTalk.value = e.target.checked} />
+                <label for="send-on-talk" style="line-height: initial;">Send after talking</label>
+              </div>
+            </div>
+            <div>
+              <a href="#" style="cursor: help;" title="Help" onclick=${e => {
+                e.preventDefault();
+                alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`);
+              }}>[?]</a>
+              <label for="tts-voices" style="line-height: initial;">Bot Voice:</label>
+              <select id="tts-voices" name="tts-voices" onchange=${(e) => ttsVoice.value = e.target.value} style="max-width: 100px;">
+                <option value="" selected="${!ttsVoice.value}">None</option>
+                ${[
+                  ...(ttsVoiceDefault.value ? [ttsVoiceDefault.value] : []),
+                  ...ttsVoices.value.filter(v => !v.default),
+                ].map(
+                  v => html`<option value="${v.name}" selected="${ttsVoice.value === v.name}">${v.name} (${v.lang}) ${v.default ? '(default)' : ''}</option>`
+                )}
+              </select>
+            </div>
+          </div>
         </div>
       </form>
     `
@@ -659,26 +747,86 @@
       }
     }, [messages])
 
+    const ttsChatLineActiveIx = useSignal(undefined);
+    const ttsChatLine = (e, ix, msg) => {
+      if (e) e.preventDefault();
+
+      if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return;
+
+      const ttsVoices = tts.getVoices();
+      const voice = ttsVoices.find(v => v.name === ttsVoice.value);
+      if (!voice) return;
+
+      if (ttsChatLineActiveIx.value !== undefined) {
+        tts.cancel();
+        if (ttsChatLineActiveIx.value === ix) {
+          ttsChatLineActiveIx.value = undefined;
+          return;
+        }
+      }
+
+      ttsChatLineActiveIx.value = ix;
+      let ttsUtter = new SpeechSynthesisUtterance(msg);
+      ttsUtter.voice = voice;
+      ttsUtter.onend = e => {
+        ttsChatLineActiveIx.value = undefined;
+      };
+      tts.speak(ttsUtter);
+    }
+
     const isCompletionMode = session.value.type === 'completion'
 
+    // Try play the last bot message
+    const lastCharChatLinesIxs = useSignal([]);
+    const lastCharChatLinesIxsOld = useSignal([]);
+    useEffect(() => {
+      if (
+        !isCompletionMode
+        && lastCharChatLinesIxs.value.length !== lastCharChatLinesIxsOld.value.length
+        && !generating.value
+      ) {
+        const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1];
+        if (ix !== undefined) {
+          const msg = messages[ix];
+          ttsChatLine(null, ix, Array.isArray(msg) ? msg[1].map(m => m.content).join('') : msg);
+        }
+
+        lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value);
+      }
+    }, [generating.value]);
+
     const chatLine = ([user, data], index) => {
       let message
-      const isArrayMessage = Array.isArray(data)
+      const isArrayMessage = Array.isArray(data);
+      const text = isArrayMessage ?
+        data.map(msg => msg.content).join('') :
+        data;
       if (params.value.n_probs > 0 && isArrayMessage) {
         message = html`<${Probabilities} data=${data} />`
       } else {
-        const text = isArrayMessage ?
-          data.map(msg => msg.content).join('') :
-          data;
         message = isCompletionMode ?
           text :
           html`<${Markdownish} text=${template(text)} />`
       }
+
+      const fromBot = user && user === '{{char}}';
+      if (fromBot && !lastCharChatLinesIxs.value.includes(index))
+        lastCharChatLinesIxs.value.push(index);
+
       if (user) {
-        return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
+        return html`
+          <div>
+            <p key=${index}><strong>${template(user)}:</strong> ${message}</p>
+            ${
+              fromBot && ttsVoice.value
+              && html`<button disabled=${generating.value} onclick=${e => ttsChatLine(e, index, text)} aria-label=${ttsChatLineActiveIx.value === index ? 'Pause' : 'Play'}>${ ttsChatLineActiveIx.value === index ? '⏸️' : '▶️' }</div>`
+            }
+          </div>
+        `;
       } else {
         return isCompletionMode ?
           html`<span key=${index}>${message}</span>` :
-          html`<p key=${index}>${message}</p>`
+          html`<div><p key=${index}>${message}</p></div>`
       }
     };
 
@@ -900,7 +900,7 @@ struct server_context {
 
         slot.params.stream       = json_value(data, "stream", false);
         slot.params.cache_prompt = json_value(data, "cache_prompt", false);
-        slot.params.n_predict    = json_value(data, "n_predict", default_params.n_predict);
+        slot.params.n_predict    = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
         slot.sparams.top_k       = json_value(data, "top_k", default_sparams.top_k);
         slot.sparams.top_p       = json_value(data, "top_p", default_sparams.top_p);
         slot.sparams.min_p       = json_value(data, "min_p", default_sparams.min_p);
@@ -355,24 +355,6 @@ static json oaicompat_completion_params_parse(
 
     llama_params["__oaicompat"] = true;
 
-    // Map OpenAI parameters to llama.cpp parameters
-    //
-    // For parameters that are defined by the OpenAI documentation (e.g.
-    // temperature), we explicitly specify OpenAI's intended default; we
-    // need to do that because sometimes OpenAI disagrees with llama.cpp
-    //
-    // https://platform.openai.com/docs/api-reference/chat/create
-    llama_sampling_params default_sparams;
-    llama_params["model"]             = json_value(body, "model", std::string("unknown"));
-    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
-    llama_params["logit_bias"]        = json_value(body, "logit_bias", json::object());
-    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
-    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
-    llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
-    llama_params["stream"]            = json_value(body, "stream", false);
-    llama_params["temperature"]       = json_value(body, "temperature", 1.0);
-    llama_params["top_p"]             = json_value(body, "top_p", 1.0);
-
     // Apply chat template to the list of messages
     llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
 
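Editorial note on the two server changes above, not part of the commit: with n_predict now falling back to max_tokens in the slot parameter parsing, an OpenAI-style request can bound generation length without llama.cpp-specific fields. A minimal sketch, assuming a local llama-server exposing the OpenAI-compatible chat completions route mentioned in its README:

```python
# Editorial sketch: OpenAI-style "max_tokens" is honored via the n_predict fallback.
import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",  # assumption: default host/port
    json={
        "messages": [{"role": "user", "content": "What is 42 * 42?"}],
        "max_tokens": 32,     # picked up as n_predict by the slot parameter parsing
        "temperature": 0.7,
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```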
@@ -163,7 +163,7 @@ static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
         printf(">");
         return;
     }
-    GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
+    GGML_ABORT("MultiByteToWideChar() failed in an unexpected way.");
 }
 
 LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));
@@ -1,5 +0,0 @@
-set(TARGET llama-train-text-from-scratch)
-add_executable(${TARGET} train-text-from-scratch.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -1,27 +0,0 @@
-# train-text-from-scratch
-
-Basic usage instructions:
-
-```bash
-# get training data
-wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt
-
-# train
-./bin/llama-train-text-from-scratch \
-        --vocab-model ../models/ggml-vocab-llama.gguf \
-        --ctx 64 --embd 256 --head 8 --layer 16 \
-        --checkpoint-in  chk-shakespeare-256x16-LATEST.gguf \
-        --checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \
-        --model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \
-        --train-data "shakespeare.txt" \
-        -t 6 -b 16 --seed 1 --adam-iter 256 \
-        --no-checkpointing
-
-# predict
-./bin/llama-cli -m ggml-shakespeare-256x16-f32.gguf
-```
-
-Output files will be saved every N iterations (config with `--save-every N`).
-The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output.
-
-To train GGUF models just pass them to `--checkpoint-in FN`.
@ -1,499 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
# train-text-from-scratch checkpoint --> gguf conversion
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
import struct
|
|
||||||
import sys
|
|
||||||
import numpy as np
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
|
||||||
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py'))
|
|
||||||
import gguf
|
|
||||||
|
|
||||||
# gguf constants
|
|
||||||
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
|
|
||||||
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
|
|
||||||
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
|
|
||||||
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
|
|
||||||
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
|
|
||||||
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
|
|
||||||
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
|
|
||||||
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
|
|
||||||
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
|
|
||||||
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
|
|
||||||
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
|
|
||||||
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
|
|
||||||
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
|
|
||||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
|
|
||||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
|
|
||||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
|
|
||||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
|
|
||||||
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
|
|
||||||
|
|
||||||
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
|
|
||||||
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
|
|
||||||
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
|
|
||||||
|
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
|
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
|
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
|
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
|
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
|
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
|
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
|
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
|
||||||
|
|
||||||
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
|
|
||||||
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
|
|
||||||
LLM_KV_TRAINING_TYPE = "training.type"
|
|
||||||
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
|
||||||
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
|
||||||
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
|
||||||
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
|
||||||
|
|
||||||
class Tensor:
|
|
||||||
def __init__(self, dtype='f', ne=None):
|
|
||||||
if ne is None:
|
|
||||||
ne = []
|
|
||||||
self.dtype = dtype
|
|
||||||
self.ne = ne
|
|
||||||
self.nbytes = 0
|
|
||||||
if self.dtype == 'f':
|
|
||||||
if len(self.ne) == 0:
|
|
||||||
self.nbytes = 0
|
|
||||||
else:
|
|
||||||
self.nbytes = int(np.prod(self.ne)) * 4
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
|
||||||
|
|
||||||
def load(self, data, offset):
|
|
||||||
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
|
||||||
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
|
||||||
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
|
||||||
|
|
||||||
assert(nd == len(self.ne))
|
|
||||||
ne = []
|
|
||||||
for d in range(nd):
|
|
||||||
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
|
||||||
ne.append(n)
|
|
||||||
|
|
||||||
assert(tuple(ne) == tuple(self.ne))
|
|
||||||
|
|
||||||
if self.dtype == 'f':
|
|
||||||
assert(dtype == 0)
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
|
||||||
|
|
||||||
self.name = bytes(data[offset:offset+namelen]); offset += namelen
|
|
||||||
# 32-byte alignment
|
|
||||||
offset += (0 - offset) & 31
|
|
||||||
self.data = data[offset:offset+self.nbytes]
|
|
||||||
offset += self.nbytes
|
|
||||||
return offset
|
|
||||||
|
|
||||||
def max_storage_size(self):
|
|
||||||
result = 0
|
|
||||||
result += 4 # nd
|
|
||||||
result += 4 # namelen
|
|
||||||
result += 4 # dtype
|
|
||||||
result += len(self.ne)*8 # ne
|
|
||||||
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
|
|
||||||
result += 31 # 32-byte alignment
|
|
||||||
result += self.nbytes
|
|
||||||
return result
|
|
||||||
|
|
||||||
def save_gguf(self, gguf_writer, name):
|
|
||||||
gguf_writer.add_tensor(
|
|
||||||
name=name,
|
|
||||||
tensor=self.data,
|
|
||||||
raw_shape=np.array(list(reversed(self.ne))),
|
|
||||||
raw_dtype=gguf.GGMLQuantizationType.F32)
|
|
||||||
|
|
class OptimizationParamsV0:
    def __init__(self):
        pass

    def load(self, data, offset):
        self.type = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_threads = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.delta = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.print_forward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
        self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
        self.adam_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_sched = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_decay = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_alpha = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_beta1 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_beta2 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_eps_f = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.adam_eps_g = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_ftol = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_wolfe = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_min_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_max_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.lbfgs_linesearch = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        return offset

class OptimizationContext:
    def __init__(self):
        pass

    def load(self, data, offset):
        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
        offset += 4

        if self.version == 0:
            params = OptimizationParamsV0()
            offset = params.load(data, offset)
            self.past = params.past
            self.lbfgs_m = params.lbfgs_m
            self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
            self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
            self.type = params.type

            self.adam_m = Tensor('f', [self.nx])
            self.adam_v = Tensor('f', [self.nx])
            self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])

            self.lbfgs_x = Tensor('f', [self.nx])
            self.lbfgs_xp = Tensor('f', [self.nx])
            self.lbfgs_g = Tensor('f', [self.nx])
            self.lbfgs_gp = Tensor('f', [self.nx])
            self.lbfgs_d = Tensor('f', [self.nx])
            self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
            self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
            self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
            self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
            self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])

            if self.type == 0:
                # these tensors are stored, but we don't need their data
                x = Tensor('f', [self.nx])
                g = Tensor('f', [self.nx])
                g2 = Tensor('f', [self.nx])
                mh = Tensor('f', [self.nx])
                vh = Tensor('f', [self.nx])

                offset = x.load(data, offset)
                offset = g.load(data, offset)
                offset = g2.load(data, offset)
                offset = self.adam_m.load(data, offset)
                offset = self.adam_v.load(data, offset)
                offset = mh.load(data, offset)
                offset = vh.load(data, offset)
                offset = self.adam_pf.load(data, offset)

                self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4

            elif self.type == 1:
                offset = self.lbfgs_x.load(data, offset)
                offset = self.lbfgs_xp.load(data, offset)
                offset = self.lbfgs_g.load(data, offset)
                offset = self.lbfgs_gp.load(data, offset)
                offset = self.lbfgs_d.load(data, offset)
                offset = self.lbfgs_pf.load(data, offset)
                offset = self.lbfgs_lmal.load(data, offset)
                offset = self.lbfgs_lmys.load(data, offset)
                offset = self.lbfgs_lms.load(data, offset)
                offset = self.lbfgs_lmy.load(data, offset)

                self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4

            else:
                raise ValueError('Unknown optimizer type')


        elif self.version == 1:
            self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
            self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
            self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4

            self.adam_m = Tensor('f', [self.nx])
            self.adam_v = Tensor('f', [self.nx])
            self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])

            self.lbfgs_x = Tensor('f', [self.nx])
            self.lbfgs_xp = Tensor('f', [self.nx])
            self.lbfgs_g = Tensor('f', [self.nx])
            self.lbfgs_gp = Tensor('f', [self.nx])
            self.lbfgs_d = Tensor('f', [self.nx])
            self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
            self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
            self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
            self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
            self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])

            # forgot to save type in version 1:
            # guess self.type from number of remaining bytes
            size_type_0 = 12 + sum([t.max_storage_size() for t in
                                    [self.adam_m, self.adam_v]
                                    +([self.adam_pf] if (self.past > 0) else [])])
            size_type_1 = 24 + sum([t.max_storage_size() for t in
                                    [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
                                     self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
                                     self.lbfgs_lmal, self.lbfgs_lmys,
                                     self.lbfgs_lms, self.lbfgs_lmy]
                                    +([self.lbfgs_pf] if (self.past > 0) else [])])
            # due to alignment padding the size might not by exact
            # but the difference in size for both types is significant,
            # so we can just use whichever is closest
            remaining = len(data) - offset
            if abs(remaining - size_type_0) < abs(remaining - size_type_1):
                self.type = 0
            else:
                self.type = 1

            if self.type == 0:
                offset = self.adam_m.load(data, offset)
                offset = self.adam_v.load(data, offset)
                offset = self.adam_pf.load(data,offset)

                self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4

            elif self.type == 1:
                offset = self.lbfgs_x.load(data, offset)
                offset = self.lbfgs_xp.load(data, offset)
                offset = self.lbfgs_g.load(data, offset)
                offset = self.lbfgs_gp.load(data, offset)
                offset = self.lbfgs_d.load(data, offset)
                offset = self.lbfgs_pf.load(data, offset)
                offset = self.lbfgs_lmal.load(data, offset)
                offset = self.lbfgs_lmys.load(data, offset)
                offset = self.lbfgs_lms.load(data, offset)
                offset = self.lbfgs_lmy.load(data, offset)

                self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
                self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4

        else:
            raise ValueError('Invalid version of checkpoint file')

        return offset

    def save_gguf(self, gguf_writer):
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
        gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
        gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
        gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)

        if self.type == 0:
            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)

            self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
            self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
            if self.past > 0:
                self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)

        elif self.type == 1:
            gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
            gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
            gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
            gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)

            self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
            self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
            self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
            self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
            self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
            if self.past > 0:
                self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
            self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
            self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
            self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
            self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
        else:
            raise ValueError('Unknown optimizer type')

class ModelParams:
    def __init__(self):
        pass

    def load(self, data, offset):
        self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        return offset

    def get_n_ff(self):
        # struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
        return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult

    def save_gguf(self, gguf_writer):
        # self.n_vocab not saved
        gguf_writer.add_embedding_length(self.n_embd)
        gguf_writer.add_head_count(self.n_head)
        gguf_writer.add_block_count(self.n_layer)
        gguf_writer.add_rope_dimension_count(self.n_rot)
        gguf_writer.add_feed_forward_length(self.get_n_ff())

def tensor_name(key, bid=None):
    return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"

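# Illustrative check of ModelParams.get_n_ff() above (the numbers are made up for
# the example, they are not read from any checkpoint): with n_embd = 4096 and
# n_mult = 256,
#     ((2*(4*4096)//3 + 256 - 1)//256)*256 == 11008
# i.e. roughly 8/3 * n_embd rounded up to the next multiple of n_mult.
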
class Layer:
    def __init__(self, params, bid):
        self.bid = bid
        self.att_norm = Tensor('f', [params.n_embd])
        self.wq = Tensor('f', [params.n_embd, params.n_embd])
        self.wk = Tensor('f', [params.n_embd, params.n_embd])
        self.wv = Tensor('f', [params.n_embd, params.n_embd])
        self.wo = Tensor('f', [params.n_embd, params.n_embd])
        self.ffn_norm = Tensor('f', [params.n_embd])
        self.w1 = Tensor('f', [params.n_embd, params.get_n_ff()])
        self.w2 = Tensor('f', [params.get_n_ff(), params.n_embd])
        self.w3 = Tensor('f', [params.n_embd, params.get_n_ff()])

    def load(self, data, offset):
        offset = self.att_norm.load(data, offset)
        offset = self.wq.load(data, offset)
        offset = self.wk.load(data, offset)
        offset = self.wv.load(data, offset)
        offset = self.wo.load(data, offset)
        offset = self.ffn_norm.load(data, offset)
        offset = self.w1.load(data, offset)
        offset = self.w2.load(data, offset)
        offset = self.w3.load(data, offset)
        return offset

    def save_gguf(self, gguf_writer):
        self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
        self.wq.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid))
        self.wk.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid))
        self.wv.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid))
        self.wo.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid))
        self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid))
        self.w1.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid))
        self.w2.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid))
        self.w3.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid))

class Model:
    def __init__(self):
        self.params = ModelParams()
        self.layers = []

    def load(self, data, offset):
        offset = self.params.load(data, offset)

        self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
        self.norm = Tensor('f', [self.params.n_embd])
        self.output = Tensor('f', [self.params.n_embd, self.params.n_vocab])

        offset = self.tok_embd.load(data, offset)
        offset = self.norm.load(data, offset)
        offset = self.output.load(data, offset)

        self.layers.clear()
        for bid in range(self.params.n_layer):
            layer = Layer(self.params, bid)
            offset = layer.load(data, offset)
            self.layers.append(layer)

        return offset

    def save_gguf(self, gguf_writer):
        self.params.save_gguf(gguf_writer)

        self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
        self.norm.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
        self.output.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))

        for layer in self.layers:
            layer.save_gguf(gguf_writer)

class Checkpoint:
    def __init__(self):
        self.model = Model()
        self.opt_ctx = OptimizationContext()

    def load(self, data, offset):
        magic = bytes(reversed(data[offset:offset + 4])); offset += 4
        if magic != b'ggcp':
            raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")

        self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        if self.version != 0:
            raise ValueError('Invalid version of checkpoint file')

        self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
        self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4

        offset = self.model.load(data, offset)
        offset = self.opt_ctx.load(data, offset)

        return offset

    def save_gguf(self, gguf_writer):
        gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
        gguf_writer.add_layer_norm_rms_eps(1e-5)
        gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
        gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
        gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
        gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
        gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
        self.model.save_gguf(gguf_writer)
        self.opt_ctx.save_gguf(gguf_writer)

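# Illustrative note on Checkpoint.load above: the four magic bytes are read with
# reversed(), so a valid checkpoint file starts with b'pcgg' on disk and only
# compares equal to b'ggcp' after that reversal.
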
def handle_args():
    parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
    parser.add_argument('--input', '-i', type = Path, help = 'Input train checkpoint filename', required=True)
    parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
    return parser.parse_args()

def main():
    cfg = handle_args()
    data = np.memmap(cfg.input, mode = 'r')
    chk = Checkpoint()
    offset = 0
    offset = chk.load(data, offset)
    # we should have read all available data
    assert(offset == len(data))

    gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
    chk.save_gguf(gguf_writer)
    print(" gguf: write header")
    gguf_writer.write_header_to_file()
    print(" gguf: write metadata")
    gguf_writer.write_kv_data_to_file()
    print(" gguf: write tensors")
    gguf_writer.write_tensors_to_file()
    gguf_writer.close()

if __name__ == '__main__':
    main()

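# Example invocation of the converter above (the script file name is a placeholder,
# substitute whatever name this file is saved under in the repository):
#   python convert_train_checkpoint_to_gguf.py --input chk.bin --output chk.gguf
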
(File diff suppressed because it is too large)

flake.lock  (generated, 20 changed lines)
@@ -5,11 +5,11 @@
 "nixpkgs-lib": "nixpkgs-lib"
 },
 "locked": {
-"lastModified": 1719994518,
-"narHash": "sha256-pQMhCCHyQGRzdfAkdJ4cIWiw+JNuWsTX7f0ZYSyz0VY=",
+"lastModified": 1722555600,
+"narHash": "sha256-XOQkdLafnb/p9ij77byFQjDf5m5QYl9b2REiVClC+x4=",
 "owner": "hercules-ci",
 "repo": "flake-parts",
-"rev": "9227223f6d922fee3c7b190b2cc238a99527bbb7",
+"rev": "8471fe90ad337a8074e957b69ca4d0089218391d",
 "type": "github"
 },
 "original": {
@@ -20,11 +20,11 @@
 },
 "nixpkgs": {
 "locked": {
-"lastModified": 1720768451,
-"narHash": "sha256-EYekUHJE2gxeo2pM/zM9Wlqw1Uw2XTJXOSAO79ksc4Y=",
+"lastModified": 1722421184,
+"narHash": "sha256-/DJBI6trCeVnasdjUo9pbnodCLZcFqnVZiLUfqLH4jA=",
 "owner": "NixOS",
 "repo": "nixpkgs",
-"rev": "7e7c39ea35c5cdd002cd4588b03a3fb9ece6fad9",
+"rev": "9f918d616c5321ad374ae6cb5ea89c9e04bf3e58",
 "type": "github"
 },
 "original": {
@@ -36,14 +36,14 @@
 },
 "nixpkgs-lib": {
 "locked": {
-"lastModified": 1719876945,
-"narHash": "sha256-Fm2rDDs86sHy0/1jxTOKB1118Q0O3Uc7EC0iXvXKpbI=",
+"lastModified": 1722555339,
+"narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
 "type": "tarball",
-"url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
+"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
 },
 "original": {
 "type": "tarball",
-"url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
+"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
 }
 },
 "root": {

@@ -50,9 +50,15 @@ else()
 set(GGML_BLAS_VENDOR_DEFAULT "Generic")
 endif()

+if (CMAKE_CROSSCOMPILING)
+set(GGML_NATIVE_DEFAULT OFF)
+else()
+set(GGML_NATIVE_DEFAULT ON)
+endif()
+
 # general
 option(GGML_STATIC "ggml: static link libraries" OFF)
-option(GGML_NATIVE "ggml: enable -march=native flag" ON)
+option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
 option(GGML_LTO "ggml: enable link time optimization" OFF)
 option(GGML_CCACHE "ggml: use ccache if available" ON)

@@ -70,7 +76,7 @@ option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
 option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)

 # instruction set specific
-if (GGML_NATIVE)
+if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
 set(INS_ENB OFF)
 else()
 set(INS_ENB ON)
@@ -107,6 +113,7 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
 option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF)

 option(GGML_CUDA "ggml: use CUDA" OFF)
+option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
 option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
@@ -200,6 +207,7 @@ set(GGML_PUBLIC_HEADERS
 include/ggml-alloc.h
 include/ggml-backend.h
 include/ggml-blas.h
+include/ggml-cann.h
 include/ggml-cuda.h
 include/ggml.h
 include/ggml-kompute.h

@@ -6,6 +6,9 @@
 #ifdef GGML_USE_HIPBLAS
 #define GGML_CUDA_NAME "ROCm"
 #define GGML_CUBLAS_NAME "hipBLAS"
+#elif defined(GGML_USE_MUSA)
+#define GGML_CUDA_NAME "MUSA"
+#define GGML_CUBLAS_NAME "muBLAS"
 #else
 #define GGML_CUDA_NAME "CUDA"
 #define GGML_CUBLAS_NAME "cuBLAS"

@@ -254,18 +254,8 @@
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

-#define GGML_ASSERT(x) \
-do { \
-if (!(x)) { \
-fflush(stdout); \
-fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-ggml_print_backtrace(); \
-abort(); \
-} \
-} while (0)
-
 #ifndef NDEBUG
-#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
 #elif defined(__GNUC__)
 #define GGML_UNREACHABLE() __builtin_unreachable()
 #elif defined(_MSC_VER)
@@ -274,6 +264,17 @@
 #define GGML_UNREACHABLE() ((void) 0)
 #endif

+#ifdef __cplusplus
+#define GGML_NORETURN [[noreturn]]
+#elif defined(_MSC_VER)
+#define GGML_NORETURN __declspec(noreturn)
+#else
+#define GGML_NORETURN _Noreturn
+#endif
+
+#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
+
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
@@ -322,6 +323,9 @@
 extern "C" {
 #endif

+GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
+GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
+
 enum ggml_status {
 GGML_STATUS_ALLOC_FAILED = -2,
 GGML_STATUS_FAILED = -1,
@@ -345,6 +349,7 @@ extern "C" {
 GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
 GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
 GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
 GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);

 struct ggml_object;
@@ -636,8 +641,11 @@ extern "C" {
 GGML_CGRAPH_EVAL_ORDER_COUNT
 };

+typedef uint32_t ggml_bitset_t;
+
 struct ggml_hash_set {
 size_t size;
+ggml_bitset_t * used;
 struct ggml_tensor ** keys;
 };

@@ -651,7 +659,7 @@ extern "C" {
 struct ggml_tensor ** grads;
 struct ggml_tensor ** leafs;

-struct ggml_hash_set visited_hash_table;
+struct ggml_hash_set visited_hash_set;

 enum ggml_cgraph_eval_order order;
 };
@@ -698,8 +706,6 @@ extern "C" {
 GGML_API int64_t ggml_cycles(void);
 GGML_API int64_t ggml_cycles_per_ms(void);

-GGML_API void ggml_print_backtrace(void);
-
 // accepts a UTF-8 path, even on Windows
 GGML_API FILE * ggml_fopen(const char * fname, const char * mode);

@@ -2005,8 +2011,8 @@ extern "C" {

 // ggml_graph_plan() has to be called before ggml_graph_compute()
 // when plan.work_size > 0, caller must allocate memory for plan.work_data
 GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
 // same as ggml_graph_compute() but the work data is allocated as a part of the context
 // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
 GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
@@ -2400,6 +2406,7 @@ extern "C" {
 GGML_API int ggml_cpu_has_vsx (void);
 GGML_API int ggml_cpu_has_matmul_int8(void);
 GGML_API int ggml_cpu_has_cann (void);
+GGML_API int ggml_cpu_has_llamafile (void);

 //
 // Internal types and functions exposed for tests and benchmarks

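The header hunks above replace the old GGML_ASSERT macro with a printf-style GGML_ABORT() built on ggml_abort(). A minimal sketch of how the two macros compose, based only on the declarations shown here (the surrounding function is hypothetical, not part of the commit):

    // hypothetical helper; only GGML_ASSERT/GGML_ABORT come from the header above
    static void check_same_shape(const struct ggml_tensor * a, const struct ggml_tensor * b) {
        GGML_ASSERT(a->ne[0] == b->ne[0]);  // failure aborts with "GGML_ASSERT(...) failed"
        if (a->type != b->type) {
            GGML_ABORT("type mismatch: %d vs %d", (int) a->type, (int) b->type);
        }
    }
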
@@ -139,6 +139,17 @@ if (GGML_METAL)
 )
 endif()

+if (GGML_MUSA)
+set(CMAKE_C_COMPILER clang)
+set(CMAKE_C_EXTENSIONS OFF)
+set(CMAKE_CXX_COMPILER clang++)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+set(GGML_CUDA ON)
+
+list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA)
+endif()
+
 if (GGML_OPENMP)
 find_package(OpenMP)
 if (OpenMP_FOUND)
@@ -147,6 +158,11 @@ if (GGML_OPENMP)
 add_compile_definitions(GGML_USE_OPENMP)

 set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+
+if (GGML_MUSA)
+set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp")
+set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so")
+endif()
 else()
 message(WARNING "OpenMP not found")
 endif()
@@ -249,7 +265,13 @@ endif()
 if (GGML_CUDA)
 cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES

-find_package(CUDAToolkit)
+if (GGML_MUSA)
+list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/")
+find_package(MUSAToolkit)
+set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND})
+else()
+find_package(CUDAToolkit)
+endif()

 if (CUDAToolkit_FOUND)
 message(STATUS "CUDA found")
@@ -268,7 +290,11 @@ if (GGML_CUDA)
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

-enable_language(CUDA)
+if (GGML_MUSA)
+set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE})
+else()
+enable_language(CUDA)
+endif()

 file(GLOB GGML_HEADERS_CUDA "ggml-cuda/*.cuh")
 list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h")
@@ -332,21 +358,40 @@ if (GGML_CUDA)
 add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
 endif()

+if (GGML_MUSA)
+set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
+foreach(SOURCE ${GGML_SOURCES_CUDA})
+set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_22")
+endforeach()
+endif()
+
 if (GGML_STATIC)
 if (WIN32)
 # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
 set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
 else ()
-set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+if (GGML_MUSA)
+set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static)
+else()
+set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+endif()
 endif()
 else()
-set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+if (GGML_MUSA)
+set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas)
+else()
+set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+endif()
 endif()

 if (GGML_CUDA_NO_VMM)
 # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
 else()
-set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+if (GGML_MUSA)
+set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
+else()
+set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+endif()
 endif()
 else()
 message(WARNING "CUDA not found")
@@ -467,15 +512,18 @@ if (GGML_SYCL)
 message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
 endif()

-if ( NOT DEFINED ENV{ONEAPI_ROOT})
-message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
+check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
+if ( DEFINED ENV{ONEAPI_ROOT})
+message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
+elseif(SUPPORTS_SYCL)
+message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
+If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
+source /opt/intel/oneapi/setvars.sh")
+else()
+message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
 endif()
-#todo: AOT
-
-find_package(IntelSYCL REQUIRED)
-find_package(MKL REQUIRED)

 message(STATUS "SYCL found")
+#todo: AOT

 list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)
@@ -487,11 +535,9 @@ if (GGML_SYCL)
 add_compile_definitions(GGML_SYCL_FORCE_MMQ)
 endif()

-add_compile_options(-I./) #include DPCT
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")

-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
 if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
 add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
 else()
 add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
@@ -504,14 +550,14 @@ if (GGML_SYCL)
 list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")

 if (WIN32)
+find_package(IntelSYCL REQUIRED)
+find_package(MKL REQUIRED)
 set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
 else()
-add_compile_options(-I/${SYCL_INCLUDE_DIR})
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
-
 if (GGML_SYCL_TARGET STREQUAL "INTEL")
 set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
 elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
 set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
 endif()
 endif()
@@ -803,11 +849,6 @@ if (GGML_CANN)
 ${CANN_INSTALL_DIR}/acllib/include
 )

-# TODO: find libs
-link_directories(
-${CANN_INSTALL_DIR}/lib64
-)
-
 add_subdirectory(ggml-cann/kernels)
 list(APPEND CANN_LIBRARIES
 ascendcl
@@ -826,6 +867,7 @@ if (GGML_CANN)

 set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} )
 set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
+set(GGML_EXTRA_LIBDIRS ${GGML_EXTRA_LIBDIRS} ${CANN_INSTALL_DIR}/lib64)
 list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
 endif()
 else()
@@ -856,8 +898,10 @@ function(get_flags CCID CCVER)
 set(C_FLAGS -Wdouble-promotion)
 set(CXX_FLAGS -Wno-array-bounds)

-if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-list(APPEND CXX_FLAGS -Wno-format-truncation)
+if (NOT GGML_MUSA)
+if (CCVER VERSION_GREATER_EQUAL 7.1.0)
+list(APPEND CXX_FLAGS -Wno-format-truncation)
+endif()
 endif()
 if (CCVER VERSION_GREATER_EQUAL 8.1.0)
 list(APPEND CXX_FLAGS -Wextra-semi)
@@ -1263,6 +1307,7 @@ endif()
 target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
 target_include_directories(ggml PUBLIC ../include)
 target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
+target_link_directories(ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
 target_compile_features (ggml PRIVATE c_std_11) # don't bump

 target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})

@@ -384,15 +384,15 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE)
-if (svcntw() == 8) {
-GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+if (ggml_sve_cnt_b == QK8_0) {
+GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
 "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
 }
 #endif
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
 GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
 const void * b_ptr = vx;
 const void * a_ptr = vy;
 float * res_ptr = s;
@@ -496,12 +496,12 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
 UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE)
-if (svcntw() == 8) {
-GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+if (ggml_sve_cnt_b == QK8_0) {
+GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
 "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
 }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
 const void * b_ptr = vx;
 const void * a_ptr = vy;
 float * res_ptr = s;
@@ -613,8 +613,8 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 UNUSED(ncols_interleaved);
 UNUSED(blocklen);

-#if defined(__ARM_FEATURE_SVE)
-if (svcntw() == 8) {
+#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
+if (ggml_sve_cnt_b == QK8_0) {
 const void * b_ptr = vx;
 const void * a_ptr = vy;
 float * res_ptr = s;
@@ -680,12 +680,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 return;
 }
 else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
 "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
 "performance");
 }
 else if (ggml_cpu_has_neon()) {
-GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
 "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
 "quantization format for optimal performance");
 }
@@ -745,15 +745,15 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
 UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-if (svcntw() == 8) {
-GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+if (ggml_sve_cnt_b == QK8_0) {
+GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
 "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
 }
 #endif
 #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
 GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
 "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
-#elif defined(__ARM_NEON) && defined(__aarch64__)
+#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
 const void * b_ptr = vx;
 const void * a_ptr = vy;
 float * res_ptr = s;
@@ -1266,12 +1266,12 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
 UNUSED(blocklen);

 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-if (svcntw() == 8) {
-GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+if (ggml_sve_cnt_b == QK8_0) {
+GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
 "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
 }
 #endif
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
 const void * b_ptr = vx;
 const void * a_ptr = vy;
 float * res_ptr = s;
@@ -1727,8 +1727,8 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 UNUSED(ncols_interleaved);
 UNUSED(blocklen);

-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-if (svcntw() == 8) {
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
+if (ggml_sve_cnt_b == QK8_0) {
 const void * b_ptr = vx;
 const void * a_ptr = vy;
 float * res_ptr = s;
@@ -2139,12 +2139,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
 return;
 }
 else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
 "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
 "performance");
 }
 else if (ggml_cpu_has_neon()) {
-GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
 "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
 "quantization format for optimal performance");
 }

|
||||||
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
|
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
|
||||||
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
|
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
|
||||||
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
|
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
|
||||||
GGML_ASSERT(!"not enough space in the buffer");
|
GGML_ABORT("not enough space in the buffer");
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
|
void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
|
||||||
|
@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_ASSERT(!"out of allocated_tensors");
|
GGML_ABORT("out of allocated_tensors");
|
||||||
}
|
}
|
||||||
 static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
             return;
         }
     }
-    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
-    GGML_ASSERT(!"tensor not found");
+    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
 }
 #endif

@@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
         // this should never happen
         fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                 __func__, size, max_avail);
-        GGML_ASSERT(!"not enough space in the buffer");
-        GGML_UNREACHABLE();
+        GGML_ABORT("not enough space in the buffer");
     }
 }

@@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
         }
     }

-    free(galloc->hash_set.keys);
+    ggml_hash_set_free(&galloc->hash_set);
     free(galloc->hash_values);
     free(galloc->bufts);
     free(galloc->buffers);
@@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 typedef struct ggml_gallocr * ggml_gallocr_t;

 static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
     return &galloc->hash_values[i];
 }

@@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {

 static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
-    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
-    memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
+    ggml_hash_set_reset(&galloc->hash_set);
+    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);

     // allocate leafs
     // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
 }

 bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
-    size_t hash_size = graph->visited_hash_table.size;
+    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
+    // add 25% margin to avoid hash collisions
+    min_hash_size += min_hash_size / 4;

     // initialize hash table
-    if (galloc->hash_set.size < hash_size) {
-        free(galloc->hash_set.keys);
-        free(galloc->hash_values);
-        galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
-        galloc->hash_values   = calloc(hash_size, sizeof(struct hash_node));
+    if (galloc->hash_set.size < min_hash_size) {
+        ggml_hash_set_free(&galloc->hash_set);
+        galloc->hash_set = ggml_hash_set_new(min_hash_size);
         GGML_ASSERT(galloc->hash_set.keys != NULL);
+
+        free(galloc->hash_values);
+        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
         GGML_ASSERT(galloc->hash_values != NULL);
-    } else {
-        // reset hash table
-        memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
-        memset(galloc->hash_values,   0, sizeof(struct hash_node)     * galloc->hash_set.size);
     }

     // reset allocators
@@ -817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 }

 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
     return talloc->size_max >= node_size;
 }
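A quick sketch of the hash-set lifecycle the reworked allocator relies on above. The helper names and the pointer-taking call convention are taken from the hunks themselves; the wrapper function is illustrative only and assumes the internal declarations are in scope.

// Illustrative only: drives the helpers the way ggml_gallocr now does.
static void hash_set_usage_sketch(size_t min_hash_size, struct ggml_tensor * t) {
    struct ggml_hash_set set = ggml_hash_set_new(min_hash_size); // sized once, with headroom
    size_t slot = ggml_hash_find_or_insert(&set, t);             // index into parallel value arrays
    (void) slot;
    ggml_hash_set_reset(&set);                                   // cheap per-graph clear of the keys
    ggml_hash_set_free(&set);                                    // teardown when the allocator is freed
}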
@@ -1055,11 +1055,10 @@ struct ggml_backend_sched {
     ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
     ggml_gallocr_t galloc;

-    // hash keys of the nodes in the graph
+    // hash map of the nodes in the graph
     struct ggml_hash_set hash_set;
-    // hash values
-    int * tensor_backend_id;
-    struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
+    int * hv_tensor_backend_ids;            // [hash_set.size]
+    struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]

     int * node_backend_ids; // [graph_size]
     int * leaf_backend_ids; // [graph_size]
@@ -1068,7 +1067,7 @@ struct ggml_backend_sched {
     int * prev_leaf_backend_ids; // [graph_size]

     // copy of the graph with modified inputs
-    struct ggml_cgraph * graph;
+    struct ggml_cgraph graph;

     // graph splits
     struct ggml_backend_sched_split * splits;
@@ -1087,19 +1086,16 @@ struct ggml_backend_sched {
     ggml_backend_sched_eval_callback callback_eval;
     void * callback_eval_user_data;

-    bool debug;
+    char * context_buffer;
+    size_t context_buffer_size;

-    // align context_buffer to GGML_MEM_ALIGN
-#ifdef _MSC_VER
-    __declspec(align(GGML_MEM_ALIGN))
-#else
-    __attribute__((aligned(GGML_MEM_ALIGN)))
-#endif
-    char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+    bool debug;
 };

-#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
-#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
+#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
+#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
+#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
+#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)

 // returns the priority of the backend, lower id is higher priority
 static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
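The per-tensor copy table becomes one flat allocation; the tensor_id_copy macro above computes the index into it. A small stand-alone restatement of that arithmetic, with hypothetical sizes:

// Same index computation as tensor_id_copy(id, backend_id, copy_id).
static size_t hv_copy_index(size_t id, int n_backends, int n_copies, int backend_id, int copy_id) {
    return id * n_backends * n_copies + (size_t) backend_id * n_copies + copy_id;
}
// e.g. n_backends = 2, n_copies = 4 (hypothetical): entry (id 3, backend 1, copy 2)
// lands at 3*2*4 + 1*4 + 2 = 30 in sched->hv_tensor_copies.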
@@ -1169,7 +1165,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
         return cur_backend_id;
     }

-    // assign nodes that use weights to the backend of the weights
     // operations with weights are preferably run on the same backend as the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
@@ -1275,7 +1270,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     sched->is_reset = false;

     struct ggml_init_params params = {
-        /* .mem_size =   */ sizeof(sched->context_buffer),
+        /* .mem_size =   */ sched->context_buffer_size,
         /* .mem_buffer = */ sched->context_buffer,
         /* .no_alloc =   */ true
     };
@@ -1284,39 +1279,43 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {

     sched->ctx = ggml_init(params);
     if (sched->ctx == NULL) {
-        fprintf(stderr, "%s: failed to initialize context\n", __func__);
-        GGML_ASSERT(false);
+        GGML_ABORT("%s: failed to initialize context\n", __func__);
     }

     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
         int * leaf_backend_id = &tensor_backend_id(leaf);
-        if (*leaf_backend_id != -1) {
-            // do not overwrite user assignments
-            continue;
-        }
-        *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
+        // do not overwrite user assignments
+        if (*leaf_backend_id == -1) {
+            *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
+        }
     }

     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         int * node_backend_id = &tensor_backend_id(node);
-        if (*node_backend_id != -1) {
-            // do not overwrite user assignments
-            continue;
-        }
-        *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
-        // src
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            int * src_backend_id = &tensor_backend_id(src);
-            if (*src_backend_id == -1) {
-                *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
-            }
+        // do not overwrite user assignments
+        if (*node_backend_id == -1) {
+            *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
+
+#if 0
+            // src
+            if (node->op == GGML_OP_NONE) {
+                continue;
+            }
+
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    continue;
+                }
+                int * src_backend_id = &tensor_backend_id(src);
+                if (*src_backend_id == -1) {
+                    *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
+                }
+            }
+#endif
         }
     }
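Across this merge the fprintf-then-GGML_ASSERT(false) error paths collapse into a single GGML_ABORT call, as in the context-init failure above. GGML_ABORT's definition is not part of this diff; a minimal stand-in with the same shape (not the project's actual macro) would be:

#include <stdio.h>
#include <stdlib.h>
// Illustrative stand-in only: format the message, then terminate.
#define ABORT_SKETCH(...) do { fprintf(stderr, __VA_ARGS__); abort(); } while (0)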
@@ -1488,12 +1487,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
         }
     }

-    // pass 4: split graph, find tensors that need to be copied
+    // pass 5: split graph, find tensors that need to be copied
     {
         int i_split = 0;
         struct ggml_backend_sched_split * split = &sched->splits[0];
         // find the backend of the first split, skipping view ops
-        for (int i = 0; i < graph->n_nodes; i++) {
+        int i = 0;
+        for (; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (!ggml_is_view_op(node->op)) {
                 split->backend_id = tensor_backend_id(node);
@@ -1502,9 +1502,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
         }
         split->i_start = 0;
         split->n_inputs = 0;
-        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
         int cur_backend_id = split->backend_id;
-        for (int i = 0; i < graph->n_nodes; i++) {
+        for (; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];

             if (ggml_is_view_op(node->op)) {
@@ -1513,7 +1512,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {

             const int node_backend_id = tensor_backend_id(node);

-            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
+            assert(node_backend_id != -1); // all nodes should be assigned by now

             // check if we should start a new split based on the sources of the current node
             bool need_new_split = false;
@@ -1527,7 +1526,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
                 // by starting a new split, the memory of the previously offloaded weights can be reused
                 if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                     int src_backend_id = tensor_backend_id(src);
-                    if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
+                    if (src_backend_id != cur_backend_id) {
                         need_new_split = true;
                         break;
                     }
@@ -1536,9 +1535,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
                 // FIXME: count the number of inputs instead of only checking when full
                 if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
                     const size_t id = hash_id(src);
-                    int src_backend_id = sched->tensor_backend_id[id];
+                    int src_backend_id = sched->hv_tensor_backend_ids[id];
                     bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
-                    if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
+                    if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
                         //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
                         need_new_split = true;
                         break;
@@ -1570,12 +1569,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
                     continue;
                 }

-                const int src_backend_id = tensor_backend_id(src);
+                size_t src_id = hash_id(src);
+                const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
                 assert(src_backend_id != -1); // all inputs should be assigned by now

                 if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
-                    size_t id = hash_id(src);
-                    if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
+                    if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
                         ggml_backend_t backend = sched->backends[src_backend_id];
                         for (int c = 0; c < sched->n_copies; c++) {
                             struct ggml_tensor * tensor_copy;
@@ -1589,7 +1588,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
                                 ggml_set_input(tensor_copy);
                                 ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                             }
-                            sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
+                            tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
                             SET_CAUSE(tensor_copy, "4.cpy");
                         }
                         int n_graph_inputs = sched->n_graph_inputs++;
@@ -1598,11 +1597,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
                     }
                 }

-                bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
-                if (src_backend_id != cur_backend_id && !supported) {
+                if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
                     // create a copy of the input in the split's backend
-                    const size_t id = hash_id(src);
-                    if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
+                    if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
                         ggml_backend_t backend = sched->backends[cur_backend_id];
                         for (int c = 0; c < sched->n_copies; c++) {
                             struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
@@ -1611,14 +1608,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
                                 ggml_set_input(tensor_copy);
                                 ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                             }
-                            sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
+                            tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
                             SET_CAUSE(tensor_copy, "4.cpy");
                         }
                         int n_inputs = split->n_inputs++;
                         GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
                         split->inputs[n_inputs] = src;
                     }
-                    node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
+                    node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
                 }
             }
         }
@@ -1630,7 +1627,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
         ggml_backend_sched_print_assignments(sched, graph);
     }

-    // swap node_backend_ids and leaf_backend_ids and prevs
+    // swap node_backend_ids and leaf _backend_ids with prevs
     {
         int * tmp = sched->node_backend_ids;
         sched->node_backend_ids = sched->prev_node_backend_ids;
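Note that the per-node backend check inside the split loop now uses assert rather than GGML_ASSERT; the standard C assert is removed when NDEBUG is defined, so the check costs nothing in release builds of this hot path, while GGML_ASSERT stays active in all builds. A minimal illustration:

#include <assert.h>
// With -DNDEBUG the assert below expands to nothing.
static void check_assignment_sketch(int node_backend_id) {
    assert(node_backend_id != -1); // all nodes should be assigned by now
    (void) node_backend_id;
}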
@@ -1641,9 +1638,19 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
         sched->prev_leaf_backend_ids = tmp;
     }

-    // create copies of the graph for each split
-    // TODO: avoid this copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
+    int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    if (sched->graph.size < graph_size) {
+        sched->graph.size = graph_size;
+        sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
+        sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
+        GGML_ASSERT(sched->graph.nodes != NULL);
+        GGML_ASSERT(sched->graph.leafs != NULL);
+    }
+    sched->graph.n_nodes = 0;
+    sched->graph.n_leafs = 0;
+
+    struct ggml_cgraph * graph_copy = &sched->graph;
+
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
@@ -1654,12 +1661,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {

             struct ggml_tensor * input = split->inputs[j];
             const size_t input_id = hash_id(input);
-            struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
+            struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);

             // add a dependency to the input source so that it is not freed before the copy is done
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
             input_dep->src[0] = input;
-            sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
+            sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;

             // add a dependency to the input copy so that it is allocated at the start of the split
@@ -1681,7 +1688,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
             size_t id = hash_id(input);
             int backend_id = tensor_backend_id(input);
             for (int c = 0; c < sched->n_copies; c++) {
-                struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
                 graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
             }
@@ -1694,7 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
             struct ggml_tensor * input = split->inputs[j];
             size_t id = hash_id(input);
             for (int c = 0; c < sched->n_copies; c++) {
-                struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
+                struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
                 graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
             }
@@ -1708,13 +1715,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
         sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
         graph_copy->leafs[graph_copy->n_leafs++] = leaf;
     }
-
-    sched->graph = graph_copy;
 }

 static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     bool backend_ids_changed = false;
-    for (int i = 0; i < sched->graph->n_nodes; i++) {
+    for (int i = 0; i < sched->graph.n_nodes; i++) {
         if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
             sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
             backend_ids_changed = true;
@@ -1722,7 +1727,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
         }
     }
     if (!backend_ids_changed) {
-        for (int i = 0; i < sched->graph->n_leafs; i++) {
+        for (int i = 0; i < sched->graph.n_leafs; i++) {
             if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
                 sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
                 backend_ids_changed = true;
@@ -1732,14 +1737,14 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     }

     // allocate graph
-    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+    if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
         ggml_backend_sched_synchronize(sched);
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
+        fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
-        ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
-        if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+        ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
+        if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
             fprintf(stderr, "%s: failed to allocate graph\n", __func__);
             return false;
         }
@@ -1760,7 +1765,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
         for (int j = 0; j < split->n_inputs; j++) {
             ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
+            struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);

             if (input->flags & GGML_TENSOR_FLAG_INPUT) {
                 // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
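Rather than rebuilding a graph object in the scheduler context on every call, sched->graph now owns its node and leaf arrays and only grows them. A sketch of that grow-only pattern with hypothetical names:

#include <stdlib.h>
// Grow-only capacity handling, the way sched->graph.nodes/leafs are treated above.
static int ensure_capacity_sketch(void *** items, int * cap, int needed) {
    if (*cap < needed) {
        void ** grown = realloc(*items, (size_t) needed * sizeof(void *));
        if (grown == NULL) {
            return 0; // the scheduler asserts here instead of returning
        }
        *items = grown;
        *cap = needed;
    }
    return 1;
}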
@@ -1846,21 +1851,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
     struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));

     sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+    sched->n_backends = n_backends;
+    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

     // initialize hash table
+    // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
     sched->hash_set = ggml_hash_set_new(graph_size);
-    sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
-    sched->tensor_copies     = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
+    sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+    sched->hv_tensor_copies      = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));

     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
     sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
     sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
     sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
     sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));

-    sched->n_backends = n_backends;
-
-    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
+    sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
+    sched->context_buffer = malloc(sched->context_buffer_size);

     const int initial_splits_capacity = 16;
     sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
@@ -1895,37 +1902,37 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
+    ggml_hash_set_free(&sched->hash_set);
     free(sched->splits);
-    free(sched->hash_set.keys);
-    free(sched->tensor_backend_id);
-    free(sched->tensor_copies);
+    free(sched->hv_tensor_backend_ids);
+    free(sched->hv_tensor_copies);
     free(sched->node_backend_ids);
     free(sched->leaf_backend_ids);
     free(sched->prev_node_backend_ids);
     free(sched->prev_leaf_backend_ids);
+    free(sched->context_buffer);
+    free(sched->graph.nodes);
+    free(sched->graph.leafs);
     free(sched);
 }

 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     // reset state for the next run
     if (!sched->is_reset) {
-        size_t hash_size = sched->hash_set.size;
-        memset(sched->hash_set.keys,      0, sizeof(sched->hash_set.keys[0])     * hash_size); // NOLINT
-        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
-        memset(sched->tensor_copies,      0, sizeof(sched->tensor_copies[0])     * hash_size);
+        ggml_hash_set_reset(&sched->hash_set);
+        memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
+        memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));

         sched->is_reset = true;
     }
     sched->is_alloc = false;
 }

 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);

     ggml_backend_sched_split_graph(sched, measure_graph);

-    // TODO: extract this to a separate function
-    if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
+    if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
         return false;
     }

@@ -1936,10 +1943,11 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
 }

 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);

     ggml_backend_sched_split_graph(sched, graph);

+
     if (!ggml_backend_sched_alloc_splits(sched)) {
         return false;
     }
@@ -2009,6 +2017,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
     tensor_backend_id(node) = backend_index;
     SET_CAUSE(node, "usr");
+    sched->is_reset = false;
 }

 ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
@@ -2051,9 +2060,9 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
     GGML_ASSERT(src != NULL);
     GGML_ASSERT(src->data && "graph must be allocated");

-    size_t id = ggml_hash_insert(hash_set, src);
-    if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
-        return node_copies[ggml_hash_find(hash_set, src)];
+    size_t id = ggml_hash_insert(&hash_set, src);
+    if (id == GGML_HASHSET_ALREADY_EXISTS) {
+        return node_copies[ggml_hash_find(&hash_set, src)];
     }

     struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
@@ -2078,7 +2087,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
     return dst;
 }

-static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
+static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
     size_t id = ggml_hash_find(hash_set, src);
     if (node_init[id]) {
         return;
@@ -2105,10 +2114,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
 }

 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
-    struct ggml_hash_set hash_set = {
-        /* .size = */ graph->visited_hash_table.size,
-        /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
-    };
+    struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
     struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
     bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));

@@ -2123,7 +2129,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {

     if (ctx_allocated == NULL || ctx_unallocated == NULL) {
         fprintf(stderr, "failed to allocate context for graph copy\n");
-        free(hash_set.keys);
+        ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
         ggml_free(ctx_allocated);
@@ -2146,7 +2152,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
     if (buffer == NULL) {
         fprintf(stderr, "failed to allocate buffer for graph copy\n");
-        free(hash_set.keys);
+        ggml_hash_set_free(&hash_set);
         free(node_copies);
         free(node_init);
         ggml_free(ctx_allocated);
@@ -2164,19 +2170,19 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
     // copy data and init views
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        graph_copy_init_tensor(hash_set, node_copies, node_init, node);
+        graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
     }

     // build graph copy
     struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
+        struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
         graph_copy->nodes[i] = node_copy;
     }
     graph_copy->n_nodes = graph->n_nodes;

-    free(hash_set.keys);
+    ggml_hash_set_free(&hash_set);
     free(node_copies);
     free(node_init);
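Both sizing rules above come from the graph itself: the scheduler asserts hash_set.size >= n_nodes + n_leafs, and the allocator adds a 25% margin when it (re)creates its table. With hypothetical counts of 2048 nodes and 512 leafs:

// Hypothetical numbers, applying the rule from ggml_gallocr_reserve_n.
size_t min_hash_size = 2048 + 512;  // n_nodes + n_leafs = 2560
min_hash_size += min_hash_size / 4; // 25% margin -> 3200 slots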
@@ -275,8 +275,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
                 break;

             default:
-                fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
-                GGML_ASSERT(false);
+                GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
         }
     }
@@ -120,7 +120,7 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
             file, line);
         GGML_CANN_LOG_ERROR(" %s\n", stmt);
         // abort with GGML_ASSERT to get a stack trace
-        GGML_ASSERT(!"CANN error");
+        GGML_ABORT("CANN error");
     }
 }

 /**
@@ -342,7 +342,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
         // memory should always buffered. these memory may still needed by
         // tasks in stream.
         // TODO, fix me.
-        GGML_ASSERT(!"Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
+        GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
     }
 };

@@ -1559,23 +1559,18 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
             return false;
         }

+        // need open both directions for memcpyasync between devices.
+        ggml_cann_set_device(cann_ctx_dst->device);
+        ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
         ggml_cann_set_device(cann_ctx_src->device);
         ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));

         ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
                                    ACL_MEMCPY_DEVICE_TO_DEVICE,
-                                   cann_ctx_dst->stream()));
+                                   cann_ctx_src->stream()));

-        // record event on src stream
-        if (!cann_ctx_src->copy_event) {
-            ACL_CHECK(aclrtCreateEvent(&cann_ctx_src->copy_event));
-        }
-
-        ACL_CHECK(
-            aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
-
-        // wait on dst stream for the copy to complete
-        ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(),
-                                       cann_ctx_src->copy_event));
+        //TODO: workaround for Event didn`t work here.
+        aclrtSynchronizeStream(cann_ctx_src->stream());
     } else {
         // src and dst are on the same backend
         ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
@@ -1763,8 +1758,8 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
 *
 * This function determines whether the CANN backend supports the given backend
 * buffer type by comparing the device context of the backend and buffer type.
- * It returns true if the device associated with the buffer type matches the
- * device associated with the backend.
+ * It returns true if the devices are same between the backend context and
+ * buffer type context.
 *
 * @param backend Pointer to the CANN backend.
 * @param buft Pointer to the backend buffer type to check.
@@ -1773,9 +1768,14 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
 */
 GGML_CALL static bool ggml_backend_cann_supports_buft(
     ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
-
-    GGML_UNUSED(backend);
+    if (ggml_backend_buft_is_cann(buft)) {
+        ggml_backend_cann_context * cann_ctx =
+            (ggml_backend_cann_context *)backend->context;
+        ggml_backend_cann_buffer_type_context * buft_ctx =
+            (ggml_backend_cann_buffer_type_context *)buft->context;
+        return buft_ctx->device == cann_ctx->device;
+    }
+    return false;
 }

 /**
@@ -1874,7 +1874,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
         ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
                                        (aclrtEvent)event->context));
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
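The device-to-device async path above now opens peer access in both directions, issues the copy on the source stream, and synchronizes that stream as a stop-gap for the event-based wait. The ordering in outline, as a sketch with placeholder parameters (the calls are the ACL ones already used in the hunk):

// Sketch only: parameter names are placeholders for the context fields.
static void cann_d2d_copy_sketch(int32_t src_dev, int32_t dst_dev,
                                 void * dst_data, const void * src_data,
                                 size_t copy_size, aclrtStream src_stream) {
    ggml_cann_set_device(dst_dev);
    ACL_CHECK(aclrtDeviceEnablePeerAccess(src_dev, 0)); // dst side opens access to src
    ggml_cann_set_device(src_dev);
    ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_dev, 0)); // src side opens access to dst
    ACL_CHECK(aclrtMemcpyAsync(dst_data, copy_size, src_data, copy_size,
                               ACL_MEMCPY_DEVICE_TO_DEVICE, src_stream));
    aclrtSynchronizeStream(src_stream); // workaround noted in the patch TODO
}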
@@ -844,7 +844,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             ggml_cann_max_pool2d(ctx, dst);
             break;
         case GGML_OP_POOL_COUNT:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
@@ -931,9 +931,9 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                                       ((ggml_tensor*)dst->extra)->nb);
                 return;
             }
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
     if (dst->type == GGML_TYPE_F32) {
         if (ggml_are_same_shape(src, dst)) {
@@ -955,12 +955,12 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                                       ((ggml_tensor*)dst->extra)->nb);
                 return;
             }
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
     // TODO
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
 } else if (src->type == GGML_TYPE_F32) {
     // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
     //       && nb0 == type_size)
@@ -991,10 +991,10 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                                       ((ggml_tensor*)dst->extra)->nb);
                 return;
             }
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         } else {
             // TODO: dst not contiguous
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
     if (dst->type == GGML_TYPE_F16) {
@@ -1017,11 +1017,11 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                                       ((ggml_tensor*)dst->extra)->nb);
                 return;
             }
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
     // TODO
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
 } else {
     if (ggml_are_same_shape(src, dst)) {
         cann_copy(ctx, acl_src, acl_dst);
@@ -1029,7 +1029,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         ACL_CHECK(aclDestroyTensor(acl_dst));
         return;
     }
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
 }
 }

@@ -1312,6 +1312,111 @@ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
 #ifdef __cplusplus
 }
 #endif

+static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx,
+                                             ggml_tensor* dst,
+                                             ggml_tensor* src1,
+                                             aclTensor* tmp_cast_tensor,
+                                             aclTensor* tmp_im2col_tensor) {
+    // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
+    int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
+    size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
+
+    int64_t permute_dim[] = {0, 2, 1};
+    if (src1->type != dst->type) {
+        aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
+    } else {
+        aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
+    }
+
+    // release
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+static void ggml_cann_im2col_1d_post_process(
+    ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1,
+    aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor,
+    const std::vector<int64_t>& im2col_op_params) {
+    // get params
+    const int64_t KH = im2col_op_params[0];
+    const int64_t KW = im2col_op_params[1];
+    const int64_t IW = im2col_op_params[2];
+    const int64_t IC = im2col_op_params[3];
+    const int64_t N = im2col_op_params[4];
+    const int64_t OH = im2col_op_params[5];
+    const int64_t OW = im2col_op_params[6];
+    const int64_t s0 = im2col_op_params[7];
+    const int64_t p0 = im2col_op_params[8];
+    const int64_t d0 = im2col_op_params[9];
+    const int64_t n_bytes_factor = im2col_op_params[10];
+
+    // Permute: [N, IC * KH * KW, OW * OH] ->
+    // [N, OW * OH * n_bytes_factor, IC * KH * KW]
+    aclTensor* tmp_permute_tensor = nullptr;
+    ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
+    tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+    void* tmp_permute_buffer = tmp_permute_allocator.get();
+
+    int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
+    size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
+    tmp_permute_nb[0] = ggml_type_size(dst->type);
+    for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+        tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
+    }
+
+    tmp_permute_tensor = ggml_cann_create_tensor(
+        tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
+        ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
+        GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+
+    int64_t permute_dim[] = {0, 2, 1};
+    if (src1->type != dst->type) {
+        aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
+    } else {
+        aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
+                      3);
+    }
+
+    // number of times the kernel moves in W dimension
+    const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
+    size_t offset;
+    void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
+
+    // memory copy with offset to restore 1D im2col from 2d
+    if (IC > 1) {
+        offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
+        size_t size_cpy = KH * KW * ggml_type_size(dst->type);
+
+        for (int c = 0; c < IC; c++) {
+            cur_permute_buffer = (char*)tmp_permute_buffer + offset +
+                                 KH * KW * c * ggml_type_size(dst->type);
+            cur_dst_buffer = (char*)dst->data +
+                             c * KH * KW * n_step_w * ggml_type_size(dst->type);
+
+            for (int i = 0; i < n_step_w; i++) {
+                ACL_CHECK(aclrtMemcpyAsync(
+                    cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                cur_dst_buffer =
+                    (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
+                cur_permute_buffer = (char*)cur_permute_buffer +
+                                     KH * KW * IC * ggml_type_size(dst->type);
+            }
+        }
+    } else {
+        offset = KH * KW * n_step_w *
+                 ggml_type_size(dst->type);  // equal to ggml_nbytes(dst)
+        ACL_CHECK(aclrtMemcpyAsync(dst->data, offset,
+                                   (char*)tmp_permute_buffer + offset, offset,
+                                   ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+    }
+
+    // release
+    ACL_CHECK(aclDestroyTensor(tmp_permute_tensor));
+}
+
 void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src0 = dst->src[0];  // kernel
     ggml_tensor* src1 = dst->src[1];  // input
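The 1D case reuses the 2D im2col kernel with s1 = p1 = d1 = 1 and then restores the 1D layout with the copy loop above; n_step_w is the standard convolution output length along W. With hypothetical values IW = 10, KW = 3, s0 = 1, p0 = 0, d0 = 1:

// Same formula as in ggml_cann_im2col_1d_post_process, hypothetical numbers.
const int n_step_w = (10 + 2*0 - 1*(3 - 1) - 1) / 1 + 1; // = 8 kernel positions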
@@ -1320,21 +1425,23 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;

     GGML_TENSOR_BINARY_OP_LOCALS;

-    const int64_t N = is_2D ? ne13 : ne12;
-    const int64_t IC = is_2D ? ne12 : ne11;
-
-    const int64_t KH = is_2D ? ne01 : 1;
+    // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
+    // im2col and do post-processing to restore it to 1D.
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1;
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1;
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1;
+
+    const int64_t N = ne13;
+    const int64_t IC = ne12;
+    const int64_t KH = ne01;
     const int64_t KW = ne00;
+    const int64_t IW = ne10;

     const int64_t OH = is_2D ? ne2 : 1;
     const int64_t OW = ne1;
@@ -1342,9 +1449,12 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
     GGML_ASSERT(nb10 == sizeof(float));

-    // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH]
+    // memory allocated increased to 3x when is_2D == false
+    const int64_t n_bytes_factor = is_2D ? 1 : 3;
+
+    // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
     aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
-    int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N};
+    int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N};
     size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];

     tmp_im2col_nb[0] = ggml_type_size(src1->type);
@@ -1356,8 +1466,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // If dst is f16, tmp_buffer is f32, we need alloc src.typesize *
     // dst.elemcount.
     ggml_cann_pool_alloc im2col_allocator(
-        ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1));
+        ctx.pool(),
+        ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
     void* tmp_im2col_buffer = im2col_allocator.get();

     aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
         tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
         ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
@@ -1380,8 +1492,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                                   paddings, strides, tmp_im2col_tensor,
                                   &workspaceSize, &executor));

+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
     if (workspaceSize > 0) {
-        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspace_allocator.alloc(workspaceSize);
         workspaceAddr = workspace_allocator.get();
     }

@@ -1391,9 +1504,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     // Cast if dst is f16.
     aclTensor* tmp_cast_tensor = nullptr;
     ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
+    void* tmp_cast_buffer = nullptr;
     if (src1->type != dst->type) {
-        tmp_cast_allocator.alloc(ggml_nbytes(dst));
-        void* tmp_cast_buffer = tmp_cast_allocator.get();
+        tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+        tmp_cast_buffer = tmp_cast_allocator.get();
         size_t temp_cast_nb[GGML_MAX_DIMS - 1];
         temp_cast_nb[0] = ggml_type_size(dst->type);
         for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
@@ -1408,24 +1522,21 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                                  ggml_cann_type_mapping(dst->type));
     }

-    // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
-    int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
-    size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
-    aclTensor* acl_dst =
-        ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
+    // post-processing
+    if (is_2D) {
+        ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
+                                         tmp_im2col_tensor);
|
|
||||||
|
|
||||||
int64_t permute_dim[] = {0, 2, 1};
|
|
||||||
if (src1->type != dst->type) {
|
|
||||||
aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
|
|
||||||
} else {
|
} else {
|
||||||
aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
|
std::vector<int64_t> im2col_op_params = {
|
||||||
|
KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
|
||||||
|
ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
|
||||||
|
tmp_im2col_tensor, im2col_op_params);
|
||||||
}
|
}
|
||||||
|
|
||||||
// release
|
// release
|
||||||
ACL_CHECK(aclDestroyTensor(acl_src1));
|
ACL_CHECK(aclDestroyTensor(acl_src1));
|
||||||
ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
|
ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
|
||||||
ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
|
ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
|
||||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
|
||||||
ACL_CHECK(aclDestroyIntArray(kernel_size));
|
ACL_CHECK(aclDestroyIntArray(kernel_size));
|
||||||
ACL_CHECK(aclDestroyIntArray(dilations));
|
ACL_CHECK(aclDestroyIntArray(dilations));
|
||||||
ACL_CHECK(aclDestroyIntArray(paddings));
|
ACL_CHECK(aclDestroyIntArray(paddings));
|
||||||
|
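Note: the hunks above reuse the 2D aclnnIm2col path for 1D im2col by clamping the second-dimension stride/padding/dilation to 1 and trimming the oversized result afterwards (ggml_cann_im2col_1d_post_process). A minimal host-side sketch of just that parameter mapping, in plain C++ — Im2colParams and map_1d_to_2d are illustrative names, not part of the CANN backend:

#include <cstdint>
#include <cstdio>

// Illustrative container for the seven op_params read in ggml_cann_im2col.
struct Im2colParams {
    int32_t s0, s1, p0, p1, d0, d1;
    bool    is_2D;
};

// Clamp the second spatial dimension to 1 so a 2D im2col kernel can serve
// the 1D case; the oversized output is trimmed by the post-process step.
static Im2colParams map_1d_to_2d(const int32_t * op_params) {
    Im2colParams p;
    p.is_2D = op_params[6] == 1;
    p.s0 = op_params[0];
    p.s1 = p.is_2D ? op_params[1] : 1;
    p.p0 = op_params[2];
    p.p1 = p.is_2D ? op_params[3] : 1;
    p.d0 = op_params[4];
    p.d1 = p.is_2D ? op_params[5] : 1;
    return p;
}

int main() {
    const int32_t op_params_1d[7] = {2, 9, 1, 9, 1, 9, 0}; // index 6 == 0 -> 1D case
    const Im2colParams p = map_1d_to_2d(op_params_1d);
    std::printf("s1=%d p1=%d d1=%d\n", p.s1, p.p1, p.d1);  // all forced to 1
    return 0;
}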
@@ -2219,7 +2330,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)dst->extra)->nb);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
@@ -2381,10 +2492,10 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
     size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
     size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];

+    ggml_cann_pool_alloc input_alloctor(ctx.pool());
     if (src1->type != GGML_TYPE_F16) {
         aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
-        ggml_cann_pool_alloc input_alloctor(
-            ctx.pool(), ggml_nelements(src1) * input_elem_size);
+        input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
         input_buffer = input_alloctor.get();

         int64_t* input_cast_ne = src1->ne;
@@ -2492,7 +2603,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             ggml_cann_mul_mat_q8_0(ctx, dst);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
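Note: the GGML_ASSERT(false) -> GGML_ABORT("fatal error") replacements recur throughout this changeset; unreachable switch branches now call an unconditional abort that still reports a message and source location. The sketch below only illustrates the general shape of such an abort-with-location macro; it is not the ggml definition (which lives in ggml.h):

#include <cstdarg>
#include <cstdio>
#include <cstdlib>

// Illustrative abort-with-location helper in the spirit of GGML_ABORT;
// the real ggml macro may differ in details.
[[noreturn]] static void abort_with_message(const char * file, int line, const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    std::fprintf(stderr, "%s:%d: ", file, line);
    std::vfprintf(stderr, fmt, args);
    std::fprintf(stderr, "\n");
    va_end(args);
    std::abort();
}

#define ABORT_FATAL(...) abort_with_message(__FILE__, __LINE__, __VA_ARGS__)

int main(int argc, char **) {
    const int op = argc > 1 ? 1 : 0;   // pass any argument to hit the abort path
    switch (op) {
        case 0:
            return 0;
        default:
            ABORT_FATAL("fatal error: unsupported op %d", op); // reported with file:line, then abort
    }
}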
@@ -19,7 +19,11 @@ typedef half2 ggml_half2;

 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_CUDA)
+#if defined(GGML_COMMON_DECL_MUSA)
+#include <musa_fp16.h>
+#else
 #include <cuda_fp16.h>
+#endif
 #include <cstdint>

 typedef half ggml_half;
@@ -415,7 +419,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 #define GGML_TABLE_END() };

 #define GGML_COMMON_IMPL
-#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
+#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA)
 #include <cstdint>

 #define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
@@ -98,7 +98,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
     GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
     GGML_CUDA_LOG_ERROR("  %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
-    GGML_ASSERT(!"CUDA error");
+    GGML_ABORT("CUDA error");
 }

 // this is faster on Windows
@@ -130,7 +130,22 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     }
     return res;
 #else
+
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+    cudaError_t err;
+    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
+    {
+        err = cudaMallocManaged(ptr, size);
+    }
+    else
+    {
+        err = cudaMalloc(ptr, size);
+    }
+    return err;
+#else
     return cudaMalloc(ptr, size);
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+
 #endif
 }

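Note: the new branch in ggml_cuda_device_malloc above lets the GGML_CUDA_ENABLE_UNIFIED_MEMORY environment variable switch plain device allocations to CUDA managed memory. A self-contained sketch of the same decision (CUDA host code; error handling trimmed):

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Allocate device memory, optionally as managed (unified) memory when the
// GGML_CUDA_ENABLE_UNIFIED_MEMORY environment variable is set.
static cudaError_t device_malloc(void ** ptr, size_t size) {
    if (std::getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
        return cudaMallocManaged(ptr, size);
    }
    return cudaMalloc(ptr, size);
}

int main() {
    void * buf = nullptr;
    const cudaError_t err = device_malloc(&buf, 1 << 20);
    std::printf("alloc: %s\n", cudaGetErrorString(err));
    if (err == cudaSuccess) {
        cudaFree(buf);
    }
    return err == cudaSuccess ? 0 : 1;
}

Setting the variable at launch time (e.g. GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 before starting the binary) takes the managed path; leaving it unset keeps ordinary cudaMalloc behaviour.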
@@ -167,7 +182,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;

-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -179,7 +194,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
             alloc_prop.location.id = id;
             CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
         }
-#endif // !defined(GGML_USE_HIPBLAS)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
         info.devices[id].vmm = !!device_vmm;

         cudaDeviceProp prop;
@@ -315,7 +330,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };

 // pool with virtual memory
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

@@ -409,14 +424,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
     }
 };
-#endif // !defined(GGML_USE_HIPBLAS)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)

 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
     }
-#endif
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }

@@ -1341,7 +1356,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
 static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
     void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {

-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
     // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
     cudaMemcpy3DPeerParms p = {};
     p.dstDevice = dstDevice;
@@ -1355,7 +1370,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
     GGML_UNUSED(dstDevice);
     GGML_UNUSED(srcDevice);
     return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
-#endif // !defined(GGML_USE_HIPBLAS)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
 }

 static void ggml_cuda_op_mul_mat(
@@ -1596,7 +1611,7 @@ static void ggml_cuda_op_mul_mat(
                 CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
                                 src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
             } else {
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             }

             if (quantize_src1 && !src1_is_contiguous) {
@@ -1828,6 +1843,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
             }
         }
 #else
+#ifdef GGML_USE_MUSA
+        GGML_ASSERT(false);
+#else // !GGML_USE_MUSA
         if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
             // there is no broadcast and src0, src1 are contiguous across dims 2, 3
             // use cublasGemmStridedBatchedEx
@@ -1870,6 +1888,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                     cu_compute_type,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
         }
+#endif // GGML_USE_MUSA
 #endif

     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
@@ -1881,10 +1900,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);

-    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
+    bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
-        && src1->ne[1] == 1;
+        && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
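Note: the hunk above swaps the inline type test for ggml_cuda_dmmv_type_supported (added later in this changeset) and tightens the row-length requirement to a multiple of GGML_CUDA_DMMV_X*2. A compilable sketch of the resulting predicate — the constant value 32 is an assumed build default, not taken from this diff:

#include <cstdint>
#include <cstdio>

// Assumed default; the real value is a build-time option of the backend.
constexpr int64_t GGML_CUDA_DMMV_X = 32;

// Mirrors the shape of the use_dequantize_mul_mat_vec condition after this
// hunk: a supported src0 type, f32 src1/dst, a single src1 column, and a row
// length divisible by GGML_CUDA_DMMV_X*2.
static bool use_dmmv(bool type_supported, bool f32_src1, bool f32_dst,
                     int64_t ne00, int64_t src1_ne1) {
    return type_supported && f32_src1 && f32_dst
        && ne00 % (GGML_CUDA_DMMV_X * 2) == 0
        && src1_ne1 == 1;
}

int main() {
    std::printf("%d\n", use_dmmv(true, true, true, 4096, 1) ? 1 : 0); // 1: eligible
    std::printf("%d\n", use_dmmv(true, true, true, 4096, 8) ? 1 : 0); // 0: batched src1
    return 0;
}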
@@ -2945,7 +2963,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev

         CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
 #endif
         GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }

@@ -3027,7 +3045,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
         return false;
     }

-#if CUDART_VERSION >= 11100
+#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
     cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
     if (err != cudaSuccess) {
         // clear the error
@@ -81,7 +81,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
     } else if (order == GGML_SORT_ORDER_DESC) {
         k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }

@@ -259,7 +259,7 @@ static void ggml_cuda_op_bin_bcast(
     } else {
         fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
             ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }

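Note: the cudaHostRegister call guarded above pins an existing host buffer (portable, read-only) so the device can read it directly; the version guard now also admits MUSA builds. A standalone usage sketch of that call (CUDA host code):

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// Pin an existing host buffer for direct GPU access; the read-only flag
// requires CUDA 11.1+, which is what the version guard above checks.
int main() {
    std::vector<char> host(1 << 20);
    const cudaError_t err = cudaHostRegister(host.data(), host.size(),
                                             cudaHostRegisterPortable | cudaHostRegisterReadOnly);
    if (err != cudaSuccess) {
        cudaGetLastError(); // clear the error, as the backend does on failure
        std::printf("pinning failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    cudaHostUnregister(host.data());
    return 0;
}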
@@ -12,6 +12,10 @@
 #else
 #define GGML_COMMON_DECL_CUDA
 #define GGML_COMMON_IMPL_CUDA
+#if defined(GGML_USE_MUSA)
+#define GGML_COMMON_DECL_MUSA
+#define GGML_COMMON_IMPL_MUSA
+#endif
 #endif
 #include "ggml-common.h"

@@ -23,111 +27,11 @@
 #include <vector>

 #if defined(GGML_USE_HIPBLAS)
-#include <hip/hip_runtime.h>
-#include <hipblas/hipblas.h>
-#include <hip/hip_fp16.h>
-#ifdef __HIP_PLATFORM_AMD__
-// for rocblas_initialize()
-#include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
-#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
-#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N HIPBLAS_OP_N
-#define CUBLAS_OP_T HIPBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F HIPBLAS_R_16F
-#define CUDA_R_32F HIPBLAS_R_32F
-#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
-#define cublasCreate hipblasCreate
-#define cublasDestroy hipblasDestroy
-#define cublasGemmEx hipblasGemmEx
-#define cublasGemmBatchedEx hipblasGemmBatchedEx
-#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-#define cublasHandle_t hipblasHandle_t
-#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
-#define cublasSetStream hipblasSetStream
-#define cublasSgemm hipblasSgemm
-#define cublasStatus_t hipblasStatus_t
-#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
-#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
-#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
-#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceProp hipDeviceProp_t
-#define cudaDeviceSynchronize hipDeviceSynchronize
-#define cudaError_t hipError_t
-#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
-#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
-#define cudaEventCreateWithFlags hipEventCreateWithFlags
-#define cudaEventDisableTiming hipEventDisableTiming
-#define cudaEventRecord hipEventRecord
-#define cudaEventSynchronize hipEventSynchronize
-#define cudaEvent_t hipEvent_t
-#define cudaEventDestroy hipEventDestroy
-#define cudaFree hipFree
-#define cudaFreeHost hipHostFree
-#define cudaGetDevice hipGetDevice
-#define cudaGetDeviceCount hipGetDeviceCount
-#define cudaGetDeviceProperties hipGetDeviceProperties
-#define cudaGetErrorString hipGetErrorString
-#define cudaGetLastError hipGetLastError
-#define cudaHostRegister hipHostRegister
-#define cudaHostRegisterPortable hipHostRegisterPortable
-#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
-#define cudaHostUnregister hipHostUnregister
-#define cudaLaunchHostFunc hipLaunchHostFunc
-#define cudaMalloc hipMalloc
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-#define cudaMemcpy hipMemcpy
-#define cudaMemcpyAsync hipMemcpyAsync
-#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
-#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
-#define cudaMemcpyKind hipMemcpyKind
-#define cudaMemset hipMemset
-#define cudaMemsetAsync hipMemsetAsync
-#define cudaMemGetInfo hipMemGetInfo
-#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
-#define cudaSetDevice hipSetDevice
-#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-#define cudaStreamDestroy hipStreamDestroy
-#define cudaStreamFireAndForget hipStreamFireAndForget
-#define cudaStreamNonBlocking hipStreamNonBlocking
-#define cudaStreamPerThread hipStreamPerThread
-#define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
-#define cudaStream_t hipStream_t
-#define cudaSuccess hipSuccess
-#define __trap() do { abort(); __builtin_unreachable(); } while(0)
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
-#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
-#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
-#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
-#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
-#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
-#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
-#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
+#include "vendors/hip.h"
+#elif defined(GGML_USE_MUSA)
+#include "vendors/musa.h"
 #else
-#include <cuda_runtime.h>
-#include <cuda.h>
-#include <cublas_v2.h>
-#include <cuda_fp16.h>
-
-#if CUDART_VERSION < 11020
-#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
-#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
-#define CUBLAS_COMPUTE_16F CUDA_R_16F
-#define CUBLAS_COMPUTE_32F CUDA_R_32F
-#define cublasComputeType_t cudaDataType_t
-#endif // CUDART_VERSION < 11020
-
+#include "vendors/cuda.h"
 #endif // defined(GGML_USE_HIPBLAS)

 #define STRINGIZE_IMPL(...) #__VA_ARGS__
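Note: the deletions above move the CUDA-name-to-HIP-name macro table out of this header and into ggml-cuda/vendors/hip.h (with vendors/cuda.h and a new vendors/musa.h as the other branches), so this file now only selects a vendor header. The stand-in below illustrates the aliasing idea those vendor headers rely on — vendor_malloc and cudaLikeMalloc are made-up names, not the real mappings:

#include <cstdio>

// Stand-in for the pattern in vendors/hip.h: the vendor header re-spells the
// CUDA-style names, so backend code written once against the CUDA spelling
// compiles against every vendor. vendor_malloc is a placeholder, not a real API.
static int vendor_malloc(void ** ptr, unsigned long size) {
    *ptr = nullptr;
    (void)size;
    return 0;
}
#define cudaLikeMalloc vendor_malloc   // mirrors e.g. "#define cudaMalloc hipMalloc"

int main() {
    void * p = nullptr;
    std::printf("mapped call returned %d\n", cudaLikeMalloc(&p, 1024));
    return 0;
}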
@@ -168,7 +72,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in

 #define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)

-#if CUDART_VERSION >= 12000
+#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
     static const char * cublas_get_error_str(const cublasStatus_t err) {
         return cublasGetStatusString(err);
     }
@@ -200,7 +104,7 @@ static const char * cu_get_error_str(CUresult err) {
 #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
 #endif

-#if CUDART_VERSION >= 11100
+#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
 #else
 #define GGML_CUDA_ASSUME(x)
@@ -212,93 +116,7 @@ typedef half2 dfloat2;
 #else
 typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
-#endif //GGML_CUDA_F16
+#endif // GGML_CUDA_F16

-#if defined(GGML_USE_HIPBLAS)
-#define __CUDA_ARCH__ 1300
-
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-    defined(__gfx1150__) || defined(__gfx1151__)
-#define RDNA3
-#endif
-
-#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
-#define RDNA2
-#endif
-
-#if defined(__gfx1010__) || defined(__gfx1012__)
-#define RDNA1
-#endif
-
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
-typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
-static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-#if __has_builtin(__builtin_elementwise_sub_sat)
-    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-    return reinterpret_cast<const int &>(c);
-#else
-    int8x4_t c;
-    int16_t tmp;
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-        tmp = va[i] - vb[i];
-        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
-        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
-        c[i] = tmp;
-    }
-    return reinterpret_cast<int &>(c);
-#endif // __has_builtin(__builtin_elementwise_sub_sat)
-}
-
-static __device__ __forceinline__ int __vsub4(const int a, const int b) {
-    return __vsubss4(a, b);
-}
-
-static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
-    }
-    return c;
-}
-
-static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
-    }
-    return c;
-}
-
-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
-// __shfl_xor() for half2 was added in ROCm 5.6
-static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
-    typedef union half2_b32 {
-        half2 val;
-        int   b32;
-    } half2_b32_t;
-    half2_b32_t tmp;
-    tmp.val = var;
-    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
-    return tmp.val;
-}
-#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
-#endif // defined(GGML_USE_HIPBLAS)
-
 #if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
 #define FP16_AVAILABLE
@@ -348,7 +166,7 @@ static __device__ void no_device_code(
 #ifdef __CUDA_ARCH__
 #define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
 #else
-#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__

 static __device__ __forceinline__ float warp_reduce_sum(float x) {
@@ -455,11 +273,11 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
     const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
     return mask_low | mask_high;
 }
-#endif // CUDART_VERSION < 12000
+#endif // CUDART_VERSION < CUDART_HMASK

 static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
     c = __builtin_amdgcn_sdot4(a, b, c, false);
 #elif defined(RDNA3)
     c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
@@ -451,7 +451,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }

@@ -484,6 +484,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
@@ -500,7 +500,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
 }

 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
     const dim3 block_nums(block_num_y, 1, 1);
@@ -510,7 +510,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
 }

 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -519,7 +519,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
 }

 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -528,7 +528,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
 }

 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -537,7 +537,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
 }

 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -588,7 +588,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }

 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -662,7 +662,7 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
             convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }

@@ -672,3 +672,12 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
     GGML_UNUSED(src1_ncols);
     GGML_UNUSED(src1_padded_row_size);
 }
+
+bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) {
+    return src0_type == GGML_TYPE_Q4_0 || src0_type == GGML_TYPE_Q4_1 ||
+        src0_type == GGML_TYPE_Q5_0 || src0_type == GGML_TYPE_Q5_1 ||
+        src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K ||
+        src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K ||
+        src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K ||
+        src0_type == GGML_TYPE_F16;
+}
@@ -16,3 +16,5 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, cudaStream_t stream);
+
+bool ggml_cuda_dmmv_type_supported(ggml_type src0_type);
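Note: all of the launchers above now require ncols to be a multiple of GGML_CUDA_DMMV_X*2, and they place the row blocks on the grid's x dimension (the y/z dimensions have a smaller maximum size). A small sketch of that launch geometry — the two constants are assumed build defaults, configurable in the real backend:

#include <cstdio>

// Assumed defaults; both are compile-time options of the CUDA backend.
constexpr int GGML_CUDA_DMMV_X = 32;
constexpr int GGML_CUDA_MMV_Y  = 1;
constexpr int WARP_SIZE_ASSUMED = 32;

int main() {
    const int ncols = 4096, nrows = 4096;
    // Mirrors the tightened assert: rows must split evenly into 2*GGML_CUDA_DMMV_X chunks.
    if (ncols % (GGML_CUDA_DMMV_X * 2) != 0) {
        std::printf("unsupported row length %d\n", ncols);
        return 1;
    }
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    std::printf("launch %d blocks of %dx%d threads\n",
                block_num_y, WARP_SIZE_ASSUMED, GGML_CUDA_MMV_Y);
    return 0;
}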
@@ -564,7 +564,7 @@ static void on_no_fattn_vec_case(const int D) {
         fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
         fprintf(stderr, "By default only f16 KV cache is supported.\n");
         fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     } else if (D == 128) {
         fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
         fprintf(stderr, "Supported combinations:\n");
@@ -572,11 +572,11 @@ static void on_no_fattn_vec_case(const int D) {
         fprintf(stderr, "  - K == q8_0, V == q8_0,  8.50 BPV\n");
         fprintf(stderr, "  - K == f16,  V == f16, 16.00 BPV\n");
         fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     } else {
         fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
         fprintf(stderr, "Only f16 is supported.\n");
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }

@@ -287,7 +287,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
             launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
         } break;
         default: {
-            GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
+            GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
         } break;
     }
 }
@@ -284,7 +284,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
             launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
         } break;
         default: {
-            GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
+            GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
         } break;
     }
 }
@@ -38,7 +38,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
                 ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
                 break;
             default:
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
                 break;
         }
     } else {
@@ -63,7 +63,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
             // ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
             //     break;
             default:
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
                 break;
         }
     }
@@ -86,7 +86,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
                 ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
                 break;
             default:
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
                 break;
         }
         return;
@@ -114,7 +114,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
                 ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
                 break;
             default:
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
                 break;
         }
         return;
@@ -141,7 +141,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
             ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }

@@ -171,8 +171,7 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
             break;
         default:
             // TODO: k-quants
-            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
-            GGML_ASSERT(false);
+            GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
             break;
     }
 }
@@ -59,6 +59,24 @@ void ggml_cuda_op_mul_mat_q(
         case GGML_TYPE_Q6_K:
             mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
             break;
+        case GGML_TYPE_IQ2_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_XS:
+            mul_mat_q_case<GGML_TYPE_IQ2_XS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_q_case<GGML_TYPE_IQ2_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_q_case<GGML_TYPE_IQ3_XXS>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_q_case<GGML_TYPE_IQ3_S>(ctx, args, stream);
+            break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_q_case<GGML_TYPE_IQ1_S>(ctx, args, stream);
+            break;
         case GGML_TYPE_IQ4_XS:
             mul_mat_q_case<GGML_TYPE_IQ4_XS>(ctx, args, stream);
             break;
@@ -66,7 +84,7 @@ void ggml_cuda_op_mul_mat_q(
             mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }

@@ -93,6 +111,12 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_IQ2_XS:
+        case GGML_TYPE_IQ2_S:
+        case GGML_TYPE_IQ3_XXS:
+        case GGML_TYPE_IQ3_S:
+        case GGML_TYPE_IQ1_S:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_NL:
             mmq_supported = true;

(The diff of one further file in this changeset is suppressed in this view because it is too large.)
@@ -162,7 +162,7 @@ static void mul_mat_vec_q_cuda(
             rows_per_cuda_block = 2;
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
@@ -196,7 +196,7 @@ static void mul_mat_vec_q_cuda(
             mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
@@ -413,7 +413,7 @@ void ggml_cuda_op_mul_mat_vec_q(
             mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }

@@ -163,7 +163,7 @@ void quantize_mmq_q8_1_cuda(
                 <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
             break;
         default:
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
             break;
     }
 }
@@ -251,7 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
                 attn_factor, corr_dims, freq_factors, stream
             );
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     } else {
         if (src0->type == GGML_TYPE_F32) {
@@ -265,7 +265,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
                 attn_factor, corr_dims, freq_factors, stream
            );
         } else {
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
     }
 }
@@ -23,7 +23,8 @@ SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}
 TYPES_MMQ = [
     "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
     "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
-    "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
+    "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
+    "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
 ]

 SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_S);
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XS);
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ2_XXS);
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmq.cuh"
+
+DECL_MMQ_CASE(GGML_TYPE_IQ3_XXS);
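Note: the generator change above adds the IQ quantization types to TYPES_MMQ, and the new five-line .cu files it emits each hold a single DECL_MMQ_CASE(...), so every heavy MMQ template instantiation builds in its own translation unit. The stand-in below shows the same explicit-instantiation pattern in isolation — mul_mat_q_case_stub and the numeric argument are illustrative, not the ggml template:

#include <cstdio>

// Illustrative stand-in for the per-type kernel template; in the real tree
// the template is far heavier, which is why each instantiation gets its own
// autogenerated translation unit.
template <int QTYPE>
void mul_mat_q_case_stub() {
    std::printf("instantiated for type id %d\n", QTYPE);
}

// What a generated instance file boils down to: one explicit instantiation,
// selected by the type it is named after.
template void mul_mat_q_case_stub<7>();

int main() {
    mul_mat_q_case_stub<7>();
    return 0;
}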
@@ -188,6 +188,27 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 }

+template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_impl(
+    const int * v, const int * u, const float * d8_0, const float & d8_1) {
+
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
+        int sumi = 0;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_0/2; ++i) {
+            // SIMD dot product of quantized values
+            sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
+        }
+
+        sumf += d8_0[i0/(QI8_0/2)]*sumi;
+    }
+
+    return d8_1*sumf;
+}
+
 #define VDR_Q2_K_Q8_1_MMVQ 1
 #define VDR_Q2_K_Q8_1_MMQ  4

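Note: vec_dot_q8_0_16_q8_1_impl above accumulates 4-byte groups with ggml_cuda_dp4a and scales each 16-value slice by its own d8_0 factor. A scalar reference for what the dp4a step computes, in plain C++, useful for checking expectations on the CPU:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar reference for the SIMD primitive used above: dp4a interprets two
// 32-bit words as four signed bytes each and accumulates their products.
static int dp4a_ref(int a, int b, int c) {
    int8_t va[4], vb[4];
    std::memcpy(va, &a, 4);
    std::memcpy(vb, &b, 4);
    for (int i = 0; i < 4; ++i) {
        c += (int)va[i] * (int)vb[i];
    }
    return c;
}

int main() {
    const int8_t abytes[4] = {1, -3, 5, 0};
    const int8_t bbytes[4] = {2, 4, -6, 7};
    int a, b;
    std::memcpy(&a, abytes, 4);
    std::memcpy(&b, bbytes, 4);
    // (1*2) + (-3*4) + (5*-6) + (0*7) = -40
    std::printf("%d\n", dp4a_ref(a, b, 0));
    return 0;
}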
ggml/src/ggml-cuda/vendors/cuda.h (new file, 14 lines)
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+
+#if CUDART_VERSION < 11020
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif // CUDART_VERSION < 11020
ggml/src/ggml-cuda/vendors/hip.h (vendored, new file, 177 lines)
@@ -0,0 +1,177 @@
#pragma once

#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__

#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED

#define __CUDA_ARCH__ 1300

#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif

#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif

#if defined(__gfx1010__) || defined(__gfx1012__)
#define RDNA1
#endif

#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));

static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
    return reinterpret_cast<const int &>(c);
#else
    int8x4_t c;
    int16_t tmp;
#pragma unroll
    for (int i = 0; i < 4; i++) {
        tmp = va[i] - vb[i];
        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
        c[i] = tmp;
    }
    return reinterpret_cast<int &>(c);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
}

static __device__ __forceinline__ int __vsub4(const int a, const int b) {
    return __vsubss4(a, b);
}

static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
    }
    return c;
}

static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
    }
    return c;
}

#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
// __shfl_xor() for half2 was added in ROCm 5.6
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
    typedef union half2_b32 {
        half2 val;
        int   b32;
    } half2_b32_t;
    half2_b32_t tmp;
    tmp.val = var;
    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
    return tmp.val;
}
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
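The vendor header relies on plain token substitution: the backend keeps calling the CUDA-named APIs and the preprocessor rewrites them to HIP equivalents at compile time, so one source tree serves both runtimes. A tiny self-contained C++ sketch of that alias-macro technique (generic names, not the actual ggml sources):

// Minimal sketch of the alias-macro portability trick used by vendors/hip.h:
// shared code is written against one set of names and a per-vendor header
// remaps them. Here "vendor B" is emulated with ordinary host functions.
#include <cstdio>
#include <cstdlib>

namespace vendor_b {
    int  allocate(void ** p, std::size_t n) { *p = std::malloc(n); return *p ? 0 : 1; }
    void release(void * p)                  { std::free(p); }
}

// The per-vendor header would contain lines of this shape:
#define backendAlloc vendor_b::allocate
#define backendFree  vendor_b::release

// Shared backend code only ever uses the "canonical" names.
int main() {
    void * buf = nullptr;
    if (backendAlloc(&buf, 256) == 0) {
        std::printf("allocated 256 bytes via the remapped API\n");
        backendFree(buf);
    }
}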
ggml/src/ggml-cuda/vendors/musa.h (vendored, new file, 171 lines)
@@ -0,0 +1,171 @@
#pragma once

#include <musa_runtime.h>
#include <musa.h>
#include <mublas.h>
#include <musa_fp16.h>

#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
#define CUDA_R_16F MUSA_R_16F
#define CUDA_R_32F MUSA_R_32F
#define cublasComputeType_t cudaDataType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy
#define cublasGemmEx mublasGemmEx
#define cublasGemmBatchedEx mublasGemmBatchedEx
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
#define cublasHandle_t mublasHandle_t
#define cublasSetMathMode mublasSetMathMode
#define cublasSetStream mublasSetStream
#define cublasSgemm mublasSgemm
#define cublasStatus_t mublasStatus_t
#define cublasGetStatusString mublasStatus_to_string
#define cudaDataType_t musaDataType_t
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags
#define cudaEventDisableTiming musaEventDisableTiming
#define cudaEventRecord musaEventRecord
#define cudaEventSynchronize musaEventSynchronize
#define cudaEvent_t musaEvent_t
#define cudaEventDestroy musaEventDestroy
#define cudaFree musaFree
#define cudaFreeHost musaFreeHost
#define cudaGetDevice musaGetDevice
#define cudaGetDeviceCount musaGetDeviceCount
#define cudaGetDeviceProperties musaGetDeviceProperties
#define cudaGetErrorString musaGetErrorString
#define cudaGetLastError musaGetLastError
#define cudaHostRegister musaHostRegister
#define cudaHostRegisterPortable musaHostRegisterPortable
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
#define cudaHostUnregister musaHostUnregister
#define cudaLaunchHostFunc musaLaunchHostFunc
#define cudaMalloc musaMalloc
#define cudaMallocHost musaMallocHost
#define cudaMemcpy musaMemcpy
#define cudaMemcpyAsync musaMemcpyAsync
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
#define cudaMemcpy2DAsync musaMemcpy2DAsync
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
#define cudaMemcpyKind musaMemcpyKind
#define cudaMemset musaMemset
#define cudaMemsetAsync musaMemsetAsync
#define cudaMemGetInfo musaMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
#define cudaSetDevice musaSetDevice
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
#define cudaStreamDestroy musaStreamDestroy
#define cudaStreamFireAndForget musaStreamFireAndForget
#define cudaStreamNonBlocking musaStreamNonBlocking
#define cudaStreamPerThread musaStreamPerThread
#define cudaStreamSynchronize musaStreamSynchronize
#define cudaStreamWaitEvent musaStreamWaitEvent
#define cudaStream_t musaStream_t
#define cudaSuccess musaSuccess

// Additional mappings for MUSA virtual memory pool
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
#define CUdevice MUdevice
#define CUdeviceptr MUdeviceptr
#define CUmemAccessDesc MUmemAccessDesc
#define CUmemAllocationProp MUmemAllocationProp
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
#define cuDeviceGet muDeviceGet
#define cuDeviceGetAttribute muDeviceGetAttribute
#define cuMemAddressFree muMemAddressFree
#define cuMemAddressReserve muMemAddressReserve
#define cuMemCreate muMemCreate
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
#define cuMemMap muMemMap
#define cuMemRelease muMemRelease
#define cuMemSetAccess muMemSetAccess
#define cuMemUnmap muMemUnmap
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
#define cudaFuncSetAttribute musaFuncSetAttribute
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
#define make_cudaExtent make_musaExtent
#define make_cudaPitchedPtr make_musaPitchedPtr

// Additional mappings for MUSA graphs
#define CUDA_SUCCESS MUSA_SUCCESS
#define CUresult MUresult
#define cuGetErrorString muGetErrorString
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
#define cudaGraphDestroy musaGraphDestroy
#define cudaGraphExecDestroy musaGraphExecDestroy
#define cudaGraphExec_t musaGraphExec_t
#define cudaGraphExecUpdate musaGraphExecUpdate
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
#define cudaGraphGetNodes musaGraphGetNodes
#define cudaGraphInstantiate musaGraphInstantiate
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
#define cudaGraphLaunch musaGraphLaunch
#define cudaGraphNodeGetType musaGraphNodeGetType
#define cudaGraphNode_t musaGraphNode_t
#define cudaGraphNodeType musaGraphNodeType
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
#define cudaGraph_t musaGraph_t
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture

// XXX: Clang builtins mapping
#define __vsub4   __vsub4_musa
#define __vcmpeq4 __vcmpeq4_musa
#define __vcmpne4 __vcmpne4_musa

#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));

static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
    return __vsubss4(a, b);
}

static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
    }
    return c;
}

static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
    }
    return c;
}
@@ -80,8 +80,9 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
 /**
  * Converts float32 to brain16.
  *
- * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
- * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This is binary identical with Google Brain float conversion.
+ * Floats shall round to nearest even, and NANs shall be quiet.
+ * Subnormals aren't flushed to zero, except perhaps when used.
  * This code should vectorize nicely if using modern compilers.
  */
 static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {

@@ -95,10 +96,6 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
         h.bits = (u.i >> 16) | 64; /* force to quiet */
         return h;
     }
-    if (!(u.i & 0x7f800000)) { /* subnormal */
-        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
-        return h;
-    }
     h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
     return h;
 }
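The retained rounding line implements round-to-nearest-even: the bias 0x7fff plus the lowest kept bit ((u.i >> 16) & 1) pushes ties toward the even result, and after this change subnormal inputs simply take the same path instead of being flushed to zero. A standalone C++ check of that formula (same arithmetic as the diff, written against plain uint32_t/uint16_t rather than ggml's types, NaN handling omitted):

#include <cstdint>
#include <cstdio>
#include <cstring>

// fp32 -> bf16 by rounding the top 16 bits to nearest-even, mirroring the
// formula kept in ggml_compute_fp32_to_bf16.
static uint16_t fp32_to_bf16_rne(float s) {
    uint32_t bits;
    std::memcpy(&bits, &s, sizeof(bits));
    return static_cast<uint16_t>((bits + (0x7fff + ((bits >> 16) & 1))) >> 16);
}

static float bf16_to_fp32(uint16_t h) {
    const uint32_t bits = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

int main() {
    const float inputs[] = {1.0f, 1.0039062f, 3.1415926f, 1e-40f /* subnormal: rounded, not flushed */};
    for (float x : inputs) {
        const uint16_t h = fp32_to_bf16_rne(x);
        std::printf("%.9g -> 0x%04x -> %.9g\n", x, (unsigned)h, bf16_to_fp32(h));
    }
}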
@@ -146,6 +143,7 @@ extern "C" {
 
 #if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
+#include <sys/prctl.h>
 #endif
 
 // 16-bit float
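The diff only adds the header; presumably it is there so the SVE code path can query or pin the vector length through prctl(). A minimal Linux/AArch64 sketch under that assumption (not taken from this patch):

// Linux/AArch64-only sketch: query the current SVE vector length with prctl().
// This assumes the motivation for the new <sys/prctl.h> include.
#include <cstdio>
#include <sys/prctl.h>

int main() {
#ifdef PR_SVE_GET_VL
    const int ret = prctl(PR_SVE_GET_VL);
    if (ret >= 0) {
        std::printf("SVE vector length: %d bytes\n", ret & PR_SVE_VL_LEN_MASK);
    } else {
        std::printf("PR_SVE_GET_VL not supported on this kernel/CPU\n");
    }
#else
    std::printf("PR_SVE_GET_VL not defined on this platform\n");
#endif
}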
@@ -634,21 +632,121 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif
 
-#define GGML_HASHTABLE_FULL ((size_t)-1)
-#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
+// bitset
+
+static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
+#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
+#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
+
+static size_t ggml_bitset_size(size_t n) {
+    return (n + BITSET_MASK) >> BITSET_SHR;
+}
+
+static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
+    return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
+}
+
+static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
+    bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
+}
+
+static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
+    bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
+}
+
+// hash set
+
+#define GGML_HASHSET_FULL ((size_t)-1)
+#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
 
 struct ggml_hash_set ggml_hash_set_new(size_t size);
+void                 ggml_hash_set_free(struct ggml_hash_set * hash_set);
 
-bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+// returns the minimum size for a hash set that can hold min_sz elements
+size_t ggml_hash_size(size_t min_sz);
 
-// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
-size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
+// remove all elements from the hash set
+void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
 
-// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
-size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);
+// returns true if key is in the hash set
+static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
+static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
 
 // return index, asserts if table is full
-size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);
+static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
+
+// hash function for ggml_tensor
+static inline size_t ggml_hash(const struct ggml_tensor * p) {
+    // the last 4 bits are always zero due to alignment
+    return (size_t)(uintptr_t)p >> 4;
+}
+
+static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+    size_t h = ggml_hash(key) % hash_set->size;
+
+    // linear probing
+    size_t i = h;
+    while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
+        i = (i + 1) % hash_set->size;
+        if (i == h) {
+            // visited all hash table entries -> not found
+            return GGML_HASHSET_FULL;
+        }
+    }
+    return i;
+}
+
+static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+    size_t i = ggml_hash_find(hash_set, key);
+    return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
+}
+
+static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+    size_t h = ggml_hash(key) % hash_set->size;
+
+    // linear probing
+    size_t i = h;
+    do {
+        if (!ggml_bitset_get(hash_set->used, i)) {
+            ggml_bitset_set(hash_set->used, i);
+            hash_set->keys[i] = key;
+            return i;
+        }
+        if (hash_set->keys[i] == key) {
+            return GGML_HASHSET_ALREADY_EXISTS;
+        }
+        i = (i + 1) % hash_set->size;
+    } while (i != h);
+
+    // visited all hash table entries -> not found
+    GGML_ABORT("fatal error");
+}
+
+static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
+    size_t h = ggml_hash(key) % hash_set->size;
+
+    // linear probing
+    size_t i = h;
+    do {
+        if (!ggml_bitset_get(hash_set->used, i)) {
+            ggml_bitset_set(hash_set->used, i);
+            hash_set->keys[i] = key;
+            return i;
+        }
+        if (hash_set->keys[i] == key) {
+            return i;
+        }
+        i = (i + 1) % hash_set->size;
+    } while (i != h);
+
+    // visited all hash table entries -> not found
+    GGML_ABORT("fatal error");
+}
 
 #ifdef __cplusplus
 }

@@ -566,7 +566,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
     }
     if ((a % b) != 0) {
         fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
-        GGML_ASSERT(!"safe_divide result would've had remainder");
+        GGML_ABORT("safe_divide result would've had remainder");
     }
     return a / b;
 }

@@ -1460,7 +1460,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
 
     if (!ggml_vk_supports_op(dst)) {
         fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
-        GGML_ASSERT(!"unsupported op");
+        GGML_ABORT("unsupported op");
     }
 
     const int32_t ne00 = src0 ? src0->ne[0] : 0;

@@ -1562,7 +1562,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
         default:
             {
                 fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                GGML_ASSERT(false);
+                GGML_ABORT("fatal error");
             }
     }
 } break;

@@ -1745,7 +1745,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
     continue;
 not_implemented: {}
     fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-    //GGML_ASSERT(false);
+    //GGML_ABORT("fatal error");
 }
 
 // Evaluate sequence

@@ -869,7 +869,7 @@ static enum ggml_status ggml_metal_graph_compute(
     NSError * error = nil;
     if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
         GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
-        GGML_ASSERT(!"capture failed");
+        GGML_ABORT("capture failed");
     }
 }
 

@@ -931,7 +931,7 @@ static enum ggml_status ggml_metal_graph_compute(
 
     if (!ggml_metal_supports_op(ctx, dst)) {
         GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
-        GGML_ASSERT(!"unsupported op");
+        GGML_ABORT("unsupported op");
     }
 
     if (should_capture) {

@@ -1068,7 +1068,7 @@ static enum ggml_status ggml_metal_graph_compute(
         case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
         case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
         case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
-        default: GGML_ASSERT(false);
+        default: GGML_ABORT("fatal error");
     }
 
     bcast_row = true;

@@ -1077,7 +1077,7 @@ static enum ggml_status ggml_metal_graph_compute(
         case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
         case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
         case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
-        default: GGML_ASSERT(false);
+        default: GGML_ABORT("fatal error");
     }
 }
 

@@ -1131,7 +1131,7 @@ static enum ggml_status ggml_metal_graph_compute(
     case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
     case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
     case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
-    default: GGML_ASSERT(false);
+    default: GGML_ABORT("fatal error");
 }
 
 [encoder setComputePipelineState:pipeline];

@@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
     default:
         {
             GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-            GGML_ASSERT(false);
+            GGML_ABORT("fatal error");
         }
 } break;
 case GGML_OP_SQR:

@@ -1609,7 +1609,7 @@ static enum ggml_status ggml_metal_graph_compute(
     case GGML_TYPE_IQ1_M:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break;
     case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32].pipeline; break;
     case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32].pipeline; break;
-    default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
+    default: GGML_ABORT("MUL MAT-MAT not implemented");
 }
 
 [encoder setComputePipelineState:pipeline];

@@ -1782,14 +1782,10 @@ static enum ggml_status ggml_metal_graph_compute(
     default:
         {
             GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
-            GGML_ASSERT(false && "not implemented");
+            GGML_ABORT("not implemented");
         }
 };
 
-if (ggml_is_quantized(src0t)) {
-    GGML_ASSERT(ne00 >= nth0*nth1);
-}
-
 [encoder setComputePipelineState:pipeline];
 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
 [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];

@@ -1915,7 +1911,7 @@ static enum ggml_status ggml_metal_graph_compute(
     case GGML_TYPE_IQ1_M:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break;
     case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32].pipeline; break;
     case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32].pipeline; break;
-    default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
+    default: GGML_ABORT("MUL_MAT_ID not implemented");
 }
 
 [encoder setComputePipelineState:pipeline];

@@ -2082,7 +2078,7 @@ static enum ggml_status ggml_metal_graph_compute(
     default:
         {
             GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
-            GGML_ASSERT(false && "not implemented");
+            GGML_ABORT("not implemented");
         }
 };
 

@@ -2182,7 +2178,7 @@ static enum ggml_status ggml_metal_graph_compute(
     case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL].pipeline; break;
     case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS].pipeline; break;
     case GGML_TYPE_I32:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32   ].pipeline; break;
-    default: GGML_ASSERT(false && "not implemented");
+    default: GGML_ABORT("not implemented");
 }
 
 [encoder setComputePipelineState:pipeline];

@@ -2320,13 +2316,13 @@ static enum ggml_status ggml_metal_graph_compute(
 switch (src0->type) {
     case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
     case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
-    default: GGML_ASSERT(false);
+    default: GGML_ABORT("fatal error");
 };
 } else {
 switch (src0->type) {
     case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
     case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
-    default: GGML_ASSERT(false);
+    default: GGML_ABORT("fatal error");
 };
 }
 

@@ -2403,7 +2399,7 @@ static enum ggml_status ggml_metal_graph_compute(
 switch (dst->type) {
     case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
     case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
-    default: GGML_ASSERT(false);
+    default: GGML_ABORT("fatal error");
 };
 
 [encoder setComputePipelineState:pipeline];

@@ -2560,7 +2556,7 @@ static enum ggml_status ggml_metal_graph_compute(
 switch (order) {
     case GGML_SORT_ORDER_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
     case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
-    default: GGML_ASSERT(false);
+    default: GGML_ABORT("fatal error");
 };
 
 [encoder setComputePipelineState:pipeline];

@@ -2649,7 +2645,7 @@ static enum ggml_status ggml_metal_graph_compute(
     {
         GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
         GGML_METAL_LOG_ERROR("add template specialization for this size\n");
-        GGML_ASSERT(false && "add template specialization for this size");
+        GGML_ABORT("add template specialization for this size");
     }
 }
 } else {

@@ -2662,7 +2658,7 @@ static enum ggml_status ggml_metal_graph_compute(
     {
         GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
         GGML_METAL_LOG_ERROR("add template specialization for this size\n");
-        GGML_ASSERT(false && "add template specialization for this size");
+        GGML_ABORT("add template specialization for this size");
     }
 }
 }

@@ -2783,7 +2779,7 @@ static enum ggml_status ggml_metal_graph_compute(
     case GGML_TYPE_Q5_0:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline;   break;
     case GGML_TYPE_Q5_1:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline;   break;
     case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break;
-    default: GGML_ASSERT(false && "not implemented");
+    default: GGML_ABORT("not implemented");
 };
 } break;
 case GGML_TYPE_F16:

@@ -2791,10 +2787,10 @@ static enum ggml_status ggml_metal_graph_compute(
 switch (dstt) {
     case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
     case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
-    default: GGML_ASSERT(false && "not implemented");
+    default: GGML_ABORT("not implemented");
 };
 } break;
-default: GGML_ASSERT(false && "not implemented");
+default: GGML_ABORT("not implemented");
 }
 
 [encoder setComputePipelineState:pipeline];

@@ -2822,7 +2818,7 @@ static enum ggml_status ggml_metal_graph_compute(
 default:
     {
         GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }
 

@@ -4757,7 +4757,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
     device const float4 * y4 = (device const float4 *)yb;
     yl[0] = y4[0]; yl[1] = y4[4]; yl[2] = y4[1]; yl[3] = y4[5];
 
-    for (int row = 0; row < 2; ++row) {
+    for (int row = 0; row < 2 && first_row + row < ne01; ++row) {
 
         device const block_iq4_nl & xb = x[row*nb + ib];
         device const uint16_t * q4 = (device const uint16_t *)(xb.qs + 8*it);

@@ -4789,7 +4789,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
     yb += 16 * QK4_NL;
 }
 
-    for (int row = 0; row < 2; ++row) {
+    for (int row = 0; row < 2 && first_row + row < ne01; ++row) {
         all_sum = simd_sum(sumf[row]);
         if (tiisg == 0) {
             dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
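The two kernel_mul_mv_iq4_nl_f32_impl hunks above add the guard first_row + row < ne01 so that a kernel instance assigned the last, partially filled pair of rows neither reads past the quantized matrix nor writes past dst. The same tail-guard pattern in plain C++ (host-side illustration of the indexing only, not the Metal kernel itself):

#include <cstdio>
#include <vector>

// Host-side illustration of the loop-bound fix: each "work item" handles up to
// ROWS_PER_ITEM rows starting at first_row, but must stop at the matrix edge.
int main() {
    constexpr int ROWS_PER_ITEM = 2;
    const int ne01 = 5; // odd row count -> the last work item owns only one valid row
    std::vector<float> dst(ne01, 0.0f);

    for (int first_row = 0; first_row < ne01; first_row += ROWS_PER_ITEM) {
        for (int row = 0; row < ROWS_PER_ITEM && first_row + row < ne01; ++row) {
            dst[first_row + row] = 1.0f; // without the guard this would write dst[5]
        }
    }
    std::printf("wrote %zu rows safely\n", dst.size());
}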
[File diff suppressed because it is too large; some changed files are not shown in this excerpt.]