Merge remote-tracking branch 'origin/master' into prefix-bos

This commit is contained in:
Xiao-Yong Jin 2023-07-24 21:51:48 -05:00
commit b12859937f
42 changed files with 4343 additions and 2190 deletions

15
.gitignore vendored
View file

@ -61,3 +61,18 @@ qnt-*.txt
perf-*.txt perf-*.txt
examples/jeopardy/results.txt examples/jeopardy/results.txt
pyproject.toml
poetry.lock
poetry.toml
# Test binaries
tests/test-double-float
tests/test-grad0
tests/test-opt
tests/test-quantize-fns
tests/test-quantize-perf
tests/test-sampling
tests/test-tokenizer-0

View file

@ -186,16 +186,7 @@ if (LLAMA_BLAS)
pkg_check_modules(DepBLAS REQUIRED flexiblas_api) pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel") elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
# all Intel* libraries share the same include path # all Intel* libraries share the same include path
pkg_check_modules(DepBLAS mkl-sdl) pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
if (NOT DepBLAS)
if (BUILD_SHARED_LIBS)
set(LINK_METHOD dynamic)
else()
set(LINK_METHOD static)
endif()
string(REGEX REPLACE ".*_" "" DATA_TYPE_MODEL ${LLAMA_BLAS_VENDOR})
pkg_check_modules(DepBLAS REQUIRED mkl-${LINK_METHOD}-${DATA_TYPE_MODEL}-iomp)
endif()
elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC") elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
# this doesn't provide pkg-config # this doesn't provide pkg-config
# suggest to assign BLAS_INCLUDE_DIRS on your own # suggest to assign BLAS_INCLUDE_DIRS on your own

View file

@ -1,5 +1,8 @@
# Define the default target now so that it is always the first target # Define the default target now so that it is always the first target
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
# Binaries only useful for tests
TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
default: $(BUILD_TARGETS) default: $(BUILD_TARGETS)
@ -90,6 +93,28 @@ ifeq ($(UNAME_S),Haiku)
CXXFLAGS += -pthread CXXFLAGS += -pthread
endif endif
# detect Windows
ifneq ($(findstring _NT,$(UNAME_S)),)
_WIN32 := 1
endif
# library name prefix
ifneq ($(_WIN32),1)
LIB_PRE := lib
endif
# Dynamic Shared Object extension
ifneq ($(_WIN32),1)
DSO_EXT := .so
else
DSO_EXT := .dll
endif
# Windows Sockets 2 (Winsock) for network-capable apps
ifeq ($(_WIN32),1)
LWINSOCK2 := -lws2_32
endif
ifdef LLAMA_GPROF ifdef LLAMA_GPROF
CFLAGS += -pg CFLAGS += -pg
CXXFLAGS += -pg CXXFLAGS += -pg
@ -102,7 +127,7 @@ endif
# Architecture specific # Architecture specific
# TODO: probably these flags need to be tweaked on some architectures # TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue # feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686)) ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
# Use all CPU extensions that are available: # Use all CPU extensions that are available:
CFLAGS += -march=native -mtune=native CFLAGS += -march=native -mtune=native
CXXFLAGS += -march=native -mtune=native CXXFLAGS += -march=native -mtune=native
@ -168,8 +193,12 @@ ifdef LLAMA_CUBLAS
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o OBJS += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler NVCCFLAGS = --forward-unknown-to-host-compiler
ifdef LLAMA_CUDA_NVCC
NVCC = $(LLAMA_CUDA_NVCC)
else
NVCC = nvcc
endif #LLAMA_CUDA_NVCC
ifdef CUDA_DOCKER_ARCH ifdef CUDA_DOCKER_ARCH
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
else else
@ -198,19 +227,23 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
else else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2 NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif endif
ifdef LLAMA_CUDA_CCBIN
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@ $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS endif # LLAMA_CUBLAS
ifdef LLAMA_CLBLAST ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
CXXFLAGS += -DGGML_USE_CLBLAST CFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
# Mac provides OpenCL as a framework # Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
LDFLAGS += -lclblast -framework OpenCL LDFLAGS += -lclblast -framework OpenCL
else else
LDFLAGS += -lclblast -lOpenCL LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
endif endif
OBJS += ggml-opencl.o OBJS += ggml-opencl.o
@ -290,17 +323,20 @@ llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
common.o: examples/common.cpp examples/common.h common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
$(CXX) $(CXXFLAGS) -c $< -o $@
libllama.so: llama.o ggml.o $(OBJS) libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
clean: clean:
rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
# #
# Examples # Examples
# #
main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS) main: examples/main/main.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@echo @echo
@echo '==== Run ./main -h for help. ====' @echo '==== Run ./main -h for help. ===='
@ -324,15 +360,15 @@ embedding: examples/embedding/embedding.cpp build-info.h ggml.
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS) server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@ -349,6 +385,8 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
# Tests # Tests
# #
tests: $(TEST_TARGETS)
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS) benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
./$@ ./$@
@ -356,6 +394,23 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
.PHONY: tests clean tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
tests: $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
bash ./tests/run-tests.sh
tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0: tests/test-tokenizer-0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

View file

@ -242,6 +242,23 @@ In order to build llama.cpp you have three different options.
zig build -Doptimize=ReleaseFast zig build -Doptimize=ReleaseFast
``` ```
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
2. Add your user to **video** group
3. Install compilation dependencies.
```bash
sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
opencl clblast openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
**Notes:** With this packages you can build llama.cpp with OPENBLAS and
CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
the instructions for use and activate this options in this document below.
### Metal Build ### Metal Build
Using Metal allows the computation to be executed on the GPU for Apple devices: Using Metal allows the computation to be executed on the GPU for Apple devices:
@ -360,7 +377,7 @@ Building the program with BLAS support may lead to some performance improvements
```bash ```bash
mkdir build mkdir build
cd build cd build
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_lp64 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake --build . --config Release cmake --build . --config Release
``` ```
@ -384,7 +401,7 @@ Building the program with BLAS support may lead to some performance improvements
| Option | Legal values | Default | Description | | Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|-------------| |-------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 7.0/Turing/RTX 2000 or higher). Does not affect k-quants. | | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. | | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |

View file

@ -16,5 +16,10 @@ It is a good practice, before publishing changes to execute the full CI locally
```bash ```bash
mkdir tmp mkdir tmp
# CPU-only build
bash ./ci/run.sh ./tmp/results ./tmp/mnt bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with CUDA support
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
``` ```

179
ci/run.sh
View file

@ -1,4 +1,15 @@
#/bin/bash #/bin/bash
#
# sample usage:
#
# mkdir tmp
#
# # CPU-only build
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
if [ -z "$2" ]; then if [ -z "$2" ]; then
echo "usage: $0 <output-dir> <mnt-dir>" echo "usage: $0 <output-dir> <mnt-dir>"
@ -101,7 +112,7 @@ function gg_run_ctest_release {
(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
if [ -z $GG_BUILD_LOW_PERF ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
else else
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@ -154,6 +165,7 @@ function gg_run_open_llama_3b_v2 {
model_q4_1="${path_models}/ggml-model-q4_1.bin" model_q4_1="${path_models}/ggml-model-q4_1.bin"
model_q5_0="${path_models}/ggml-model-q5_0.bin" model_q5_0="${path_models}/ggml-model-q5_0.bin"
model_q5_1="${path_models}/ggml-model-q5_1.bin" model_q5_1="${path_models}/ggml-model-q5_1.bin"
model_q2_k="${path_models}/ggml-model-q2_k.bin"
model_q3_k="${path_models}/ggml-model-q3_k.bin" model_q3_k="${path_models}/ggml-model-q3_k.bin"
model_q4_k="${path_models}/ggml-model-q4_k.bin" model_q4_k="${path_models}/ggml-model-q4_k.bin"
model_q5_k="${path_models}/ggml-model-q5_k.bin" model_q5_k="${path_models}/ggml-model-q5_k.bin"
@ -166,21 +178,23 @@ function gg_run_open_llama_3b_v2 {
./bin/quantize ${model_f16} ${model_q4_1} q4_1 ./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0 ./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1 ./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k ./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k ./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k ./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k ./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@ -188,6 +202,7 @@ function gg_run_open_llama_3b_v2 {
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
@ -212,6 +227,7 @@ function gg_run_open_llama_3b_v2 {
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
@ -232,6 +248,133 @@ function gg_sum_open_llama_3b_v2 {
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)" gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)" gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)" gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
}
# open_llama_7b_v2
# requires: GG_BUILD_CUDA
function gg_run_open_llama_7b_v2 {
cd ${SRC}
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/open-llama/7B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.bin"
model_q8_0="${path_models}/ggml-model-q8_0.bin"
model_q4_0="${path_models}/ggml-model-q4_0.bin"
model_q4_1="${path_models}/ggml-model-q4_1.bin"
model_q5_0="${path_models}/ggml-model-q5_0.bin"
model_q5_1="${path_models}/ggml-model-q5_1.bin"
model_q2_k="${path_models}/ggml-model-q2_k.bin"
model_q3_k="${path_models}/ggml-model-q3_k.bin"
model_q4_k="${path_models}/ggml-model-q4_k.bin"
model_q5_k="${path_models}/ggml-model-q5_k.bin"
model_q6_k="${path_models}/ggml-model-q6_k.bin"
wiki_test="${path_wiki}/wiki.test.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
set +e
}
function gg_sum_open_llama_7b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 7B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)" gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
@ -240,10 +383,10 @@ function gg_sum_open_llama_3b_v2 {
## main ## main
if [ -z $GG_BUILD_LOW_PERF ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
rm -rf ${SRC}/models-mnt rm -rf ${SRC}/models-mnt
mnt_models=$(realpath ${MNT}/models) mnt_models=${MNT}/models
mkdir -p ${mnt_models} mkdir -p ${mnt_models}
ln -sfn ${mnt_models} ${SRC}/models-mnt ln -sfn ${mnt_models} ${SRC}/models-mnt
@ -252,11 +395,15 @@ fi
ret=0 ret=0
#test $ret -eq 0 && gg_run ctest_debug test $ret -eq 0 && gg_run ctest_debug
#test $ret -eq 0 && gg_run ctest_release test $ret -eq 0 && gg_run ctest_release
if [ -z $GG_BUILD_LOW_PERF ]; then if [ -z ${GG_BUILD_LOW_PERF} ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2 test $ret -eq 0 && gg_run open_llama_3b_v2
else
test $ret -eq 0 && gg_run open_llama_7b_v2
fi
fi fi
exit $ret exit $ret

View file

@ -194,13 +194,38 @@ class Params:
n_layer = n_layer, n_layer = n_layer,
) )
# LLaMA v2 70B params.json
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
@staticmethod
def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
config = json.load(open(config_path))
n_vocab = config["vocab_size"];
n_embd = config["dim"];
n_head = config["n_heads"];
n_layer = config["n_layers"];
n_mult = config["multiple_of"];
if n_vocab == -1:
n_vocab = model["tok_embeddings.weight"].shape[0]
return Params(
n_vocab = n_vocab,
n_embd = n_embd,
n_mult = n_mult,
n_head = n_head,
n_layer = n_layer,
)
@staticmethod @staticmethod
def load(model_plus: 'ModelPlus') -> 'Params': def load(model_plus: 'ModelPlus') -> 'Params':
hf_config_path = model_plus.paths[0].parent / "config.json"
orig_config_path = model_plus.paths[0].parent / "params.json" orig_config_path = model_plus.paths[0].parent / "params.json"
hf_transformer_config_path = model_plus.paths[0].parent / "config.json"
if hf_transformer_config_path.exists(): if hf_config_path.exists():
params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path) params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
elif orig_config_path.exists():
params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
else: else:
params = Params.guessed(model_plus.model) params = Params.guessed(model_plus.model)
@ -1036,8 +1061,7 @@ class OutputFile:
@staticmethod @staticmethod
def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
of = OutputFile(fname_out) of = OutputFile(fname_out)
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0)
n_head=1, n_layer=0)
of = OutputFile(fname_out) of = OutputFile(fname_out)
of.write_file_header(params, file_type=GGMLFileType.AllF32) of.write_file_header(params, file_type=GGMLFileType.AllF32)
of.write_vocab(vocab) of.write_vocab(vocab)

View file

@ -13,6 +13,8 @@ set(TARGET common)
add_library(${TARGET} OBJECT add_library(${TARGET} OBJECT
common.h common.h
common.cpp common.cpp
grammar-parser.h
grammar-parser.cpp
) )
if (BUILD_SHARED_LIBS) if (BUILD_SHARED_LIBS)

View file

@ -2,21 +2,21 @@
set -e set -e
AI_NAME="${AI_NAME:-Miku}" AI_NAME="${AI_NAME:-Miku}"
MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}" MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
USER_NAME="${USER_NAME:-Anon}" USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use. # Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}" #N_THREAD="${N_THREAD:-4}"
CTX_SIZE="${CTX_SIZE:-4096}"
N_PREDICTS="${N_PREDICTS:-4096}" N_PREDICTS="${N_PREDICTS:-4096}"
GEN_OPTIONS=(--batch_size 1024 GEN_OPTIONS=(--batch_size 1024
--ctx_size 2048 --ctx_size "$CTX_SIZE"
--keep -1 --keep -1
--repeat_last_n 256 --repeat_last_n 256
--repeat_penalty 1.17647 --repeat_penalty 1.17647
--temp 0.7 --temp 0.6
--top_k 40 --mirostat 2)
--top_p 0.5)
if [ -n "$N_THREAD" ]; then if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD") GEN_OPTIONS+=(--threads "$N_THREAD")
@ -24,16 +24,17 @@ fi
./main "${GEN_OPTIONS[@]}" \ ./main "${GEN_OPTIONS[@]}" \
--model "$MODEL" \ --model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \
--n_predict "$N_PREDICTS" \ --n_predict "$N_PREDICTS" \
--color --interactive \ --color --interactive \
--reverse-prompt "${USER_NAME}:" \ --reverse-prompt "${USER_NAME}:" \
--prompt " --prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help. ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad. ${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her. ${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
The conversation is only between ${USER_NAME} and ${AI_NAME} The conversation is only between ${USER_NAME} and ${AI_NAME}.
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice. The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
${AI_NAME} can only communicate through text, so she can't send images or videos. ${AI_NAME} can only communicate through text, so she can't send images or videos.

View file

@ -8,6 +8,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
static const float rms_norm_eps = 1e-6f;
float frand() { float frand() {
return (float)rand()/(float)RAND_MAX; return (float)rand()/(float)RAND_MAX;
} }
@ -562,7 +564,7 @@ struct ggml_tensor * forward(
// norm // norm
{ {
// cur shape [n_embd,N,1,1] // cur shape [n_embd,N,1,1]
cur = ggml_rms_norm(ctx0, inpL); cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
// cur = attention_norm*cur // cur = attention_norm*cur
cur = ggml_mul(ctx0, cur = ggml_mul(ctx0,
@ -685,7 +687,7 @@ struct ggml_tensor * forward(
// norm // norm
{ {
// cur shape [n_embd,N,1,1] // cur shape [n_embd,N,1,1]
cur = ggml_rms_norm(ctx0, inpFF); cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
// cur = ffn_norm*cur // cur = ffn_norm*cur
// cur shape [n_embd,N,1,1] // cur shape [n_embd,N,1,1]
@ -729,7 +731,7 @@ struct ggml_tensor * forward(
{ {
// inpL shape [n_embd,N,1,1] // inpL shape [n_embd,N,1,1]
inpL = ggml_rms_norm(ctx0, inpL); inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
// inpL = norm*inpL // inpL = norm*inpL
// inpL shape [n_embd,N,1,1] // inpL shape [n_embd,N,1,1]
@ -817,7 +819,7 @@ struct ggml_tensor * forward_batch(
// norm // norm
{ {
// cur shape [n_embd,N*n_batch,1,1] // cur shape [n_embd,N*n_batch,1,1]
cur = ggml_rms_norm(ctx0, inpL); cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch); assert_shape_2d(cur, n_embd, N*n_batch);
// cur = attention_norm*cur // cur = attention_norm*cur
@ -981,7 +983,7 @@ struct ggml_tensor * forward_batch(
// norm // norm
{ {
// cur shape [n_embd,N*n_batch,1,1] // cur shape [n_embd,N*n_batch,1,1]
cur = ggml_rms_norm(ctx0, inpFF); cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch); assert_shape_2d(cur, n_embd, N*n_batch);
// cur = ffn_norm*cur // cur = ffn_norm*cur
@ -1034,7 +1036,7 @@ struct ggml_tensor * forward_batch(
{ {
// inpL shape [n_embd,N*n_batch,1,1] // inpL shape [n_embd,N*n_batch,1,1]
inpL = ggml_rms_norm(ctx0, inpL); inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(inpL, n_embd, N*n_batch); assert_shape_2d(inpL, n_embd, N*n_batch);
// inpL = norm*inpL // inpL = norm*inpL
@ -1104,7 +1106,7 @@ struct ggml_tensor * forward_lora(
// norm // norm
{ {
// cur shape [n_embd,N,1,1] // cur shape [n_embd,N,1,1]
cur = ggml_rms_norm(ctx0, inpL); cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
// cur = attention_norm*cur // cur = attention_norm*cur
cur = ggml_mul(ctx0, cur = ggml_mul(ctx0,
@ -1251,7 +1253,7 @@ struct ggml_tensor * forward_lora(
// norm // norm
{ {
// cur shape [n_embd,N,1,1] // cur shape [n_embd,N,1,1]
cur = ggml_rms_norm(ctx0, inpFF); cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
// cur = ffn_norm*cur // cur = ffn_norm*cur
// cur shape [n_embd,N,1,1] // cur shape [n_embd,N,1,1]
@ -1295,7 +1297,7 @@ struct ggml_tensor * forward_lora(
{ {
// inpL shape [n_embd,N,1,1] // inpL shape [n_embd,N,1,1]
inpL = ggml_rms_norm(ctx0, inpL); inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
// inpL = norm*inpL // inpL = norm*inpL
// inpL shape [n_embd,N,1,1] // inpL shape [n_embd,N,1,1]

View file

@ -117,6 +117,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.n_threads = std::stoi(argv[i]); params.n_threads = std::stoi(argv[i]);
if (params.n_threads <= 0) {
params.n_threads = std::thread::hardware_concurrency();
}
} else if (arg == "-p" || arg == "--prompt") { } else if (arg == "-p" || arg == "--prompt") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -168,6 +171,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.n_ctx = std::stoi(argv[i]); params.n_ctx = std::stoi(argv[i]);
} else if (arg == "-gqa" || arg == "--gqa") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_gqa = std::stoi(argv[i]);
} else if (arg == "-eps" || arg == "--rms-norm-eps") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rms_norm_eps = std::stof(argv[i]);
} else if (arg == "--rope-freq-base") { } else if (arg == "--rope-freq-base") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -260,12 +275,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.cfg_scale = std::stof(argv[i]); params.cfg_scale = std::stof(argv[i]);
} else if (arg == "--cfg-smooth-factor") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cfg_smooth_factor = std::stof(argv[i]);
} else if (arg == "-b" || arg == "--batch-size") { } else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -393,6 +402,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.antiprompt.push_back(argv[i]); params.antiprompt.push_back(argv[i]);
} else if (arg == "--perplexity") { } else if (arg == "--perplexity") {
params.perplexity = true; params.perplexity = true;
} else if (arg == "--perplexity-lines") {
params.perplexity_lines = true;
} else if (arg == "--ignore-eos") { } else if (arg == "--ignore-eos") {
params.logit_bias[llama_token_eos()] = -INFINITY; params.logit_bias[llama_token_eos()] = -INFINITY;
} else if (arg == "--no-penalize-nl") { } else if (arg == "--no-penalize-nl") {
@ -435,6 +446,28 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break; break;
} }
params.input_suffix = argv[i]; params.input_suffix = argv[i];
} else if (arg == "--grammar") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.grammar = argv[i];
} else if (arg == "--grammar-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(params.grammar)
);
} else { } else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params); gpt_print_usage(argc, argv, default_params);
@ -464,92 +497,96 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
} }
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]); fprintf(stdout, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n"); fprintf(stdout, "\n");
fprintf(stderr, "options:\n"); fprintf(stdout, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stdout, " -h, --help show this help message and exit\n");
fprintf(stderr, " -i, --interactive run in interactive mode\n"); fprintf(stdout, " -i, --interactive run in interactive mode\n");
fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n"); fprintf(stdout, " --interactive-first run in interactive mode and wait for input right away\n");
fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); fprintf(stdout, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
fprintf(stderr, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); fprintf(stdout, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n"); fprintf(stdout, " -r PROMPT, --reverse-prompt PROMPT\n");
fprintf(stderr, " halt generation at PROMPT, return control in interactive mode\n"); fprintf(stdout, " halt generation at PROMPT, return control in interactive mode\n");
fprintf(stderr, " (can be specified more than once for multiple prompts).\n"); fprintf(stdout, " (can be specified more than once for multiple prompts).\n");
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n"); fprintf(stdout, " --color colorise output to distinguish prompt and user input from generations\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); fprintf(stdout, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); fprintf(stdout, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: empty)\n"); fprintf(stdout, " prompt to start generation with (default: empty)\n");
fprintf(stderr, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); fprintf(stdout, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
fprintf(stderr, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); fprintf(stdout, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
fprintf(stderr, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); fprintf(stdout, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
fprintf(stderr, " not supported with --interactive or other interactive options\n"); fprintf(stdout, " not supported with --interactive or other interactive options\n");
fprintf(stderr, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); fprintf(stdout, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
fprintf(stderr, " --random-prompt start with a randomized prompt.\n"); fprintf(stdout, " --random-prompt start with a randomized prompt.\n");
fprintf(stderr, " --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n"); fprintf(stdout, " --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n"); fprintf(stdout, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); fprintf(stdout, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
fprintf(stderr, " -f FNAME, --file FNAME\n"); fprintf(stdout, " -f FNAME, --file FNAME\n");
fprintf(stderr, " prompt file to start generation.\n"); fprintf(stdout, " prompt file to start generation.\n");
fprintf(stderr, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
fprintf(stderr, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stderr, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
fprintf(stderr, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
fprintf(stderr, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
fprintf(stderr, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); fprintf(stdout, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
fprintf(stderr, " --mirostat N use Mirostat sampling.\n"); fprintf(stdout, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
fprintf(stderr, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); fprintf(stdout, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
fprintf(stderr, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); fprintf(stdout, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
fprintf(stderr, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); fprintf(stdout, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
fprintf(stderr, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); fprintf(stdout, " --mirostat N use Mirostat sampling.\n");
fprintf(stderr, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); fprintf(stdout, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n"); fprintf(stdout, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); fprintf(stdout, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); fprintf(stdout, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
fprintf(stderr, " --cfg-negative-prompt PROMPT \n"); fprintf(stdout, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
fprintf(stderr, " negative prompt to use for guidance. (default: empty)\n"); fprintf(stdout, " modifies the likelihood of token appearing in the completion,\n");
fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); fprintf(stdout, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
fprintf(stderr, " --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor); fprintf(stdout, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stdout, " --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --grammar-file FNAME file to read grammar from\n");
fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); fprintf(stdout, " --cfg-negative-prompt PROMPT \n");
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n");
fprintf(stderr, " --no-penalize-nl do not penalize newline token\n"); fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp); fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stdout, " --no-penalize-nl do not penalize newline token\n");
fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
fprintf(stderr, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp);
fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n");
fprintf(stdout, " --perplexity-lines compute perplexity over each line of the prompt\n");
fprintf(stdout, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
if (llama_mlock_supported()) { if (llama_mlock_supported()) {
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
} }
if (llama_mmap_supported()) { if (llama_mmap_supported()) {
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
} }
fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n"); fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n");
fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n"); fprintf(stdout, " if run without this previously, it is recommended to drop the system page cache before using this\n");
fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n"); fprintf(stdout, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
fprintf(stderr, " number of layers to store in VRAM\n"); fprintf(stdout, " number of layers to store in VRAM\n");
fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" ); fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
#endif #endif
fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stdout, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n"); fprintf(stdout, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stdout, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n"); fprintf(stdout, "\n");
} }
std::string gpt_random_prompt(std::mt19937 & rng) { std::string gpt_random_prompt(std::mt19937 & rng) {
@ -587,9 +624,11 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
lparams.n_ctx = params.n_ctx; lparams.n_ctx = params.n_ctx;
lparams.n_batch = params.n_batch; lparams.n_batch = params.n_batch;
lparams.n_gqa = params.n_gqa;
lparams.rms_norm_eps = params.rms_norm_eps;
lparams.n_gpu_layers = params.n_gpu_layers; lparams.n_gpu_layers = params.n_gpu_layers;
lparams.main_gpu = params.main_gpu; lparams.main_gpu = params.main_gpu;
memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float)); lparams.tensor_split = params.tensor_split;
lparams.low_vram = params.low_vram; lparams.low_vram = params.low_vram;
lparams.seed = params.seed; lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16; lparams.f16_kv = params.memory_f16;

View file

@ -27,12 +27,14 @@ struct gpt_params {
int32_t n_predict = -1; // new tokens to predict int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_gqa = 1; // grouped-query attention factor (TODO: move to hparams)
int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = 0; // number of layers to store in VRAM int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
float rms_norm_eps = 1e-6; // rms norm epsilon
float rope_freq_base = 10000.0f; // RoPE base frequency float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
@ -47,7 +49,7 @@ struct gpt_params {
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float frequency_penalty = 0.00f; // 0.0 = disabled float frequency_penalty = 0.00f; // 0.0 = disabled
float presence_penalty = 0.00f; // 0.0 = disabled float presence_penalty = 0.00f; // 0.0 = disabled
int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate float mirostat_eta = 0.10f; // learning rate
@ -55,7 +57,6 @@ struct gpt_params {
// https://arxiv.org/abs/2306.17806 // https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // How strong is guidance float cfg_scale = 1.f; // How strong is guidance
float cfg_smooth_factor = 1.f; // Smooth factor between old and new logits
std::string model = "models/7B/ggml-model.bin"; // model path std::string model = "models/7B/ggml-model.bin"; // model path
std::string model_alias = "unknown"; // model alias std::string model_alias = "unknown"; // model alias
@ -63,6 +64,7 @@ struct gpt_params {
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with std::string input_suffix = ""; // string to suffix user inputs with
std::string grammar = ""; // optional BNF-like grammar to constrain sampling
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string lora_adapter = ""; // lora adapter path std::string lora_adapter = ""; // lora adapter path
@ -84,6 +86,7 @@ struct gpt_params {
bool instruct = false; // instruction mode (used for Alpaca models) bool instruct = false; // instruction mode (used for Alpaca models)
bool penalize_nl = true; // consider newlines as a repeatable token bool penalize_nl = true; // consider newlines as a repeatable token
bool perplexity = false; // compute perplexity over the prompt bool perplexity = false; // compute perplexity over the prompt
bool perplexity_lines = false; // compute perplexity over each line of the prompt
bool use_mmap = true; // use mmap for faster loads bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage bool mem_test = false; // compute maximum memory usage

View file

@ -64,7 +64,7 @@ class MiniGPT4(Blip2Base):
self.max_txt_len = max_txt_len self.max_txt_len = max_txt_len
self.end_sym = end_sym self.end_sym = end_sym
self.model = MyModel(["main", *args]) self.model = MyModel(["main", *args])
# system promt # system prompt
self.model.eval_string("Give the following image: <Img>ImageContent</Img>. " self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
"You will be able to see the image once I provide it to you. Please answer my questions." "You will be able to see the image once I provide it to you. Please answer my questions."
"###") "###")

423
examples/grammar-parser.cpp Normal file
View file

@ -0,0 +1,423 @@
#include "grammar-parser.h"
#include <cstdint>
#include <cwchar>
#include <string>
#include <utility>
#include <stdexcept>
#include <exception>
namespace grammar_parser {
// NOTE: assumes valid utf8 (but checks for overrun)
// copied from llama.cpp
std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t first_byte = static_cast<uint8_t>(*src);
uint8_t highbits = first_byte >> 4;
int len = lookup[highbits];
uint8_t mask = (1 << (8 - len)) - 1;
uint32_t value = first_byte & mask;
const char * end = src + len; // may overrun!
const char * pos = src + 1;
for ( ; pos < end && *pos; pos++) {
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
}
return std::make_pair(value, pos);
}
uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
return result.first->second;
}
uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
return next_id;
}
void add_rule(
parse_state & state,
uint32_t rule_id,
const std::vector<llama_grammar_element> & rule) {
if (state.rules.size() <= rule_id) {
state.rules.resize(rule_id + 1);
}
state.rules[rule_id] = rule;
}
bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
}
std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
const char * pos = src;
const char * end = src + size;
uint32_t value = 0;
for ( ; pos < end && *pos; pos++) {
value <<= 4;
char c = *pos;
if ('a' <= c && c <= 'f') {
value += c - 'a' + 10;
} else if ('A' <= c && c <= 'F') {
value += c - 'A' + 10;
} else if ('0' <= c && c <= '9') {
value += c - '0';
} else {
break;
}
}
if (pos != end) {
throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
}
return std::make_pair(value, pos);
}
const char * parse_space(const char * src, bool newline_ok) {
const char * pos = src;
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
if (*pos == '#') {
while (*pos && *pos != '\r' && *pos != '\n') {
pos++;
}
} else {
pos++;
}
}
return pos;
}
const char * parse_name(const char * src) {
const char * pos = src;
while (is_word_char(*pos)) {
pos++;
}
if (pos == src) {
throw std::runtime_error(std::string("expecting name at ") + src);
}
return pos;
}
std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
case 'x': return parse_hex(src + 2, 2);
case 'u': return parse_hex(src + 2, 4);
case 'U': return parse_hex(src + 2, 8);
case 't': return std::make_pair('\t', src + 2);
case 'r': return std::make_pair('\r', src + 2);
case 'n': return std::make_pair('\n', src + 2);
case '\\':
case '"':
case '[':
case ']':
return std::make_pair(src[1], src + 2);
default:
throw std::runtime_error(std::string("unknown escape at ") + src);
}
} else if (*src) {
return decode_utf8(src);
}
throw std::runtime_error("unexpected end of input");
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested);
const char * parse_sequence(
parse_state & state,
const char * src,
const std::string & rule_name,
std::vector<llama_grammar_element> & out_elements,
bool is_nested) {
size_t last_sym_start = out_elements.size();
const char * pos = src;
while (*pos) {
if (*pos == '"') { // literal string
pos++;
last_sym_start = out_elements.size();
while (*pos != '"') {
auto char_pair = parse_char(pos);
pos = char_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '[') { // char range(s)
pos++;
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
if (*pos == '^') {
pos++;
start_type = LLAMA_GRETYPE_CHAR_NOT;
}
last_sym_start = out_elements.size();
while (*pos != ']') {
auto char_pair = parse_char(pos);
pos = char_pair.second;
enum llama_gretype type = last_sym_start < out_elements.size()
? LLAMA_GRETYPE_CHAR_ALT
: start_type;
out_elements.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') {
auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
}
}
pos = parse_space(pos + 1, is_nested);
} else if (is_word_char(*pos)) { // rule reference
const char * name_end = parse_name(pos);
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
pos = parse_space(name_end, is_nested);
last_sym_start = out_elements.size();
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
} else if (*pos == '(') { // grouping
// parse nested alternates into synthesized rule
pos = parse_space(pos + 1, true);
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
last_sym_start = out_elements.size();
// output reference to synthesized rule
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
if (*pos != ')') {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
}
// apply transformation to previous symbol (last_sym_start to end) according to
// rewrite rules:
// S* --> S' ::= S S' |
// S+ --> S' ::= S S' | S
// S? --> S' ::= S |
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
std::vector<llama_grammar_element> sub_rule;
// add preceding symbol to generated rule
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
if (*pos == '*' || *pos == '+') {
// cause generated rule to recurse
sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
}
// mark start of alternate def
sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
if (*pos == '+') {
// add preceding symbol as alternate only for '+' (otherwise empty)
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
}
sub_rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, sub_rule_id, sub_rule);
// in original rule, replace previous symbol with reference to generated rule
out_elements.resize(last_sym_start);
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
pos = parse_space(pos + 1, is_nested);
} else {
break;
}
}
return pos;
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested) {
std::vector<llama_grammar_element> rule;
const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
while (*pos == '|') {
rule.push_back({LLAMA_GRETYPE_ALT, 0});
pos = parse_space(pos + 1, true);
pos = parse_sequence(state, pos, rule_name, rule, is_nested);
}
rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, rule_id, rule);
return pos;
}
const char * parse_rule(parse_state & state, const char * src) {
const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src;
uint32_t rule_id = get_symbol_id(state, src, name_len);
const std::string name(src, name_len);
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
throw std::runtime_error(std::string("expecting ::= at ") + pos);
}
pos = parse_space(pos + 3, true);
pos = parse_alternates(state, pos, name, rule_id, false);
if (*pos == '\r') {
pos += pos[1] == '\n' ? 2 : 1;
} else if (*pos == '\n') {
pos++;
} else if (*pos) {
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
}
return parse_space(pos, true);
}
parse_state parse(const char * src) {
try {
parse_state state;
const char * pos = parse_space(src, true);
while (*pos) {
pos = parse_rule(state, pos);
}
return state;
} catch (const std::exception & err) {
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
return parse_state();
}
}
void print_grammar_char(FILE * file, uint32_t c) {
if (0x20 <= c && c <= 0x7f) {
fprintf(file, "%c", static_cast<char>(c));
} else {
// cop out of encoding UTF-8
fprintf(file, "<U+%04X>", c);
}
}
bool is_char_element(llama_grammar_element elem) {
switch (elem.type) {
case LLAMA_GRETYPE_CHAR: return true;
case LLAMA_GRETYPE_CHAR_NOT: return true;
case LLAMA_GRETYPE_CHAR_ALT: return true;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
default: return false;
}
}
void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
for (auto elem : rule) {
switch (elem.type) {
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
}
switch (elem.type) {
case LLAMA_GRETYPE_END:
case LLAMA_GRETYPE_ALT:
case LLAMA_GRETYPE_RULE_REF:
fprintf(file, "(%u) ", elem.value);
break;
case LLAMA_GRETYPE_CHAR:
case LLAMA_GRETYPE_CHAR_NOT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ALT:
fprintf(file, "(\"");
print_grammar_char(file, elem.value);
fprintf(file, "\") ");
break;
}
}
fprintf(file, "\n");
}
void print_rule(
FILE * file,
uint32_t rule_id,
const std::vector<llama_grammar_element> & rule,
const std::map<uint32_t, std::string> & symbol_id_names) {
if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
throw std::runtime_error(
"malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
}
fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
llama_grammar_element elem = rule[i];
switch (elem.type) {
case LLAMA_GRETYPE_END:
throw std::runtime_error(
"unexpected end of rule: " + std::to_string(rule_id) + "," +
std::to_string(i));
case LLAMA_GRETYPE_ALT:
fprintf(file, "| ");
break;
case LLAMA_GRETYPE_RULE_REF:
fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
break;
case LLAMA_GRETYPE_CHAR:
fprintf(file, "[");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_NOT:
fprintf(file, "[^");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
fprintf(file, "-");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_ALT:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
print_grammar_char(file, elem.value);
break;
}
if (is_char_element(elem)) {
switch (rule[i + 1].type) {
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
break;
default:
fprintf(file, "] ");
}
}
}
fprintf(file, "\n");
}
void print_grammar(FILE * file, const parse_state & state) {
try {
std::map<uint32_t, std::string> symbol_id_names;
for (auto kv : state.symbol_ids) {
symbol_id_names[kv.second] = kv.first;
}
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
// fprintf(file, "%zu: ", i);
// print_rule_binary(file, state.rules[i]);
print_rule(file, i, state.rules[i], symbol_id_names);
// fprintf(file, "\n");
}
} catch (const std::exception & err) {
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
}
}
std::vector<const llama_grammar_element *> parse_state::c_rules() {
std::vector<const llama_grammar_element *> ret;
for (const auto & rule : rules) {
ret.push_back(rule.data());
}
return ret;
}
}

29
examples/grammar-parser.h Normal file
View file

@ -0,0 +1,29 @@
// Implements a parser for an extended Backus-Naur form (BNF), producing the
// binary context-free grammar format specified by llama.h. Supports character
// ranges, grouping, and repetition operators. As an example, a grammar for
// arithmetic might look like:
//
// root ::= expr
// expr ::= term ([-+*/] term)*
// term ::= num | "(" space expr ")" space
// num ::= [0-9]+ space
// space ::= [ \t\n]*
#pragma once
#include "llama.h"
#include <vector>
#include <map>
#include <cstdint>
#include <string>
namespace grammar_parser {
struct parse_state {
std::map<std::string, uint32_t> symbol_ids;
std::vector<std::vector<llama_grammar_element>> rules;
std::vector<const llama_grammar_element *> c_rules();
};
parse_state parse(const char * src);
void print_grammar(FILE * file, const parse_state & state);
}

18
examples/llama2-13b.sh Executable file
View file

@ -0,0 +1,18 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
--temp 0.2 \
--repeat_penalty 1.1 \
-t 8

18
examples/llama2.sh Executable file
View file

@ -0,0 +1,18 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
--temp 0.2 \
--repeat_penalty 1.1 \
-t 8

23
examples/llm.vim Normal file
View file

@ -0,0 +1,23 @@
function! Llm()
let url = "http://127.0.0.1:8080/completion"
" Get the content of the current buffer
let buffer_content = join(getline(1, '$'), "\n")
" Create the JSON payload
let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false}
let json_payload.prompt = buffer_content
" Define the curl command
let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url
let response = system(curl_command, json_encode(json_payload))
" Extract the content field from the response
let content = json_decode(response).content
" Insert the content at the cursor position
call setline(line('.'), getline('.') . content)
endfunction
command! Llm call Llm()

View file

@ -6,6 +6,7 @@
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "build-info.h" #include "build-info.h"
#include "grammar-parser.h"
#include <cassert> #include <cassert>
#include <cinttypes> #include <cinttypes>
@ -93,8 +94,8 @@ int main(int argc, char ** argv) {
} }
if (params.n_ctx > 2048) { if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);" // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
" you are on your own\n", __func__, params.n_ctx); fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
} else if (params.n_ctx < 8) { } else if (params.n_ctx < 8) {
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8; params.n_ctx = 8;
@ -139,17 +140,14 @@ int main(int argc, char ** argv) {
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
} }
// determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
// uncomment the "used_mem" line in llama.cpp to see the results // uncomment the "used_mem" line in llama.cpp to see the results
if (params.mem_test) { if (params.mem_test) {
{ {
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos()); fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
}
{ const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
const std::vector<llama_token> tmp = { 0, }; llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
} }
llama_print_timings(ctx); llama_print_timings(ctx);
@ -344,6 +342,31 @@ int main(int argc, char ** argv) {
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
fprintf(stderr, "\n\n"); fprintf(stderr, "\n\n");
grammar_parser::parse_state parsed_grammar;
llama_grammar * grammar = NULL;
if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
return 1;
}
fprintf(stderr, "%s: grammar:\n", __func__);
grammar_parser::print_grammar(stderr, parsed_grammar);
fprintf(stderr, "\n");
{
auto it = params.logit_bias.find(llama_token_eos());
if (it != params.logit_bias.end() && it->second == -INFINITY) {
fprintf(stderr,
"%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
}
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
}
// TODO: replace with ring-buffer // TODO: replace with ring-buffer
std::vector<llama_token> last_n_tokens(n_ctx); std::vector<llama_token> last_n_tokens(n_ctx);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
@ -561,7 +584,7 @@ int main(int argc, char ** argv) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
if (ctx_guidance) { if (ctx_guidance) {
llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor); llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale);
} }
// Apply penalties // Apply penalties
@ -577,6 +600,10 @@ int main(int argc, char ** argv) {
logits[llama_token_nl()] = nl_logit; logits[llama_token_nl()] = nl_logit;
} }
if (grammar != NULL) {
llama_sample_grammar(ctx, &candidates_p, grammar);
}
if (temp <= 0) { if (temp <= 0) {
// Greedy sampling // Greedy sampling
id = llama_sample_token_greedy(ctx, &candidates_p); id = llama_sample_token_greedy(ctx, &candidates_p);
@ -602,6 +629,10 @@ int main(int argc, char ** argv) {
} }
// printf("`%d`", candidates_p.size); // printf("`%d`", candidates_p.size);
if (grammar != NULL) {
llama_grammar_accept_token(ctx, grammar, id);
}
last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id); last_n_tokens.push_back(id);
} }
@ -745,6 +776,18 @@ int main(int argc, char ** argv) {
} }
if (n_past > 0) { if (n_past > 0) {
if (is_interacting) {
// reset grammar state if we're restarting generation
if (grammar != NULL) {
llama_grammar_free(grammar);
std::vector<const llama_grammar_element *> grammar_rules(
parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(),
parsed_grammar.symbol_ids.at("root"));
}
}
is_interacting = false; is_interacting = false;
} }
} }
@ -772,6 +815,9 @@ int main(int argc, char ** argv) {
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
if (grammar != NULL) {
llama_grammar_free(grammar);
}
llama_backend_free(); llama_backend_free();
return 0; return 0;

92
examples/make-ggml.py Normal file
View file

@ -0,0 +1,92 @@
"""
This script converts Hugging Face llama models to GGML and quantizes them.
Usage:
python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
Arguments:
- --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
Quant types:
- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
- Q2_K: smallest, extreme quality loss - not recommended
- Q3_K: alias for Q3_K_M
- Q3_K_S: very small, very high quality loss
- Q3_K_M: very small, very high quality loss
- Q3_K_L: small, substantial quality loss
- Q4_K: alias for Q4_K_M
- Q4_K_S: small, significant quality loss
- Q4_K_M: medium, balanced quality - recommended
- Q5_K: alias for Q5_K_M
- Q5_K_S: large, low quality loss - recommended
- Q5_K_M: large, very low quality loss - recommended
- Q6_K: very large, extremely low quality loss
- Q8_0: very large, extremely low quality loss - not recommended
- F16: extremely large, virtually no quality loss - not recommended
- F32: absolutely huge, lossless - not recommended
"""
import subprocess
subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
import argparse
import os
from huggingface_hub import snapshot_download
def main(model, outname, outdir, quants, keep_fp16):
ggml_version = "v3"
if not os.path.isdir(model):
print(f"Model not found at {model}. Downloading...")
try:
if outname is None:
outname = model.split('/')[-1]
model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
except Exception as e:
raise Exception(f"Could not download the model: {e}")
if outdir is None:
outdir = f'../models/{outname}'
if not os.path.isfile(f"{model}/config.json"):
raise Exception(f"Could not find config.json in {model}")
os.makedirs(outdir, exist_ok=True)
print("Building llama.cpp")
subprocess.run(f"cd .. && make quantize", shell=True, check=True)
fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin"
print(f"Making unquantised GGML at {fp16}")
if not os.path.isfile(fp16):
subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
else:
print(f"Unquantised GGML already exists at: {fp16}")
print("Making quants")
for type in quants:
outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin"
print(f"Making {type} : {outfile}")
subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
if not keep_fp16:
os.remove(fp16)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name')
parser.add_argument('--outname', default=None, help='Output model(s) name')
parser.add_argument('--outdir', default=None, help='Output directory')
parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)
args = parser.parse_args()
main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16)

View file

@ -4,6 +4,7 @@
#include <cmath> #include <cmath>
#include <ctime> #include <ctime>
#include <sstream>
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
@ -120,6 +121,77 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
printf("\n"); printf("\n");
} }
void perplexity_lines(llama_context * ctx, const gpt_params & params) {
// Calculates perplexity over each line of the prompt
std::vector<std::string> prompt_lines;
std::istringstream strstream(params.prompt);
std::string line;
while (std::getline(strstream,line,'\n')) {
prompt_lines.push_back(line);
}
const int n_vocab = llama_n_vocab(ctx);
int counttotal = 0;
size_t n_lines = prompt_lines.size();
double nll = 0.0;
fprintf(stderr, "%s: calculating perplexity over %lu lines\n", __func__, n_lines);
printf("\nLine\tPPL line\tPPL cumulative\n");
for (size_t i = 0; i < n_lines; ++i) {
// Tokenize and insert BOS at start
std::vector<int> batch_embd = ::llama_tokenize(ctx, prompt_lines[i], true);
size_t batch_size = batch_embd.size();
// Stop if line is too long
if( batch_size > (size_t)params.n_ctx ) {
fprintf(stderr, "%s : tokens in line %lu > n_ctxl\n", __func__, i);
return;
}
if (llama_eval(ctx, batch_embd.data(), batch_size, 0, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
const auto batch_logits = llama_get_logits(ctx);
std::vector<float> logits;
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
double nllline = 0.0;
int countline = 0;
// Perplexity over second half of the line
for (size_t j = batch_size/2; j < batch_size - 1; ++j) {
// Calculate probability of next token, given the previous ones.
const std::vector<float> tok_logits(
logits.begin() + (j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab);
const float prob = softmax(tok_logits)[batch_embd[ j + 1]];
nllline += -std::log(prob);
++countline;
}
nll += nllline;
counttotal += countline;
// perplexity is e^(average negative log-likelihood)
printf("%lu\t%.8lf\t%.8lf\n", i + 1, std::exp(nllline/countline), std::exp(nll / counttotal) );
fflush(stdout);
}
printf("\n");
}
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
gpt_params params; gpt_params params;
@ -168,7 +240,11 @@ int main(int argc, char ** argv) {
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
} }
if (params.perplexity_lines) {
perplexity_lines(ctx, params);
} else {
perplexity(ctx, params); perplexity(ctx, params);
}
llama_print_timings(ctx); llama_print_timings(ctx);
llama_free(ctx); llama_free(ctx);

View file

@ -7,6 +7,9 @@ target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}> SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
) )
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO) if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO) add_dependencies(${TARGET} BUILD_INFO)

File diff suppressed because it is too large Load diff

View file

@ -73,6 +73,37 @@
margin: 0; margin: 0;
} }
fieldset.two {
display: grid;
grid-template: "a a";
gap: 1em;
}
fieldset.three {
display: grid;
grid-template: "a a a";
gap: 1em;
}
details {
border: 1px solid #aaa;
border-radius: 4px;
padding: 0.5em 0.5em 0;
margin-top: 0.5em;
}
summary {
font-weight: bold;
margin: -0.5em -0.5em 0;
padding: 0.5em;
cursor: pointer;
}
details[open] {
padding: 0.5em;
}
textarea { textarea {
padding: 5px; padding: 5px;
flex-grow: 1; flex-grow: 1;
@ -125,10 +156,17 @@
const params = signal({ const params = signal({
n_predict: 400, n_predict: 400,
temperature: 0.7, temperature: 0.7,
repeat_last_n: 256, repeat_last_n: 256, // 0 = disable penalty, -1 = context size
repeat_penalty: 1.18, repeat_penalty: 1.18, // 1.0 = disabled
top_k: 40, top_k: 40, // <= 0 to use vocab size
top_p: 0.5, top_p: 0.5, // 1.0 = disabled
tfs_z: 1.0, // 1.0 = disabled
typical_p: 1.0, // 1.0 = disabled
presence_penalty: 0.0, // 0.0 = disabled
frequency_penalty: 0.0, // 0.0 = disabled
mirostat: 0, // 0/1/2
mirostat_tau: 5, // target entropy
mirostat_eta: 0.1, // learning rate
}) })
const llamaStats = signal(null) const llamaStats = signal(null)
@ -264,6 +302,27 @@
const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value } const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value } const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) } const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
const FloatField = ({label, max, min, name, step, value}) => {
return html`
<div>
<label for="${name}">${label}</label>
<input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsFloat} />
<span>${value}</span>
</div>
`
};
const IntField = ({label, max, min, name, value}) => {
return html`
<div>
<label for="${name}">${label}</label>
<input type="range" id="${name}" min="${min}" max="${max}" name="${name}" value="${value}" oninput=${updateParamsInt} />
<span>${value}</span>
</div>
`
};
return html` return html`
<form> <form>
@ -272,7 +331,9 @@
<label for="prompt">Prompt</label> <label for="prompt">Prompt</label>
<textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/> <textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
</div> </div>
</fieldset>
<fieldset class="two">
<div> <div>
<label for="user">User name</label> <label for="user">User name</label>
<input type="text" name="user" value="${session.value.user}" oninput=${updateSession} /> <input type="text" name="user" value="${session.value.user}" oninput=${updateSession} />
@ -282,7 +343,9 @@
<label for="bot">Bot name</label> <label for="bot">Bot name</label>
<input type="text" name="char" value="${session.value.char}" oninput=${updateSession} /> <input type="text" name="char" value="${session.value.char}" oninput=${updateSession} />
</div> </div>
</fieldset>
<fieldset>
<div> <div>
<label for="template">Prompt template</label> <label for="template">Prompt template</label>
<textarea id="template" name="template" value="${session.value.template}" rows=4 oninput=${updateSession}/> <textarea id="template" name="template" value="${session.value.template}" rows=4 oninput=${updateSession}/>
@ -292,32 +355,35 @@
<label for="template">Chat history template</label> <label for="template">Chat history template</label>
<textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/> <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
</div> </div>
<div>
<label for="temperature">Temperature</label>
<input type="range" id="temperature" min="0.0" max="1.0" step="0.01" name="temperature" value="${params.value.temperature}" oninput=${updateParamsFloat} />
<span>${params.value.temperature}</span>
</div>
<div>
<label for="nPredict">Predictions</label>
<input type="range" id="nPredict" min="1" max="2048" step="1" name="n_predict" value="${params.value.n_predict}" oninput=${updateParamsFloat} />
<span>${params.value.n_predict}</span>
</div>
<div>
<label for="repeat_penalty">Penalize repeat sequence</label>
<input type="range" id="repeat_penalty" min="0.0" max="2.0" step="0.01" name="repeat_penalty" value="${params.value.repeat_penalty}" oninput=${updateParamsFloat} />
<span>${params.value.repeat_penalty}</span>
</div>
<div>
<label for="repeat_last_n">Consider N tokens for penalize</label>
<input type="range" id="repeat_last_n" min="0.0" max="2048" name="repeat_last_n" value="${params.value.repeat_last_n}" oninput=${updateParamsFloat} />
<span>${params.value.repeat_last_n}</span>
</div>
</fieldset> </fieldset>
<fieldset class="two">
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
${FloatField({label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty})}
${IntField({label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n})}
${IntField({label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k})}
${FloatField({label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p})}
</fieldset>
<details>
<summary>More options</summary>
<fieldset class="two">
${FloatField({label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z})}
${FloatField({label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p})}
${FloatField({label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty})}
${FloatField({label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty})}
</fieldset>
<hr />
<fieldset class="three">
<div>
<label><input type="radio" name="mirostat" value="0" checked=${params.value.mirostat == 0} oninput=${updateParamsInt} /> no Mirostat</label>
<label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
<label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
</div>
${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
</fieldset>
</details>
</form> </form>
` `
} }

View file

@ -601,47 +601,48 @@ struct llama_server_context
static void server_print_usage(const char *argv0, const gpt_params &params, static void server_print_usage(const char *argv0, const gpt_params &params,
const server_params &sparams) const server_params &sparams)
{ {
fprintf(stderr, "usage: %s [options]\n", argv0); fprintf(stdout, "usage: %s [options]\n", argv0);
fprintf(stderr, "\n"); fprintf(stdout, "\n");
fprintf(stderr, "options:\n"); fprintf(stdout, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stdout, " -h, --help show this help message and exit\n");
fprintf(stderr, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
if (llama_mlock_supported()) if (llama_mlock_supported())
{ {
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
} }
if (llama_mmap_supported()) if (llama_mmap_supported())
{ {
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
} }
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
fprintf(stderr, " number of layers to store in VRAM\n"); fprintf(stdout, " number of layers to store in VRAM\n");
fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
#endif #endif
fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stdout, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -a ALIAS, --alias ALIAS\n"); fprintf(stdout, " -a ALIAS, --alias ALIAS\n");
fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n"); fprintf(stdout, " set an alias for the model, will be added as `model` field in completion response\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); fprintf(stdout, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); fprintf(stdout, " --port PORT port to listen (default (default: %d)\n", sparams.port);
fprintf(stderr, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); fprintf(stdout, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); fprintf(stdout, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
fprintf(stderr, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); fprintf(stdout, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
fprintf(stderr, "\n"); fprintf(stdout, "\n");
} }
static void server_params_parse(int argc, char **argv, server_params &sparams, static void server_params_parse(int argc, char **argv, server_params &sparams,
@ -724,9 +725,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
params.n_ctx = std::stoi(argv[i]); params.n_ctx = std::stoi(argv[i]);
} }
else if (arg == "-gqa" || arg == "--gqa")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_gqa = std::stoi(argv[i]);
}
else if (arg == "--rope-freq-base") else if (arg == "--rope-freq-base")
{ {
if (++i >= argc) { if (++i >= argc)
{
invalid_param = true; invalid_param = true;
break; break;
} }
@ -734,7 +745,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
} }
else if (arg == "--rope-freq-scale") else if (arg == "--rope-freq-scale")
{ {
if (++i >= argc) { if (++i >= argc)
{
invalid_param = true; invalid_param = true;
break; break;
} }

View file

@ -16,6 +16,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
static const float rms_norm_eps = 1e-6f;
struct random_normal_distribution { struct random_normal_distribution {
std::mt19937 gen; std::mt19937 gen;
std::normal_distribution<float> rd; std::normal_distribution<float> rd;
@ -439,7 +441,7 @@ struct ggml_tensor * forward(
// norm // norm
{ {
// cur shape [n_embd,N,1,1] // cur shape [n_embd,N,1,1]
cur = ggml_rms_norm(ctx0, inpL); cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
// cur = attention_norm*cur // cur = attention_norm*cur
cur = ggml_mul(ctx0, cur = ggml_mul(ctx0,
@ -562,7 +564,7 @@ struct ggml_tensor * forward(
// norm // norm
{ {
// cur shape [n_embd,N,1,1] // cur shape [n_embd,N,1,1]
cur = ggml_rms_norm(ctx0, inpFF); cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
// cur = ffn_norm*cur // cur = ffn_norm*cur
// cur shape [n_embd,N,1,1] // cur shape [n_embd,N,1,1]
@ -606,7 +608,7 @@ struct ggml_tensor * forward(
{ {
// inpL shape [n_embd,N,1,1] // inpL shape [n_embd,N,1,1]
inpL = ggml_rms_norm(ctx0, inpL); inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
// inpL = norm*inpL // inpL = norm*inpL
// inpL shape [n_embd,N,1,1] // inpL shape [n_embd,N,1,1]
@ -694,7 +696,7 @@ struct ggml_tensor * forward_batch(
// norm // norm
{ {
// cur shape [n_embd,N*n_batch,1,1] // cur shape [n_embd,N*n_batch,1,1]
cur = ggml_rms_norm(ctx0, inpL); cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch); assert_shape_2d(cur, n_embd, N*n_batch);
// cur = attention_norm*cur // cur = attention_norm*cur
@ -857,7 +859,7 @@ struct ggml_tensor * forward_batch(
// norm // norm
{ {
// cur shape [n_embd,N*n_batch,1,1] // cur shape [n_embd,N*n_batch,1,1]
cur = ggml_rms_norm(ctx0, inpFF); cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch); assert_shape_2d(cur, n_embd, N*n_batch);
// cur = ffn_norm*cur // cur = ffn_norm*cur
@ -910,7 +912,7 @@ struct ggml_tensor * forward_batch(
{ {
// inpL shape [n_embd,N*n_batch,1,1] // inpL shape [n_embd,N*n_batch,1,1]
inpL = ggml_rms_norm(ctx0, inpL); inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(inpL, n_embd, N*n_batch); assert_shape_2d(inpL, n_embd, N*n_batch);
// inpL = norm*inpL // inpL = norm*inpL
@ -979,7 +981,7 @@ struct ggml_tensor * forward_batch_wo_cache(
// norm // norm
{ {
// cur shape [n_embd,N*n_batch,1,1] // cur shape [n_embd,N*n_batch,1,1]
cur = ggml_rms_norm(ctx0, inpL); cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch); assert_shape_2d(cur, n_embd, N*n_batch);
// cur = attention_norm*cur // cur = attention_norm*cur
@ -1085,7 +1087,7 @@ struct ggml_tensor * forward_batch_wo_cache(
// norm // norm
{ {
// cur shape [n_embd,N*n_batch,1,1] // cur shape [n_embd,N*n_batch,1,1]
cur = ggml_rms_norm(ctx0, inpFF); cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch); assert_shape_2d(cur, n_embd, N*n_batch);
// cur = ffn_norm*cur // cur = ffn_norm*cur
@ -1138,7 +1140,7 @@ struct ggml_tensor * forward_batch_wo_cache(
{ {
// inpL shape [n_embd,N*n_batch,1,1] // inpL shape [n_embd,N*n_batch,1,1]
inpL = ggml_rms_norm(ctx0, inpL); inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(inpL, n_embd, N*n_batch); assert_shape_2d(inpL, n_embd, N*n_batch);
// inpL = norm*inpL // inpL = norm*inpL
@ -1203,7 +1205,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
// norm // norm
{ {
cur = ggml_rms_norm(ctx0, inpL); cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch); assert_shape_2d(cur, n_embd, N*n_batch);
// cur = attention_norm*cur // cur = attention_norm*cur
@ -1267,7 +1269,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
{ {
// norm // norm
{ {
cur = ggml_rms_norm(ctx0, inpFF); cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
assert_shape_2d(cur, n_embd, N*n_batch); assert_shape_2d(cur, n_embd, N*n_batch);
// cur = ffn_norm*cur // cur = ffn_norm*cur
@ -1311,7 +1313,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
// norm // norm
{ {
inpL = ggml_rms_norm(ctx0, inpL); inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
assert_shape_2d(inpL, n_embd, N*n_batch); assert_shape_2d(inpL, n_embd, N*n_batch);
// inpL = norm*inpL // inpL = norm*inpL
@ -1434,7 +1436,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
gf->perf_time_us = 0; gf->perf_time_us = 0;
const auto & hparams = model->hparams; const auto & hparams = model->hparams;
//const int n_ctx = hparams.n_ctx; const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab; const int n_vocab = hparams.n_vocab;
const int n_embd = hparams.n_embd; const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer; const int n_layer = hparams.n_layer;
@ -1603,7 +1605,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
struct my_llama_layer & layer = model->layers[il]; struct my_llama_layer & layer = model->layers[il];
// tensors with values necessary for backward pass are in persistent buf(-1) // tensors with values necessary for backward pass are in persistent buf(-1)
// other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed. // other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed.
use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t02, n_embd, N*n_batch); use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch);
use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch);
@ -1623,7 +1625,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch);
use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21)); assert_shape_2d(t22, n_embd, N*n_batch); use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch);
use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch);
use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch);
@ -1666,7 +1668,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
} }
clr_buf(0); clr_buf(0);
use_buf(0); use_buf(0);
struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur)); assert_shape_2d(t31, n_embd, N*n_batch); struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t31, n_embd, N*n_batch);
struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch);
struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch);
use_buf(-1); use_buf(-1);
@ -1863,10 +1865,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head);
t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd);
t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch);
t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch);
t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch);
t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch);
t04->grad = expand(gb, ggml_add_inplace(ctx0, t04->grad = expand(gb, ggml_add_inplace(ctx0,
ggml_add_inplace(ctx0, ggml_add_inplace(ctx0,

View file

@ -6,8 +6,9 @@
outputs = { self, nixpkgs, flake-utils }: outputs = { self, nixpkgs, flake-utils }:
flake-utils.lib.eachDefaultSystem (system: flake-utils.lib.eachDefaultSystem (system:
let let
inherit (pkgs.stdenv) isAarch32 isAarch64 isx86_32 isx86_64 isDarwin; inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
osSpecific = with pkgs; [ openmpi ] ++ buildInputs = with pkgs; [ openmpi ];
osSpecific = with pkgs; buildInputs ++
( (
if isAarch64 && isDarwin then if isAarch64 && isDarwin then
with pkgs.darwin.apple_sdk_11_0.frameworks; [ with pkgs.darwin.apple_sdk_11_0.frameworks; [
@ -22,49 +23,51 @@
CoreGraphics CoreGraphics
CoreVideo CoreVideo
] ]
else if isx86_32 || isx86_64 then
with pkgs; [ mkl ]
else else
with pkgs; [ openblas ] with pkgs; [ openblas ]
); );
pkgs = import nixpkgs { inherit system; }; pkgs = import nixpkgs { inherit system; };
nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
llama-python = llama-python =
pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]); pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
postPatch = ''
substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
'';
postInstall = ''
mv $out/bin/main $out/bin/llama
mv $out/bin/server $out/bin/llama-server
'';
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
in { in {
packages.default = pkgs.stdenv.mkDerivation { packages.default = pkgs.stdenv.mkDerivation {
name = "llama.cpp"; name = "llama.cpp";
src = ./.; src = ./.;
postPatch = '' postPatch = postPatch;
substituteInPlace ./ggml-metal.m \ nativeBuildInputs = nativeBuildInputs;
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
'';
nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
buildInputs = osSpecific; buildInputs = osSpecific;
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ] cmakeFlags = cmakeFlags
++ (if isAarch64 && isDarwin then [ ++ (if isAarch64 && isDarwin then [
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
"-DLLAMA_METAL=ON" "-DLLAMA_METAL=ON"
] else if isx86_32 || isx86_64 then [
"-DLLAMA_BLAS=ON"
"-DLLAMA_BLAS_VENDOR=Intel10_lp64"
] else [ ] else [
"-DLLAMA_BLAS=ON" "-DLLAMA_BLAS=ON"
"-DLLAMA_BLAS_VENDOR=OpenBLAS" "-DLLAMA_BLAS_VENDOR=OpenBLAS"
]); ]);
installPhase = '' postInstall = postInstall;
runHook preInstall meta.mainProgram = "llama";
};
install -D bin/* -t $out/bin packages.opencl = pkgs.stdenv.mkDerivation {
install -Dm644 lib*.so -t $out/lib name = "llama.cpp";
mv $out/bin/main $out/bin/llama src = ./.;
mv $out/bin/server $out/bin/llama-server postPatch = postPatch;
nativeBuildInputs = nativeBuildInputs;
echo "#!${llama-python}/bin/python" > $out/bin/convert.py buildInputs = with pkgs; buildInputs ++ [ clblast ];
cat ${./convert.py} >> $out/bin/convert.py cmakeFlags = cmakeFlags ++ [
chmod +x $out/bin/convert.py "-DLLAMA_CLBLAST=ON"
];
runHook postInstall postInstall = postInstall;
'';
meta.mainProgram = "llama"; meta.mainProgram = "llama";
}; };
apps.llama-server = { apps.llama-server = {
@ -81,7 +84,7 @@
}; };
apps.default = self.apps.${system}.llama; apps.default = self.apps.${system}.llama;
devShells.default = pkgs.mkShell { devShells.default = pkgs.mkShell {
packages = with pkgs; [ cmake llama-python ] ++ osSpecific; packages = nativeBuildInputs ++ osSpecific;
}; };
}); });
} }

View file

@ -220,7 +220,7 @@ typedef struct {
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
#define WARP_SIZE 32 #define WARP_SIZE 32
#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
#define CUDA_ADD_BLOCK_SIZE 256 #define CUDA_ADD_BLOCK_SIZE 256
#define CUDA_MUL_BLOCK_SIZE 256 #define CUDA_MUL_BLOCK_SIZE 256
@ -332,12 +332,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
} }
} }
static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) { static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
const int row = blockIdx.x*blockDim.y + threadIdx.y; const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x; const int tid = threadIdx.x;
const float eps = 1e-6f;
float tmp = 0.0f; // partial sum for thread in warp float tmp = 0.0f; // partial sum for thread in warp
for (int col = tid; col < ncols; col += WARP_SIZE) { for (int col = tid; col < ncols; col += WARP_SIZE) {
@ -935,12 +933,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
uint16_t aux[4]; uint16_t aux[4];
const uint8_t * sc = (const uint8_t *)aux; const uint8_t * sc = (const uint8_t *)aux;
#if K_QUANTS_PER_ITERATION == 2
uint32_t q32[4];
const uint8_t * q4 = (const uint8_t *)q32;
#else
uint16_t q16[4];
const uint8_t * q4 = (const uint8_t *)q16;
#endif
float tmp = 0; // partial sum for thread in warp float tmp = 0; // partial sum for thread in warp
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
const uint8_t * q1 = x[i].qs + q_offset;
const uint8_t * q2 = q1 + 64;
const float * y1 = yy + i*QK_K + y_offset; const float * y1 = yy + i*QK_K + y_offset;
const float * y2 = y1 + 128; const float * y2 = y1 + 128;
@ -953,14 +957,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
#if K_QUANTS_PER_ITERATION == 2
const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
const uint32_t * q2 = q1 + 16;
q32[0] = q1[0] & 0x0f0f0f0f;
q32[1] = q1[0] & 0xf0f0f0f0;
q32[2] = q2[0] & 0x0f0f0f0f;
q32[3] = q2[0] & 0xf0f0f0f0;
float4 s = {0.f, 0.f, 0.f, 0.f}; float4 s = {0.f, 0.f, 0.f, 0.f};
float smin = 0; float smin = 0;
for (int l = 0; l < n; ++l) { for (int l = 0; l < 4; ++l) {
s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4); s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4); s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
} }
tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
#else
const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
const uint16_t * q2 = q1 + 32;
q16[0] = q1[0] & 0x0f0f;
q16[1] = q1[0] & 0xf0f0;
q16[2] = q2[0] & 0x0f0f;
q16[3] = q2[0] & 0xf0f0;
float4 s = {0.f, 0.f, 0.f, 0.f};
float smin = 0;
for (int l = 0; l < 2; ++l) {
s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
}
tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
#endif
} }
#else #else
@ -1040,10 +1071,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
uint16_t aux[4]; uint16_t aux[4];
const uint8_t * sc = (const uint8_t *)aux; const uint8_t * sc = (const uint8_t *)aux;
uint16_t q16[8];
const uint8_t * q4 = (const uint8_t *)q16;
for (int i = ix; i < num_blocks_per_row; i += 2) { for (int i = ix; i < num_blocks_per_row; i += 2) {
const uint8_t * ql1 = x[i].qs + q_offset; const uint8_t * ql1 = x[i].qs + q_offset;
const uint8_t * ql2 = ql1 + 64;
const uint8_t * qh = x[i].qh + l0; const uint8_t * qh = x[i].qh + l0;
const float * y1 = yy + i*QK_K + y_offset; const float * y1 = yy + i*QK_K + y_offset;
const float * y2 = y1 + 128; const float * y2 = y1 + 128;
@ -1059,15 +1092,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
float4 sum = {0.f, 0.f, 0.f, 0.f}; float4 sum = {0.f, 0.f, 0.f, 0.f};
float smin = 0; float smin = 0;
const uint16_t * q1 = (const uint16_t *)ql1;
const uint16_t * q2 = q1 + 32;
q16[0] = q1[0] & 0x0f0f;
q16[1] = q1[8] & 0x0f0f;
q16[2] = (q1[0] >> 4) & 0x0f0f;
q16[3] = (q1[8] >> 4) & 0x0f0f;
q16[4] = q2[0] & 0x0f0f;
q16[5] = q2[8] & 0x0f0f;
q16[6] = (q2[0] >> 4) & 0x0f0f;
q16[7] = (q2[8] >> 4) & 0x0f0f;
for (int l = 0; l < n; ++l) { for (int l = 0; l < n; ++l) {
sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+ y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0)); + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+ y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0)); + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+ y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0)); + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+ y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0)); + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
} }
@ -1521,7 +1564,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q4_K * bq4_K = (const block_q4_K *) vbq; const block_q4_K * bq4_K = (const block_q4_K *) vbq;
const int bq8_offset = QR4_K * (iqs / QI8_1); // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
float sumf_d = 0.0f; float sumf_d = 0.0f;
float sumf_m = 0.0f; float sumf_m = 0.0f;
@ -1529,22 +1573,44 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
const float d = bq4_K->d; const float d = bq4_K->d;
const float dmin = bq4_K->dmin; const float dmin = bq4_K->dmin;
const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]); // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
// iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
// iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
// iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
const int v1 = q4[0];
const int v2 = q4[4];
const uint16_t * scales = (const uint16_t *)bq4_K->scales;
uint16_t aux[2];
const int j = bq8_offset/2;
if (j < 2) {
aux[0] = scales[j+0] & 0x3f3f;
aux[1] = scales[j+2] & 0x3f3f;
} else {
aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
}
const uint8_t * sc = (const uint8_t *)aux;
const uint8_t * m = sc + 2;
for (int i = 0; i < QR4_K; ++i) { for (int i = 0; i < QR4_K; ++i) {
const int isc = bq8_offset + i;
uint8_t sc, m;
get_scale_min_k4(isc, bq4_K->scales, sc, m);
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
const float d8i = bq8i->d; const float d8i = bq8i->d;
const int * q8 = (const int *)bq8i->qs + (iqs%4);
const int ui1 = q8[0];
const int ui2 = q8[4];
const int vi = (v >> (4*i)) & 0x0F0F0F0F; const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
sumf_d += d8i * (dot1 * sc[i]);
sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
} }
return d*sumf_d - dmin*sumf_m; return d*sumf_d - dmin*sumf_m;
@ -1559,7 +1625,9 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
const block_q5_K * bq5_K = (const block_q5_K *) vbq; const block_q5_K * bq5_K = (const block_q5_K *) vbq;
const int bq8_offset = QR5_K * (iqs / QI8_1); const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
float sumf_d = 0.0f; float sumf_d = 0.0f;
float sumf_m = 0.0f; float sumf_m = 0.0f;
@ -1567,28 +1635,48 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
const float d = bq5_K->d; const float d = bq5_K->d;
const float dmin = bq5_K->dmin; const float dmin = bq5_K->dmin;
const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]); const int vl1 = ql[0];
const int vl2 = ql[4];
const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset; const int vh1 = qh[0] >> bq8_offset;
const int vh2 = qh[4] >> bq8_offset;
const uint16_t * scales = (const uint16_t *)bq5_K->scales;
uint16_t aux[2];
const int j = bq8_offset/2;
if (j < 2) {
aux[0] = scales[j+0] & 0x3f3f;
aux[1] = scales[j+2] & 0x3f3f;
} else {
aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
}
const uint8_t * sc = (const uint8_t *)aux;
const uint8_t * m = sc + 2;
for (int i = 0; i < QR5_K; ++i) { for (int i = 0; i < QR5_K; ++i) {
const int isc = bq8_offset + i;
uint8_t sc, m;
get_scale_min_k4(isc, bq5_K->scales, sc, m);
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
const float d8i = bq8i->d; const float d8i = bq8i->d;
const int * q8 = (const int *)bq8i->qs + (iqs%4);
const int ui1 = q8[0];
const int ui2 = q8[4];
const int vil = (vl >> (4*i)) & 0x0F0F0F0F; const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
const int vih = ((vh >> i) << 4) & 0x10101010; const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
const int vi = vil | vih; const int vi1 = vil1 | vih1;
const int vi2 = vil2 | vih2;
const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
sumf_d += d8i * (dot1 * sc[i]);
sumf_m += d8i * (dot2 * m[i]);
sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
} }
return d*sumf_d - dmin*sumf_m; return d*sumf_d - dmin*sumf_m;
@ -1745,11 +1833,15 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
} }
} }
static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) { static __global__ void mul_mat_p021_f16_f32(
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
const half * x = (const half *) vx; const half * x = (const half *) vx;
const int row_x = blockDim.y*blockIdx.y + threadIdx.y; const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int channel = blockDim.z*blockIdx.z + threadIdx.z; const int channel = blockDim.z*blockIdx.z + threadIdx.z;
const int channel_x = channel / (nchannels_y / nchannels_x);
const int nrows_y = ncols_x; const int nrows_y = ncols_x;
const int nrows_dst = nrows_x; const int nrows_dst = nrows_x;
@ -1765,7 +1857,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
} }
// x is transposed and permuted // x is transposed and permuted
const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x; const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
const float xi = __half2float(x[ix]); const float xi = __half2float(x[ix]);
const int row_y = col_x; const int row_y = col_x;
@ -1793,12 +1885,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
const int row_stride_x, const int channel_stride_x) { const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
const half * x = (const half *) vx; const half * x = (const half *) vx;
const int row_x = blockDim.y*blockIdx.y + threadIdx.y; const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int channel = blockDim.z*blockIdx.z + threadIdx.z; const int channel = blockDim.z*blockIdx.z + threadIdx.z;
const int channel_x = channel / channel_x_divisor;
const int nrows_y = ncols_x; const int nrows_y = ncols_x;
const int nrows_dst = nrows_x; const int nrows_dst = nrows_x;
@ -1815,7 +1908,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
break; break;
} }
const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x; const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
const float xi = __half2float(x[ix]); const float xi = __half2float(x[ix]);
const int row_y = col_x; const int row_y = col_x;
@ -2027,10 +2120,10 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols); norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
} }
static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0); GGML_ASSERT(ncols % WARP_SIZE == 0);
const dim3 block_dims(WARP_SIZE, 1, 1); const dim3 block_dims(WARP_SIZE, 1, 1);
rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols); rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
} }
static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) { static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
@ -2259,7 +2352,10 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1> // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
// kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
// is better amortized.
mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
} }
@ -2268,7 +2364,10 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(1, block_num_y, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1> // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
// kernel call instead of 2. This results in a better perfmance because the cost of computing the k-quant scales
// is better amortized.
mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
} }
@ -2324,20 +2423,23 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
} }
} }
static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) { static void ggml_mul_mat_p021_f16_f32_cuda(
const dim3 block_nums(1, nrows_x, nchannels_x); const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
const dim3 block_nums(1, nrows_x, nchannels_y);
const dim3 block_dims(WARP_SIZE, 1, 1); const dim3 block_dims(WARP_SIZE, 1, 1);
mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x); mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
} }
static void ggml_mul_mat_vec_nc_f16_f32_cuda( static void ggml_mul_mat_vec_nc_f16_f32_cuda(
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x, const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
const int nchannels_x, const int channel_stride_x, cudaStream_t stream) { const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
const dim3 block_nums(1, nrows_x, nchannels_x); const dim3 block_nums(1, nrows_x, nchannels_y);
const dim3 block_dims(WARP_SIZE, 1, 1); const dim3 block_dims(WARP_SIZE, 1, 1);
mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>> mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x); (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
} }
static void ggml_cpy_f32_f32_cuda( static void ggml_cpy_f32_f32_cuda(
@ -2423,10 +2525,26 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock); scoped_spin_lock lock(g_cuda_pool_lock);
int id; int id;
CUDA_CHECK(cudaGetDevice(&id)); CUDA_CHECK(cudaGetDevice(&id));
#ifdef DEBUG_CUDA_MALLOC
int nnz = 0;
size_t max_size = 0, tot_size = 0;
#endif
size_t best_diff = 1ull << 36;
int ibest = -1;
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
cuda_buffer& b = g_cuda_buffer_pool[id][i]; cuda_buffer& b = g_cuda_buffer_pool[id][i];
if (b.size >= size && b.ptr != nullptr) { if (b.ptr != nullptr) {
#ifdef DEBUG_CUDA_MALLOC
++nnz;
tot_size += b.size;
if (b.size > max_size) max_size = b.size;
#endif
if (b.size >= size) {
size_t diff = b.size - size;
if (diff < best_diff) {
best_diff = diff;
ibest = i;
if (!best_diff) {
void * ptr = b.ptr; void * ptr = b.ptr;
*actual_size = b.size; *actual_size = b.size;
b.ptr = nullptr; b.ptr = nullptr;
@ -2434,9 +2552,26 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
return ptr; return ptr;
} }
} }
}
}
}
if (ibest >= 0) {
cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
void * ptr = b.ptr;
*actual_size = b.size;
b.ptr = nullptr;
b.size = 0;
return ptr;
}
#ifdef DEBUG_CUDA_MALLOC
fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
(uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
#endif
void * ptr; void * ptr;
CUDA_CHECK(cudaMalloc((void **) &ptr, size)); size_t look_ahead_size = (size_t) (1.05 * size);
*actual_size = size; look_ahead_size = 256 * ((look_ahead_size + 255)/256);
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
*actual_size = look_ahead_size;
return ptr; return ptr;
} }
@ -2464,7 +2599,9 @@ static size_t g_scratch_offset = 0;
static int g_device_count = -1; static int g_device_count = -1;
static int g_main_device = 0; static int g_main_device = 0;
#ifndef GGML_CUDA_FORCE_DMMV
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
#endif
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@ -2487,7 +2624,9 @@ void ggml_init_cublas() {
g_tensor_split[id] = total_vram; g_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem; total_vram += prop.totalGlobalMem;
#ifndef GGML_CUDA_FORCE_DMMV
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor; g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
#endif
} }
for (int id = 0; id < g_device_count; ++id) { for (int id = 0; id < g_device_count; ++id) {
g_tensor_split[id] /= total_vram; g_tensor_split[id] /= total_vram;
@ -2512,6 +2651,9 @@ void ggml_init_cublas() {
} }
void ggml_cuda_set_tensor_split(const float * tensor_split) { void ggml_cuda_set_tensor_split(const float * tensor_split) {
if (tensor_split == nullptr) {
return;
}
bool all_zero = true; bool all_zero = true;
for (int i = 0; i < g_device_count; ++i) { for (int i = 0; i < g_device_count; ++i) {
if (tensor_split[i] != 0.0f) { if (tensor_split[i] != 0.0f) {
@ -2652,6 +2794,7 @@ inline void ggml_cuda_op_mul(
(void) dst; (void) dst;
(void) src0_ddq_i; (void) src0_ddq_i;
(void) i02; (void) i02;
(void) i1;
} }
inline void ggml_cuda_op_gelu( inline void ggml_cuda_op_gelu(
@ -2731,8 +2874,11 @@ inline void ggml_cuda_op_rms_norm(
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t i01_diff = i01_high - i01_low; const int64_t i01_diff = i01_high - i01_low;
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
// compute // compute
rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main); rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
(void) src1; (void) src1;
(void) dst; (void) dst;
@ -2779,8 +2925,8 @@ inline void ggml_cuda_op_mul_mat_vec(
#endif #endif
if (use_mul_mat_vec_q) { if (use_mul_mat_vec_q) {
int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1; const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
padded_row_size -= padded_row_size % MATRIX_ROW_PADDING; ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
size_t as; size_t as;
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as); void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main); quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
@ -2947,13 +3093,18 @@ inline void ggml_cuda_op_rope(
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t i01_diff = i01_high - i01_low; const int64_t i01_diff = i01_high - i01_low;
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) src1->data)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) src1->data)[2]; const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx = ((int32_t *) src1->data)[3]; const int n_ctx = ((int32_t *) dst->op_params)[3];
// RoPE alteration for extended context
const float theta_scale = powf(10000.0, -2.0f/n_dims); float freq_base, freq_scale;
const float p = ((mode & 1) == 0 ? n_past + i02 : i02); memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
bool is_glm = mode & 4; bool is_glm = mode & 4;
@ -2966,6 +3117,7 @@ inline void ggml_cuda_op_rope(
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main); rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
} }
(void) src1;
(void) dst; (void) dst;
(void) src0_ddq_i; (void) src0_ddq_i;
(void) src1_ddf_i; (void) src1_ddf_i;
@ -2984,11 +3136,12 @@ inline void ggml_cuda_op_diag_mask_inf(
const int64_t ne01 = src0->ne[1]; const int64_t ne01 = src0->ne[1];
const int64_t i01_diff = i01_high - i01_low; const int64_t i01_diff = i01_high - i01_low;
const int n_past = ((int32_t *) src1->data)[0]; const int n_past = ((int32_t *) dst->op_params)[0];
// compute // compute
diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main); diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
(void) src1;
(void) dst; (void) dst;
(void) src0_ddq_i; (void) src0_ddq_i;
(void) src1_ddf_i; (void) src1_ddf_i;
@ -3056,6 +3209,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
const int64_t ne11 = use_src1 ? src1->ne[1] : 1; const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
const int64_t ne12 = use_src1 ? src1->ne[2] : 1; const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
const int64_t ne13 = use_src1 ? src1->ne[3] : 1; const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
GGML_ASSERT(ne03 == ne13);
const int64_t ne0 = dst->ne[0]; const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1]; const int64_t ne1 = dst->ne[1];
@ -3067,12 +3223,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
// strides for iteration over dims 3 and 2 // strides for iteration over dims 3 and 2
const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03; const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1; const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
const int64_t src0_stride = ne00 * ne01 * stride_mod; const int64_t src0_stride = ne00 * ne01 * stride_mod;
const int64_t src1_stride = ne10 * ne11 * stride_mod; const int64_t src1_stride = ne10 * ne11 * stride_mod;
const int64_t dst_stride = ne0 * ne1 * stride_mod; const int64_t dst_stride = ne0 * ne1 * stride_mod;
const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
const int64_t i03_max = flatten_rows ? 1 : ne03;
const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
GGML_ASSERT(!(flatten_rows && ne02 < ne12));
const size_t src0_ts = ggml_type_size(src0->type); const size_t src0_ts = ggml_type_size(src0->type);
const size_t src0_bs = ggml_blck_size(src0->type); const size_t src0_bs = ggml_blck_size(src0->type);
@ -3089,6 +3252,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE); dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
GGML_ASSERT(!(split && ne02 < ne12));
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
@ -3125,7 +3289,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1]; row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
} else { } else {
row_low = 0; row_low = 0;
row_high = nrows0; row_high = nrows0*i02_divisor;
} }
if (row_low == row_high) { if (row_low == row_high) {
continue; continue;
@ -3173,16 +3337,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]); dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
} }
const int64_t i03_max = flatten_rows ? 1 : ne03;
const int64_t i02_max = flatten_rows ? 1 : ne02;
const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
for (int64_t i03 = 0; i03 < i03_max; i03++) { for (int64_t i03 = 0; i03 < i03_max; i03++) {
const int64_t i13 = i03 % ne13; const int64_t i13 = i03 % ne13;
for (int64_t i02 = 0; i02 < i02_max; i02++) { for (int64_t i02 = 0; i02 < i02_max; i02++) {
const int64_t i12 = i02 % ne12; const int64_t i12 = i02 % ne12;
const int64_t i0 = i03*ne02 + i02; const int64_t i0 = i03*i02_max + i02;
// i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
const int64_t i0_offset_low = row_low/rows_per_iter; const int64_t i0_offset_low = row_low/rows_per_iter;
@ -3216,8 +3376,8 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
const int64_t i11 = i13*ne12 + i12; const int64_t i11 = i13*ne12 + i12;
// for split tensors the data begins at i0 == i0_offset_low // for split tensors the data begins at i0 == i0_offset_low
char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs; char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride; float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
float * src1_ddf_i = src1_ddf[id] + i11*src1_stride; float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride; float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
@ -3258,11 +3418,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
} }
} }
if (!src0_on_device || !src0_is_contiguous) { if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
if (src0_is_f32) { if (src0_is_f32) {
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main)); CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
} else { } else {
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main)); CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
} }
} }
@ -3416,6 +3576,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
const int64_t ne01 = src0->ne[1]; const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2]; const int64_t ne02 = src0->ne[2];
const int64_t ne12 = src1->ne[2];
CUDA_CHECK(cudaSetDevice(g_main_device)); CUDA_CHECK(cudaSetDevice(g_main_device));
cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
@ -3428,7 +3590,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main); ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
} }
void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@ -3442,6 +3604,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
const int64_t ne01 = src0->ne[1]; const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2]; const int64_t ne02 = src0->ne[2];
const int64_t ne12 = src1->ne[2];
const int64_t nb01 = src0->nb[1]; const int64_t nb01 = src0->nb[1];
const int64_t nb02 = src0->nb[2]; const int64_t nb02 = src0->nb[2];
@ -3460,7 +3624,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
const int row_stride_x = nb01 / sizeof(half); const int row_stride_x = nb01 / sizeof(half);
const int channel_stride_x = nb02 / sizeof(half); const int channel_stride_x = nb02 / sizeof(half);
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main); ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
} }
void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -3601,7 +3765,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
size_t size = ggml_nbytes_split(tensor, nrows_split); size_t size = ggml_nbytes_split(tensor, nrows_split);
const size_t original_size = size; const size_t original_size = size;
// pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
if (ne0 % MATRIX_ROW_PADDING != 0) { if (ne0 % MATRIX_ROW_PADDING != 0) {
size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
* ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
@ -3617,7 +3781,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
} }
CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
extra->data_device[id] = buf; extra->data_device[id] = buf;
@ -3697,7 +3861,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t offset = 0; size_t offset = 0;
if (tensor->op == GGML_OP_VIEW) { if (tensor->op == GGML_OP_VIEW) {
memcpy(&offset, tensor->src[2]->data, sizeof(size_t)); memcpy(&offset, tensor->op_params, sizeof(size_t));
} }
extra = ggml_cuda_alloc_temp_tensor_extra(); extra = ggml_cuda_alloc_temp_tensor_extra();
extra->data_device[g_main_device] = src0_ddc + offset; extra->data_device[g_main_device] = src0_ddc + offset;
@ -3799,18 +3963,23 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
} }
func = ggml_cuda_mul; func = ggml_cuda_mul;
break; break;
case GGML_OP_GELU: case GGML_OP_UNARY:
switch (ggml_get_unary_op(tensor)) {
case GGML_UNARY_OP_GELU:
if (!any_on_device) { if (!any_on_device) {
return false; return false;
} }
func = ggml_cuda_gelu; func = ggml_cuda_gelu;
break; break;
case GGML_OP_SILU: case GGML_UNARY_OP_SILU:
if (!any_on_device) { if (!any_on_device) {
return false; return false;
} }
func = ggml_cuda_silu; func = ggml_cuda_silu;
break; break;
default:
return false;
} break;
case GGML_OP_NORM: case GGML_OP_NORM:
if (!any_on_device) { if (!any_on_device) {
return false; return false;

View file

@ -42,6 +42,7 @@ struct ggml_metal_context {
id<MTLComputePipelineState> pipeline_##name id<MTLComputePipelineState> pipeline_##name
GGML_METAL_DECL_KERNEL(add); GGML_METAL_DECL_KERNEL(add);
GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast
GGML_METAL_DECL_KERNEL(mul); GGML_METAL_DECL_KERNEL(mul);
GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
GGML_METAL_DECL_KERNEL(scale); GGML_METAL_DECL_KERNEL(scale);
@ -157,6 +158,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name);
GGML_METAL_ADD_KERNEL(add); GGML_METAL_ADD_KERNEL(add);
GGML_METAL_ADD_KERNEL(add_row);
GGML_METAL_ADD_KERNEL(mul); GGML_METAL_ADD_KERNEL(mul);
GGML_METAL_ADD_KERNEL(mul_row); GGML_METAL_ADD_KERNEL(mul_row);
GGML_METAL_ADD_KERNEL(scale); GGML_METAL_ADD_KERNEL(scale);
@ -464,10 +466,16 @@ void ggml_metal_graph_compute(
encoder = [command_buffer computeCommandEncoder]; encoder = [command_buffer computeCommandEncoder];
} }
if (ggml_nelements(src1) == ne10) {
// src1 is a row
[encoder setComputePipelineState:ctx->pipeline_add_row];
} else {
[encoder setComputePipelineState:ctx->pipeline_add]; [encoder setComputePipelineState:ctx->pipeline_add];
}
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
const int64_t n = ggml_nelements(dst); const int64_t n = ggml_nelements(dst);
@ -511,7 +519,9 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_OP_SILU: case GGML_OP_UNARY:
switch (ggml_get_unary_op(gf->nodes[i])) {
case GGML_UNARY_OP_SILU:
{ {
if (encoder == nil) { if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder]; encoder = [command_buffer computeCommandEncoder];
@ -525,7 +535,7 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_OP_RELU: case GGML_UNARY_OP_RELU:
{ {
if (encoder == nil) { if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder]; encoder = [command_buffer computeCommandEncoder];
@ -539,7 +549,7 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_OP_GELU: case GGML_UNARY_OP_GELU:
{ {
if (encoder == nil) { if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder]; encoder = [command_buffer computeCommandEncoder];
@ -553,6 +563,12 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
default:
{
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
}
} break;
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
{ {
if (encoder == nil) { if (encoder == nil) {
@ -577,7 +593,7 @@ void ggml_metal_graph_compute(
encoder = [command_buffer computeCommandEncoder]; encoder = [command_buffer computeCommandEncoder];
} }
const int n_past = ((int32_t *)(src1->data))[0]; const int n_past = ((int32_t *)(dst->op_params))[0];
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@ -676,8 +692,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1); GGML_ASSERT(ne12 == 1);
nth0 = 4; nth0 = 2;
nth1 = 16; nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
} break; } break;
case GGML_TYPE_Q3_K: case GGML_TYPE_Q3_K:
@ -685,8 +701,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1); GGML_ASSERT(ne12 == 1);
nth0 = 4; nth0 = 2;
nth1 = 16; nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
} break; } break;
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
@ -740,19 +756,21 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
src0t == GGML_TYPE_Q4_K) { src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else
[encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif
}
else if (src0t == GGML_TYPE_Q5_K) { else if (src0t == GGML_TYPE_Q5_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q6_K) { else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q2_K ||
src0t == GGML_TYPE_Q3_K) {
[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else { } else {
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@ -794,7 +812,8 @@ void ggml_metal_graph_compute(
encoder = [command_buffer computeCommandEncoder]; encoder = [command_buffer computeCommandEncoder];
} }
const float eps = 1e-6f; float eps;
memcpy(&eps, dst->op_params, sizeof(float));
const int nth = 512; const int nth = 512;
@ -840,9 +859,10 @@ void ggml_metal_graph_compute(
GGML_ASSERT((src0t == GGML_TYPE_F32)); GGML_ASSERT((src0t == GGML_TYPE_F32));
const int n_past = ((int32_t *) src1->data)[0]; UNUSED(n_past); const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
const int n_head = ((int32_t *) src1->data)[1]; const int n_head = ((int32_t *) dst->op_params)[1];
const float max_bias = ((float *) src1->data)[2]; float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
if (__builtin_popcount(n_head) != 1) { if (__builtin_popcount(n_head) != 1) {
GGML_ASSERT(false && "only power-of-two n_head implemented"); GGML_ASSERT(false && "only power-of-two n_head implemented");
@ -880,15 +900,14 @@ void ggml_metal_graph_compute(
encoder = [command_buffer computeCommandEncoder]; encoder = [command_buffer computeCommandEncoder];
} }
const int n_dims = ((int32_t *) src1->data)[1]; const int n_past = ((int32_t *) dst->op_params)[0];
const int mode = ((int32_t *) src1->data)[2]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
const int n_past = ((int32_t *)(src1->data))[0];
float freq_base; float freq_base;
float freq_scale; float freq_scale;
memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
[encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setComputePipelineState:ctx->pipeline_rope];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@ -917,7 +936,9 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break; } break;
case GGML_OP_DUP:
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_CONT:
{ {
if (encoder == nil) { if (encoder == nil) {
encoder = [command_buffer computeCommandEncoder]; encoder = [command_buffer computeCommandEncoder];
@ -967,10 +988,12 @@ void ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
default: default:
{
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false); GGML_ASSERT(false);
} }
} }
}
if (encoder != nil) { if (encoder != nil) {
[encoder endEncoding]; [encoder endEncoding];

View file

@ -67,6 +67,17 @@ kernel void kernel_add(
dst[tpig] = src0[tpig] + src1[tpig]; dst[tpig] = src0[tpig] + src1[tpig];
} }
// assumption: src1 is a row
// broadcast src1 into src0
kernel void kernel_add_row(
device const float * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] + src1[tpig % ne00];
}
kernel void kernel_mul( kernel void kernel_mul(
device const float * src0, device const float * src0,
device const float * src1, device const float * src1,
@ -351,7 +362,7 @@ kernel void kernel_rms_norm(
threadgroup_barrier(mem_flags::mem_threadgroup); threadgroup_barrier(mem_flags::mem_threadgroup);
// broadcast, simd group number is ntg / 32 // broadcast, simd group number is ntg / 32
for (int i = ntg / 32 / 2; i > 0; i /= 2) { for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
if (tpitg < i) { if (tpitg < i) {
sum[tpitg] += sum[tpitg + i]; sum[tpitg] += sum[tpitg + i];
} }
@ -1209,111 +1220,137 @@ kernel void kernel_mul_mat_q2_K_f32(
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne10, constant int64_t & ne10,
constant int64_t & ne0, constant int64_t & ne0,
threadgroup float * sum [[threadgroup(0)]], constant int64_t & ne01[[buffer(4)]],
uint2 tgpig[[threadgroup_position_in_grid]], uint2 tgpig[[threadgroup_position_in_grid]],
uint2 tpitg[[thread_position_in_threadgroup]], uint tiisg[[thread_index_in_simdgroup]],
uint2 tptg[[threads_per_threadgroup]]) { uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int nb = ne00/QK_K; const int nb = ne00/QK_K;
const int r0 = tgpig.x;
const int r1 = tgpig.y;
const int64_t r0 = tgpig.x; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
const int64_t r1 = tgpig.y; const int ib_row = first_row * nb;
device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row;
device const float * y = (device const float *) src1 + r1*ne10;
float yl[32];
float sumf[N_DST]={0.f}, all_sum;
device const block_q2_K * x = (device const block_q2_K *) src0 + r0*nb; const int step = sizeof(block_q2_K) * nb;
device const float * yy = (device const float *) src1 + r1*ne10;
const int nth = tptg.x*tptg.y;
const int ith = tptg.y*tpitg.x + tpitg.y;
float sumf = 0;
#if QK_K == 256 #if QK_K == 256
const int tid = tpitg.y; // 0...16 const int ix = tiisg/8; // 0...3
const int il = tid/4; // 0...3 const int it = tiisg%8; // 0...7
const int ir = tid%4; // 0...3 const int im = it/4; // 0 or 1
const int ip = il/2; // 0 or 1 const int ir = it%4; // 0...3
const int shift1 = 4*(il%2);// 0 or 4 const int is = (8*ir)/16;// 0 or 1
const int shift2 = shift1+2;// 2 or 6
const int n = 8;
const int is = 4*il + (n*ir)/16;
const int y_offset = 64*il + n*ir; device const float * y4 = y + ix * QK_K + 128 * im + 8 * ir;
const int q_offset = 32*ip + n*ir;
for (int i = tpitg.x; i < nb; i += tptg.x) { for (int ib = ix; ib < nb; ib += 4) {
device const uint8_t * q = x[i].qs + q_offset; float4 sumy = {0.f, 0.f, 0.f, 0.f};
device const uint8_t * scales = x[i].scales + is; for (int i = 0; i < 8; ++i) {
yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
uint8_t d1 = scales[0] & 0xF; yl[i+ 8] = y4[i+32]; sumy[1] += yl[i+ 8];
uint8_t d2 = scales[2] & 0xF; yl[i+16] = y4[i+64]; sumy[2] += yl[i+16];
uint8_t m1 = scales[0] >> 4; yl[i+24] = y4[i+96]; sumy[3] += yl[i+24];
uint8_t m2 = scales[2] >> 4;
device const float * y = yy + i*QK_K + y_offset;
float2 s = {0.f, 0.f};
float smin = 0;
for (int l = 0; l < n; ++l) {
s[0] += y[l+ 0] * ((q[l] >> shift1) & 3);
s[1] += y[l+32] * ((q[l] >> shift2) & 3);
smin += y[l+ 0] * m1 + y[l+32] * m2;
} }
const float dall = (float)x[i].d; device const uint8_t * sc = (device const uint8_t *)x[ib].scales + 8*im + is;
const float dmin = (float)x[i].dmin; device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir;
device const half * dh = &x[ib].d;
sumf += dall * (s[0] * d1 + s[1] * d2) - dmin * smin; for (int row = 0; row < N_DST; row++) {
float4 acc1 = {0.f, 0.f, 0.f, 0.f};
float4 acc2 = {0.f, 0.f, 0.f, 0.f};
for (int i = 0; i < 8; i += 2) {
acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
}
float dall = dh[0];
float dmin = dh[1] * 1.f/16.f;
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
(acc1[1] + 1.f/256.f * acc2[1]) * (sc[2] & 0xF) * 1.f/ 4.f +
(acc1[2] + 1.f/256.f * acc2[2]) * (sc[4] & 0xF) * 1.f/16.f +
(acc1[3] + 1.f/256.f * acc2[3]) * (sc[6] & 0xF) * 1.f/64.f) -
dmin * (sumy[0] * (sc[0] & 0xF0) + sumy[1] * (sc[2] & 0xF0) + sumy[2] * (sc[4] & 0xF0) + sumy[3] * (sc[6] & 0xF0));
qs += step/2;
sc += step;
dh += step/2;
}
y4 += 4 * QK_K;
} }
#else #else
const int il = 4 * tpitg.x; const int ix = tiisg/2; // 0...15
const int it = tiisg%2; // 0...1
uint32_t aux[2]; device const float * y4 = y + ix * QK_K + 8 * it;
thread const uint8_t * d = (thread const uint8_t *)aux;
thread const uint8_t * m = (thread const uint8_t *)aux + 4;
for (int i = tpitg.y; i < nb; i += tptg.y) { for (int ib = ix; ib < nb; ib += 16) {
device const uint8_t * q = x[i].qs + il; float4 sumy = {0.f, 0.f, 0.f, 0.f};
device const float * y = yy + i*QK_K + il; for (int i = 0; i < 8; ++i) {
yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
const float dall = (float)x[i].d; yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8];
const float dmin = (float)x[i].dmin; yl[i+16] = y4[i+32]; sumy[2] += yl[i+16];
yl[i+24] = y4[i+48]; sumy[3] += yl[i+24];
device const uint32_t * a = (device const uint32_t *)x[i].scales;
aux[0] = a[0] & 0x0f0f0f0f;
aux[1] = (a[0] >> 4) & 0x0f0f0f0f;
for (int l = 0; l < 4; ++l) {
sumf += y[l+ 0] * (dall * d[0] * ((q[l] >> 0) & 3) - dmin * m[0])
+ y[l+16] * (dall * d[1] * ((q[l] >> 2) & 3) - dmin * m[1])
+ y[l+32] * (dall * d[2] * ((q[l] >> 4) & 3) - dmin * m[2])
+ y[l+48] * (dall * d[3] * ((q[l] >> 6) & 3) - dmin * m[3]);
} }
device const uint8_t * sc = (device const uint8_t *)x[ib].scales;
device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
device const half * dh = &x[ib].d;
for (int row = 0; row < N_DST; row++) {
float4 acc1 = {0.f, 0.f, 0.f, 0.f};
float4 acc2 = {0.f, 0.f, 0.f, 0.f};
for (int i = 0; i < 8; i += 2) {
acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
}
float dall = dh[0];
float dmin = dh[1];
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
(acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f +
(acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f +
(acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) -
dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4));
qs += step/2;
sc += step;
dh += step/2;
}
y4 += 16 * QK_K;
} }
#endif #endif
sum[ith] = sumf; for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
// if (tiisg == 0) {
// Accumulate the sum from all threads in the threadgroup dst[r1*ne0 + first_row + row] = all_sum;
//
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%4 == 0) {
for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
} }
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%16 == 0) {
for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) {
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[r1*ne0 + r0] = sum[0];
} }
} }
#if QK_K == 256
kernel void kernel_mul_mat_q3_K_f32( kernel void kernel_mul_mat_q3_K_f32(
device const void * src0, device const void * src0,
device const float * src1, device const float * src1,
@ -1322,40 +1359,41 @@ kernel void kernel_mul_mat_q3_K_f32(
constant int64_t & ne10, constant int64_t & ne10,
constant int64_t & ne0, constant int64_t & ne0,
constant int64_t & ne1, constant int64_t & ne1,
threadgroup float * sum [[threadgroup(0)]],
uint2 tgpig[[threadgroup_position_in_grid]], uint2 tgpig[[threadgroup_position_in_grid]],
uint2 tpitg[[thread_position_in_threadgroup]], uint tiisg[[thread_index_in_simdgroup]],
uint2 tptg[[threads_per_threadgroup]]) { uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int nb = ne00/QK_K; const int nb = ne00/QK_K;
const int64_t r0 = tgpig.x; const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y; const int64_t r1 = tgpig.y;
device const block_q3_K * x = (device const block_q3_K *) src0 + r0*nb; const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb;
device const float * yy = (device const float *) src1 + r1*ne10; device const float * yy = (device const float *) src1 + r1*ne10;
const int nth = tptg.x*tptg.y; float yl[16];
const int ith = tptg.y*tpitg.x + tpitg.y;
#if QK_K == 256
const uint8_t m3 = 3;
const int8_t m4 = 4;
const uint16_t kmask1 = 0x0303; const uint16_t kmask1 = 0x0303;
const uint16_t kmask2 = 0x0f0f; const uint16_t kmask2 = 0x0f0f;
const int tid = tpitg.y; // expecting 16 const int tid = tiisg/2;
const int ix = tiisg%2;
const int ip = tid/8; // 0 or 1 const int ip = tid/8; // 0 or 1
const int il = tid/2 - 4*ip; // 0...3 const int il = tid/2 - 4*ip; // 0...3
const int ir = tid%2; const int ir = tid%2;
const int n = 8; const int n = 8;
const int l0 = n*ir; const int l0 = n*ir;
const uint8_t m = 1 << (4*ip + il); const uint16_t m1 = 1 << (4*ip + il);
const uint16_t m2 = m1 << 8;
const int shift = 2*il; const int shift = 2*il;
const uint16_t qm1 = 0x0003 << shift;
const uint16_t qm2 = 0x0300 << shift;
const int32_t v1 = 4 << shift;
const int32_t v2 = 1024 << shift;
const uint16_t s_shift1 = 4*ip; const uint16_t s_shift1 = 4*ip;
const uint16_t s_shift2 = s_shift1 + 2*(il/2); const uint16_t s_shift2 = s_shift1 + 2*(il/2);
@ -1364,94 +1402,133 @@ kernel void kernel_mul_mat_q3_K_f32(
const int q_offset = 32*ip + l0; const int q_offset = 32*ip + l0;
const int y_offset = 128*ip + 32*il + l0; const int y_offset = 128*ip + 32*il + l0;
//float sumf = 0; const int step = sizeof(block_q3_K) * nb / 2;
float sumf1 = 0, sumf2 = 0;
for (int i = tpitg.x; i < nb; i += tptg.x) {
const float d_all = (float)(x[i].d); device const float * y1 = yy + ix*QK_K + y_offset;
device const uint8_t * q = x[i].qs + q_offset; float sumf1[2] = {0.f}, sumf2[2] = {0.f};
device const uint8_t * h = x[i].hmask + l0; for (int i = ix; i < nb; i += 2) {
device const float * y = yy + i * QK_K + y_offset;
device const uint16_t * a = (device const uint16_t *)x[i].scales; for (int l = 0; l < 8; ++l) {
yl[l+0] = y1[l+ 0];
yl[l+8] = y1[l+16];
}
device const uint16_t * q = (device const uint16_t *)(x[i].qs + q_offset);
device const uint16_t * h = (device const uint16_t *)(x[i].hmask + l0);
device const uint16_t * a = (device const uint16_t *)(x[i].scales);
device const half * dh = &x[i].d;
for (int row = 0; row < 2; ++row) {
const float d_all = (float)dh[0];
const char2 scales = as_type<char2>((uint16_t)(((a[il] >> s_shift1) & kmask2) | (((a[ik] >> s_shift2) & kmask1) << 4))); const char2 scales = as_type<char2>((uint16_t)(((a[il] >> s_shift1) & kmask2) | (((a[ik] >> s_shift2) & kmask1) << 4)));
float s = 0; float s1 = 0, s2 = 0;
for (int l = 0; l < n; ++l) { for (int l = 0; l < n; l += 2) {
s += y[l+ 0] * ((int8_t)((q[l+ 0] >> shift) & m3) - ((h[l+ 0] & m) ? 0 : m4)); const uint16_t qs = q[l/2];
s1 += yl[l+0] * ((int32_t)(qs & qm1) - ((h[l/2] & m1) ? 0 : v1));
s2 += yl[l+1] * ((int32_t)(qs & qm2) - ((h[l/2] & m2) ? 0 : v2));
} }
float d = d_all * s; float d = d_all * (s1 + 1.f/256.f * s2);
sumf1 += d * scales[0]; sumf1[row] += d * scales[0];
sumf2 += d; sumf2[row] += d;
//sumf += d_all * s * (scales[0] - 32);
s = 0; s1 = s2 = 0;
for (int l = 0; l < n; ++l) { for (int l = 0; l < n; l += 2) {
s += y[l+16] * ((int8_t)((q[l+16] >> shift) & m3) - ((h[l+16] & m) ? 0 : m4)); const uint16_t qs = q[l/2+8];
s1 += yl[l+8] * ((int32_t)(qs & qm1) - ((h[l/2+8] & m1) ? 0 : v1));
s2 += yl[l+9] * ((int32_t)(qs & qm2) - ((h[l/2+8] & m2) ? 0 : v2));
} }
d = d_all * s; d = d_all * (s1 + 1.f/256.f * s2);
sumf1 += d * scales[1]; sumf1[row] += d * scales[1];
sumf2 += d; sumf2[row] += d;
//sumf += d_all * s * (scales[1] - 32);
q += step;
h += step;
a += step;
dh += step;
} }
//sum[ith] = sumf; y1 += 2 * QK_K;
sum[ith] = sumf1 - 32.f*sumf2;
}
for (int row = 0; row < 2; ++row) {
const float sumf = (sumf1[row] - 32.f*sumf2[row]) / (1 << shift);
const float tot = simd_sum(sumf);
if (tiisg == 0) {
dst[r1*ne0 + first_row + row] = tot;
}
}
}
#else #else
const int il = 4 * tpitg.x; // 0, 4, 8, 12 kernel void kernel_mul_mat_q3_K_f32(
device const void * src0,
device const float * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne10,
constant int64_t & ne0,
constant int64_t & ne1,
uint2 tgpig[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int nb = ne00/QK_K;
const int64_t r0 = tgpig.x;
const int64_t r1 = tgpig.y;
const int row = 2 * r0 + sgitg;
device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb;
device const float * yy = (device const float *) src1 + r1*ne10;
const int ix = tiisg/4;
const int il = 4 * (tiisg%4);// 0, 4, 8, 12
const int im = il/8; // 0, 0, 1, 1 const int im = il/8; // 0, 0, 1, 1
const int in = il%8; // 0, 4, 0, 4 const int in = il%8; // 0, 4, 0, 4
float sumf = 0; float2 sum = {0.f, 0.f};
for (int i = tpitg.y; i < nb; i += tptg.y) { for (int i = ix; i < nb; i += 8) {
const float d_all = (float)(x[i].d); const float d_all = (float)(x[i].d);
device const uint8_t * q = x[i].qs + il; device const uint16_t * q = (device const uint16_t *)(x[i].qs + il);
device const uint8_t * h = x[i].hmask + in; device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in);
device const uint16_t * s = (device const uint16_t *)(x[i].scales);
device const float * y = yy + i * QK_K + il; device const float * y = yy + i * QK_K + il;
const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8);
const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f;
const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8); const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f;
const float d4 = d_all * ((x[i].scales[1] >> 4) - 8); const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f;
for (int l = 0; l < 4; ++l) { for (int l = 0; l < 4; l += 2) {
const uint8_t hm = h[l] >> im; const uint16_t hm = h[l/2] >> im;
sumf += y[l+ 0] * d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((hm & 0x01) ? 0 : 4)) sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 : 4))
+ y[l+16] * d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((hm & 0x04) ? 0 : 4)) + y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16))
+ y[l+32] * d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((hm & 0x10) ? 0 : 4)) + y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64))
+ y[l+48] * d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((hm & 0x40) ? 0 : 4)); + y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256));
sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024))
+ y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096))
+ y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384))
+ y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 0 : 65536));
} }
} }
const float sumf = sum[0] + sum[1] * 1.f/256.f;
sum[ith] = sumf; const float tot = simd_sum(sumf);
if (tiisg == 0) {
dst[r1*ne0 + row] = tot;
}
}
#endif #endif
//
// Accumulate the sum from all threads in the threadgroup
//
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%4 == 0) {
for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith%16 == 0) {
for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (ith == 0) {
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
dst[r1*ne0 + r0] = sum[0];
}
}
#if QK_K == 256 #if QK_K == 256
kernel void kernel_mul_mat_q4_K_f32( kernel void kernel_mul_mat_q4_K_f32(
device const void * src0, device const void * src0,
@ -1748,7 +1825,6 @@ kernel void kernel_mul_mat_q5_K_f32(
for (int i = ix; i < nb; i += 8) { for (int i = ix; i < nb; i += 8) {
float4 sumy = {0.f, 0.f, 0.f, 0.f};
for (int l = 0; l < 4; ++l) { for (int l = 0; l < 4; ++l) {
yl[l+0] = y[l+ 0]; yl[l+0] = y[l+ 0];
yl[l+4] = y[l+16]; yl[l+4] = y[l+16];

1397
ggml.c

File diff suppressed because it is too large Load diff

72
ggml.h
View file

@ -199,6 +199,7 @@
#define GGML_MAX_CONTEXTS 64 #define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 6 #define GGML_MAX_SRC 6
#define GGML_MAX_NAME 48 #define GGML_MAX_NAME 48
#define GGML_MAX_OP_PARAMS 32
#define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_N_THREADS 4
@ -329,16 +330,6 @@ extern "C" {
GGML_OP_ARGMAX, GGML_OP_ARGMAX,
GGML_OP_REPEAT, GGML_OP_REPEAT,
GGML_OP_REPEAT_BACK, GGML_OP_REPEAT_BACK,
GGML_OP_ABS,
GGML_OP_SGN,
GGML_OP_NEG,
GGML_OP_STEP,
GGML_OP_TANH,
GGML_OP_ELU,
GGML_OP_RELU,
GGML_OP_GELU,
GGML_OP_GELU_QUICK,
GGML_OP_SILU,
GGML_OP_SILU_BACK, GGML_OP_SILU_BACK,
GGML_OP_NORM, // normalize GGML_OP_NORM, // normalize
GGML_OP_RMS_NORM, GGML_OP_RMS_NORM,
@ -377,6 +368,8 @@ extern "C" {
GGML_OP_WIN_PART, GGML_OP_WIN_PART,
GGML_OP_WIN_UNPART, GGML_OP_WIN_UNPART,
GGML_OP_UNARY,
GGML_OP_MAP_UNARY, GGML_OP_MAP_UNARY,
GGML_OP_MAP_BINARY, GGML_OP_MAP_BINARY,
@ -390,6 +383,18 @@ extern "C" {
GGML_OP_COUNT, GGML_OP_COUNT,
}; };
enum ggml_unary_op {
GGML_UNARY_OP_ABS,
GGML_UNARY_OP_SGN,
GGML_UNARY_OP_NEG,
GGML_UNARY_OP_STEP,
GGML_UNARY_OP_TANH,
GGML_UNARY_OP_ELU,
GGML_UNARY_OP_RELU,
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_GELU_QUICK,
GGML_UNARY_OP_SILU,
};
// ggml object // ggml object
struct ggml_object { struct ggml_object {
@ -418,6 +423,9 @@ extern "C" {
// compute data // compute data
enum ggml_op op; enum ggml_op op;
// op params - allocated as int32_t for alignment
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];
bool is_param; bool is_param;
struct ggml_tensor * grad; struct ggml_tensor * grad;
@ -531,6 +539,7 @@ extern "C" {
GGML_API const char * ggml_type_name(enum ggml_type type); GGML_API const char * ggml_type_name(enum ggml_type type);
GGML_API const char * ggml_op_name (enum ggml_op op); GGML_API const char * ggml_op_name (enum ggml_op op);
GGML_API const char * ggml_op_symbol(enum ggml_op op);
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
@ -554,6 +563,7 @@ extern "C" {
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx); GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
@ -613,6 +623,8 @@ extern "C" {
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...); GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
@ -625,6 +637,11 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_dup_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_add( GGML_API struct ggml_tensor * ggml_add(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -849,14 +866,17 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_rms_norm( GGML_API struct ggml_tensor * ggml_rms_norm(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a,
float eps);
GGML_API struct ggml_tensor * ggml_rms_norm_inplace( GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a,
float eps);
// a - x // a - x
// b - dy // b - dy
// TODO: update with configurable eps
GGML_API struct ggml_tensor * ggml_rms_norm_back( GGML_API struct ggml_tensor * ggml_rms_norm_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -948,11 +968,22 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// a -> b, in-place, return view(b)
GGML_API struct ggml_tensor * ggml_cpy_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// make contiguous // make contiguous
GGML_API struct ggml_tensor * ggml_cont( GGML_API struct ggml_tensor * ggml_cont(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// make contiguous, in-place
GGML_API struct ggml_tensor * ggml_cont_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
// return view(a), b specifies the new shape // return view(a), b specifies the new shape
// TODO: when we start computing gradient, make a copy instead of view // TODO: when we start computing gradient, make a copy instead of view
GGML_API struct ggml_tensor * ggml_reshape( GGML_API struct ggml_tensor * ggml_reshape(
@ -1128,9 +1159,9 @@ extern "C" {
int n_past, int n_past,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx,
float freq_base, float freq_base,
float freq_scale, float freq_scale);
int n_ctx);
// rotary position embedding backward, i.e compute dx from dy // rotary position embedding backward, i.e compute dx from dy
// a - dy // a - dy
@ -1139,7 +1170,8 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, int n_past,
int n_dims, int n_dims,
int mode); int mode,
int n_ctx);
// alibi position embedding // alibi position embedding
// in-place, returns view(a) // in-place, returns view(a)
@ -1263,6 +1295,16 @@ extern "C" {
typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *); typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
GGML_API struct ggml_tensor * ggml_unary(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op);
GGML_API struct ggml_tensor * ggml_unary_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
enum ggml_unary_op op);
GGML_API struct ggml_tensor * ggml_map_unary_f32( GGML_API struct ggml_tensor * ggml_map_unary_f32(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,

6
grammars/arithmetic.gbnf Normal file
View file

@ -0,0 +1,6 @@
root ::= (expr "=" ws term "\n")+
expr ::= term ([-+*/] term)*
term ::= ident | num | "(" ws expr ")" ws
ident ::= [a-z] [a-z0-9_]* ws
num ::= [0-9]+ ws
ws ::= [ \t\n]*

13
grammars/chess.gbnf Normal file
View file

@ -0,0 +1,13 @@
# Specifies chess moves as a list in algebraic notation, using PGN conventions
# Force first move to "1. ", then any 1-2 digit number after, relying on model to follow the pattern
root ::= "1. " move " " move "\n" ([1-9] [0-9]? ". " move " " move "\n")+
move ::= (pawn | nonpawn | castle) [+#]?
# piece type, optional file/rank, optional capture, dest file & rank
nonpawn ::= [NBKQR] [a-h]? [1-8]? "x"? [a-h] [1-8]
# optional file & capture, dest file & rank, optional promotion
pawn ::= ([a-h] "x")? [a-h] [1-8] ("=" [NBKQR])?
castle ::= "O-O" "-O"?

7
grammars/japanese.gbnf Normal file
View file

@ -0,0 +1,7 @@
# A probably incorrect grammar for Japanese
root ::= jp-char+ ([ \t\n] jp-char+)*
jp-char ::= hiragana | katakana | punctuation | cjk
hiragana ::= [ぁ-ゟ]
katakana ::= [ァ-ヿ]
punctuation ::= [、-〾]
cjk ::= [一-鿿]

29
grammars/json.gbnf Normal file
View file

@ -0,0 +1,29 @@
# Grammar for subset of JSON - doesn't support full string or number syntax
root ::= object
value ::= object | array | string | number | boolean | "null"
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}"
array ::=
"[" ws (
value
("," ws value)*
)? "]"
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
# Only plain integers currently
number ::= "-"? [0-9]+ ws
boolean ::= ("true" | "false") ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?

4
grammars/list.gbnf Normal file
View file

@ -0,0 +1,4 @@
root ::= item+
# Excludes various line break characters
item ::= "- " [^\r\n\x0b\x0c\x85\u2028\u2029]+ "\n"

View file

@ -3297,8 +3297,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
#else #else
int8_t aux8[QK_K];
uint8_t aux8[QK_K];
int16_t aux16[16]; int16_t aux16[16];
float sums [8]; float sums [8];
memset(sums, 0, 8*sizeof(float)); memset(sums, 0, 8*sizeof(float));
@ -3308,7 +3307,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
const uint8_t * restrict q4 = x[i].qs; const uint8_t * restrict q4 = x[i].qs;
const uint8_t * restrict hm = x[i].qh; const uint8_t * restrict hm = x[i].qh;
const int8_t * restrict q8 = y[i].qs; const int8_t * restrict q8 = y[i].qs;
uint8_t * restrict a = aux8; int8_t * restrict a = aux8;
for (int l = 0; l < 32; ++l) { for (int l = 0; l < 32; ++l) {
a[l+ 0] = q4[l] & 0xF; a[l+ 0] = q4[l] & 0xF;
a[l+32] = q4[l] >> 4; a[l+32] = q4[l] >> 4;

585
llama.cpp
View file

@ -67,6 +67,7 @@ enum e_model {
MODEL_13B, MODEL_13B,
MODEL_30B, MODEL_30B,
MODEL_65B, MODEL_65B,
MODEL_70B,
}; };
static const size_t kB = 1024; static const size_t kB = 1024;
@ -98,18 +99,18 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
} }
// //
// memory sizes // memory sizes (calculated for n_batch == 512)
// //
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx) static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
{ {
static std::map<e_model, size_t> k_sizes = { static std::map<e_model, size_t> k_sizes = {
/* empirical scaling, still a guess */ { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB },
{ MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB }, { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
{ MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB }, { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
{ MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB }, { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
{ MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB }, { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
{ MODEL_65B, ((size_t) n_ctx / 8ull + 512ull) * MB }, { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
}; };
return k_sizes; return k_sizes;
} }
@ -117,38 +118,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1() static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
{ {
static std::map<e_model, size_t> k_sizes = { static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 256ull * MB }, { MODEL_3B, 128ull * MB },
{ MODEL_7B, 512ull * MB }, { MODEL_7B, 160ull * MB },
{ MODEL_13B, 512ull * MB }, { MODEL_13B, 192ull * MB },
{ MODEL_30B, 512ull * MB }, { MODEL_30B, 256ull * MB },
{ MODEL_65B, 1024ull * MB }, { MODEL_65B, 384ull * MB }, // guess
{ MODEL_70B, 304ull * MB },
}; };
return k_sizes; return k_sizes;
} }
// 2*n_embd*n_ctx*n_layer*sizeof(float16) // used to store the compute graph tensors + non-scratch data
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF() static const std::map<e_model, size_t> & MEM_REQ_EVAL()
{ {
static std::map<e_model, size_t> k_sizes = { static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, 682ull * MB }, { MODEL_3B, 8ull * MB },
{ MODEL_7B, 1026ull * MB }, { MODEL_7B, 10ull * MB },
{ MODEL_13B, 1608ull * MB }, { MODEL_13B, 12ull * MB },
{ MODEL_30B, 3124ull * MB }, { MODEL_30B, 16ull * MB },
{ MODEL_65B, 5120ull * MB }, { MODEL_65B, 24ull * MB }, // guess
}; { MODEL_70B, 24ull * MB },
return k_sizes;
}
// this is mostly needed for temporary mul_mat buffers to dequantize the data
// not actually needed if BLAS is disabled
static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
{
static std::map<e_model, size_t> k_sizes = {
{ MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB },
{ MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB },
{ MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
{ MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
{ MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
}; };
return k_sizes; return k_sizes;
} }
@ -163,6 +152,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
{ MODEL_13B, 640ull * kB }, { MODEL_13B, 640ull * kB },
{ MODEL_30B, 768ull * kB }, { MODEL_30B, 768ull * kB },
{ MODEL_65B, 1536ull * kB }, { MODEL_65B, 1536ull * kB },
{ MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
}; };
return k_sizes; return k_sizes;
} }
@ -177,6 +167,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
{ MODEL_13B, 160ull }, { MODEL_13B, 160ull },
{ MODEL_30B, 208ull }, { MODEL_30B, 208ull },
{ MODEL_65B, 416ull }, { MODEL_65B, 416ull },
{ MODEL_70B, 416ull }, // TODO (likely can be reduced)
}; };
return k_sizes; return k_sizes;
} }
@ -188,16 +179,43 @@ struct llama_hparams {
uint32_t n_embd = 4096; uint32_t n_embd = 4096;
uint32_t n_mult = 256; uint32_t n_mult = 256;
uint32_t n_head = 32; uint32_t n_head = 32;
uint32_t n_head_kv = 32;
uint32_t n_layer = 32; uint32_t n_layer = 32;
uint32_t n_rot = 64; uint32_t n_rot = 64;
// LLaMAv2
// TODO: load from model data hparams
float f_ffn_mult = 1.0f;
float f_rms_norm_eps = 1e-6f;
float rope_freq_base = 10000.0f; float rope_freq_base = 10000.0f;
float rope_freq_scale = 1.0f; float rope_freq_scale = 1.0f;
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
bool operator!=(const llama_hparams & other) const { bool operator!=(const llama_hparams & other) const {
return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
}
uint32_t n_gqa() const {
return n_head/n_head_kv;
}
uint32_t n_embd_head() const {
return n_embd/n_head;
}
uint32_t n_embd_gqa() const {
return n_embd/n_gqa();
}
size_t kv_size() const {
size_t result = 2ull;
result *= (size_t) n_embd_gqa();
result *= (size_t) n_ctx;
result *= (size_t) n_layer;
result *= sizeof(ggml_fp16_t);
return result;
} }
}; };
@ -505,6 +523,10 @@ struct llama_file_loader {
hparams.n_layer = file.read_u32(); hparams.n_layer = file.read_u32();
hparams.n_rot = file.read_u32(); hparams.n_rot = file.read_u32();
hparams.ftype = (enum llama_ftype) file.read_u32(); hparams.ftype = (enum llama_ftype) file.read_u32();
// LLaMAv2
// TODO: read from header
hparams.n_head_kv = hparams.n_head;
} }
void read_vocab() { void read_vocab() {
vocab.id_to_token.resize(hparams.n_vocab); vocab.id_to_token.resize(hparams.n_vocab);
@ -803,7 +825,7 @@ static bool kv_cache_init(
ggml_type wtype, ggml_type wtype,
int n_ctx, int n_ctx,
int n_gpu_layers) { int n_gpu_layers) {
const int n_embd = hparams.n_embd; const int n_embd = hparams.n_embd_gqa();
const int n_layer = hparams.n_layer; const int n_layer = hparams.n_layer;
const int64_t n_mem = n_layer*n_ctx; const int64_t n_mem = n_layer*n_ctx;
@ -847,9 +869,11 @@ struct llama_context_params llama_context_default_params() {
/*.seed =*/ LLAMA_DEFAULT_SEED, /*.seed =*/ LLAMA_DEFAULT_SEED,
/*.n_ctx =*/ 512, /*.n_ctx =*/ 512,
/*.n_batch =*/ 512, /*.n_batch =*/ 512,
/*.n_gqa =*/ 1,
/*.rms_norm_eps =*/ 1e-6f,
/*.gpu_layers =*/ 0, /*.gpu_layers =*/ 0,
/*.main_gpu =*/ 0, /*.main_gpu =*/ 0,
/*.tensor_split =*/ {0}, /*.tensor_split =*/ nullptr,
/*.rope_freq_base =*/ 10000.0f, /*.rope_freq_base =*/ 10000.0f,
/*.rope_freq_scale =*/ 1.0f, /*.rope_freq_scale =*/ 1.0f,
/*.progress_callback =*/ nullptr, /*.progress_callback =*/ nullptr,
@ -966,6 +990,7 @@ static const char *llama_model_type_name(e_model type) {
case MODEL_13B: return "13B"; case MODEL_13B: return "13B";
case MODEL_30B: return "30B"; case MODEL_30B: return "30B";
case MODEL_65B: return "65B"; case MODEL_65B: return "65B";
case MODEL_70B: return "70B";
default: LLAMA_ASSERT(false); default: LLAMA_ASSERT(false);
} }
} }
@ -976,6 +1001,8 @@ static void llama_model_load_internal(
llama_vocab & vocab, llama_vocab & vocab,
int n_ctx, int n_ctx,
int n_batch, int n_batch,
int n_gqa,
float rms_norm_eps,
int n_gpu_layers, int n_gpu_layers,
int main_gpu, int main_gpu,
const float * tensor_split, const float * tensor_split,
@ -997,8 +1024,12 @@ static void llama_model_load_internal(
model.hparams = ml->file_loader->hparams; model.hparams = ml->file_loader->hparams;
model.n_gpu_layers = n_gpu_layers; model.n_gpu_layers = n_gpu_layers;
llama_file_version file_version = ml->file_loader->file_version; llama_file_version file_version = ml->file_loader->file_version;
auto & hparams = model.hparams; auto & hparams = model.hparams;
// TODO: read from file
hparams.f_rms_norm_eps = rms_norm_eps;
{ {
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 26: model.type = e_model::MODEL_3B; break; case 26: model.type = e_model::MODEL_3B; break;
@ -1016,11 +1047,25 @@ static void llama_model_load_internal(
hparams.n_ctx = n_ctx; hparams.n_ctx = n_ctx;
// LLaMAv2
// TODO: temporary until GGUF
LLAMA_ASSERT(hparams.n_head % n_gqa == 0);
hparams.n_head_kv = hparams.n_head / n_gqa;
if (model.type == e_model::MODEL_65B && n_gqa == 8) {
fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
model.type = e_model::MODEL_70B;
hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
}
hparams.rope_freq_base = rope_freq_base; hparams.rope_freq_base = rope_freq_base;
hparams.rope_freq_scale = rope_freq_scale; hparams.rope_freq_scale = rope_freq_scale;
} }
const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; // ref: https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199
const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3;
const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw;
const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
//const uint32_t n_ff = 28672;
{ {
fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version));
@ -1029,12 +1074,15 @@ static void llama_model_load_internal(
fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd);
fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult);
fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa());
fprintf(stderr, "%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
} }
@ -1105,6 +1153,7 @@ static void llama_model_load_internal(
size_t vram_scratch = 0; size_t vram_scratch = 0;
{ {
const uint32_t n_embd = hparams.n_embd; const uint32_t n_embd = hparams.n_embd;
const uint32_t n_embd_gqa = hparams.n_embd_gqa();
const uint32_t n_layer = hparams.n_layer; const uint32_t n_layer = hparams.n_layer;
const uint32_t n_vocab = hparams.n_vocab; const uint32_t n_vocab = hparams.n_vocab;
@ -1155,8 +1204,8 @@ static void llama_model_load_internal(
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend); layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split); layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split);
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split); layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split);
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split); layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split);
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split); layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split);
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend); layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
@ -1186,11 +1235,11 @@ static void llama_model_load_internal(
mmapped_size - vram_weights + // weights in VRAM not in memory mmapped_size - vram_weights + // weights in VRAM not in memory
MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) + MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
MEM_REQ_SCRATCH1().at(model.type) + MEM_REQ_SCRATCH1().at(model.type) +
MEM_REQ_EVAL(hparams.n_ctx).at(model.type); MEM_REQ_EVAL().at(model.type);
// this is the memory required by one llama_state // this is the memory required by one llama_state
const size_t mem_required_state = const size_t mem_required_state =
scale*MEM_REQ_KV_SELF().at(model.type); scale*hparams.kv_size();
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@ -1231,7 +1280,7 @@ static void llama_model_load_internal(
fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
} else { } else {
fprintf(stderr, "%s: offloading v cache to GPU\n", __func__); fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2; vram_kv_cache += hparams.kv_size() / 2;
} }
} }
if (n_gpu_layers > (int) hparams.n_layer + 2) { if (n_gpu_layers > (int) hparams.n_layer + 2) {
@ -1239,7 +1288,7 @@ static void llama_model_load_internal(
fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__); fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
} else { } else {
fprintf(stderr, "%s: offloading k cache to GPU\n", __func__); fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2; vram_kv_cache += hparams.kv_size() / 2;
} }
} }
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
@ -1287,9 +1336,11 @@ static bool llama_model_load(
llama_vocab & vocab, llama_vocab & vocab,
int n_ctx, int n_ctx,
int n_batch, int n_batch,
int n_gqa,
float rms_norm_eps,
int n_gpu_layers, int n_gpu_layers,
int main_gpu, int main_gpu,
float * tensor_split, const float * tensor_split,
float rope_freq_base, float rope_freq_base,
float rope_freq_scale, float rope_freq_scale,
bool low_vram, bool low_vram,
@ -1300,7 +1351,7 @@ static bool llama_model_load(
llama_progress_callback progress_callback, llama_progress_callback progress_callback,
void *progress_callback_user_data) { void *progress_callback_user_data) {
try { try {
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type, llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
return true; return true;
} catch (const std::exception & err) { } catch (const std::exception & err) {
@ -1344,16 +1395,23 @@ static bool llama_eval_internal(
LLAMA_ASSERT(!!kv_self.ctx); LLAMA_ASSERT(!!kv_self.ctx);
const int n_embd = hparams.n_embd; const int64_t n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer; const int64_t n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx; const int64_t n_ctx = hparams.n_ctx;
const int n_head = hparams.n_head; const int64_t n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab; const int64_t n_head_kv = hparams.n_head_kv;
const int n_rot = hparams.n_embd/hparams.n_head; const int64_t n_embd_head = hparams.n_embd_head();
const int n_gpu_layers = model.n_gpu_layers; const int64_t n_vocab = hparams.n_vocab;
const int64_t n_embd_gqa = hparams.n_embd_gqa();
LLAMA_ASSERT(n_embd_head == hparams.n_rot);
const float freq_base = hparams.rope_freq_base; const float freq_base = hparams.rope_freq_base;
const float freq_scale = hparams.rope_freq_scale; const float freq_scale = hparams.rope_freq_scale;
const float rms_norm_eps = hparams.f_rms_norm_eps;
const int n_gpu_layers = model.n_gpu_layers;
auto & mem_per_token = lctx.mem_per_token; auto & mem_per_token = lctx.mem_per_token;
auto & buf_compute = lctx.buf_compute; auto & buf_compute = lctx.buf_compute;
@ -1431,7 +1489,7 @@ static bool llama_eval_internal(
// norm // norm
{ {
cur = ggml_rms_norm(ctx0, inpL); cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
offload_func(cur); offload_func(cur);
ggml_set_name(cur, "rms_norm_0"); ggml_set_name(cur, "rms_norm_0");
@ -1452,11 +1510,11 @@ static bool llama_eval_internal(
offload_func_kq(tmpq); offload_func_kq(tmpq);
ggml_set_name(tmpq, "tmpq"); ggml_set_name(tmpq, "tmpq");
struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0); struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
offload_func_kq(Kcur); offload_func_kq(Kcur);
ggml_set_name(Kcur, "Kcur"); ggml_set_name(Kcur, "Kcur");
struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0); struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
offload_func_kq(Qcur); offload_func_kq(Qcur);
ggml_set_name(Qcur, "Qcur"); ggml_set_name(Qcur, "Qcur");
@ -1468,17 +1526,17 @@ static bool llama_eval_internal(
offload_func_v(tmpv); offload_func_v(tmpv);
ggml_set_name(tmpv, "tmpv"); ggml_set_name(tmpv, "tmpv");
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N)); struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
offload_func_v(Vcur); offload_func_v(Vcur);
ggml_set_name(Vcur, "Vcur"); ggml_set_name(Vcur, "Vcur");
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
offload_func_kq(k); offload_func_kq(k);
ggml_set_name(k, "k"); ggml_set_name(k, "k");
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
( n_ctx)*ggml_element_size(kv_self.v), ( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
offload_func_v(v); offload_func_v(v);
ggml_set_name(v, "v"); ggml_set_name(v, "v");
@ -1497,8 +1555,8 @@ static bool llama_eval_internal(
struct ggml_tensor * K = struct ggml_tensor * K =
ggml_permute(ctx0, ggml_permute(ctx0,
ggml_reshape_3d(ctx0, ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa),
n_embd/n_head, n_head, n_past + N), n_embd_head, n_head_kv, n_past + N),
0, 2, 1, 3); 0, 2, 1, 3);
offload_func_kq(K); offload_func_kq(K);
ggml_set_name(K, "K"); ggml_set_name(K, "K");
@ -1508,9 +1566,9 @@ static bool llama_eval_internal(
offload_func_kq(KQ); offload_func_kq(KQ);
ggml_set_name(KQ, "KQ"); ggml_set_name(KQ, "KQ");
// KQ_scaled = KQ / sqrt(n_embd/n_head) // KQ_scaled = KQ / sqrt(n_embd_head)
struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)); struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)"); ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
// KQ_scaled shape [n_past + N, N, n_head, 1] // KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
@ -1530,10 +1588,10 @@ static bool llama_eval_internal(
// split cached V into n_head heads // split cached V into n_head heads
struct ggml_tensor * V = struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v, ggml_view_3d(ctx0, kv_self.v,
n_past + N, n_embd/n_head, n_head, n_past + N, n_embd_head, n_head_kv,
n_ctx*ggml_element_size(kv_self.v), n_ctx*ggml_element_size(kv_self.v),
n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head, n_ctx*ggml_element_size(kv_self.v)*n_embd_head,
il*n_ctx*ggml_element_size(kv_self.v)*n_embd); n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il);
offload_func_v(V); offload_func_v(V);
ggml_set_name(V, "V"); ggml_set_name(V, "V");
@ -1545,7 +1603,7 @@ static bool llama_eval_internal(
// make V contiguous in memory to speed up the matmul, however we waste time on the copy // make V contiguous in memory to speed up the matmul, however we waste time on the copy
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
// is there a better way? // is there a better way?
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head)); struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
#endif #endif
@ -1579,7 +1637,7 @@ static bool llama_eval_internal(
{ {
// norm // norm
{ {
cur = ggml_rms_norm(ctx0, inpFF); cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
offload_func(cur); offload_func(cur);
ggml_set_name(cur, "rms_norm_1"); ggml_set_name(cur, "rms_norm_1");
@ -1632,7 +1690,7 @@ static bool llama_eval_internal(
// norm // norm
{ {
cur = ggml_rms_norm(ctx0, inpL); cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
offload_func_nr(cur); offload_func_nr(cur);
ggml_set_name(cur, "rms_norm_2"); ggml_set_name(cur, "rms_norm_2");
@ -1739,10 +1797,12 @@ static bool llama_eval_internal(
} }
#if 0 #if 0
printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__, printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
ggml_used_mem(ctx0)/1024.0/1024.0, ggml_used_mem(ctx0)/1024.0/1024.0,
lctx.get_buf_max_mem(0)/1024.0/1024.0, lctx.get_buf_max_mem(0)/1024.0/1024.0,
lctx.get_buf_max_mem(1)/1024.0/1024.0); lctx.get_buf_max_mem(1)/1024.0/1024.0,
lctx.work_buffer.size()/1024.0/1024.0,
n_past, N);
#endif #endif
ggml_free(ctx0); ggml_free(ctx0);
@ -1915,6 +1975,279 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
return output; return output;
} }
//
// grammar - internal
//
struct llama_grammar {
const std::vector<std::vector<llama_grammar_element>> rules;
std::vector<std::vector<const llama_grammar_element *>> stacks;
};
struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
};
// NOTE: assumes valid utf8 (but checks for overrun)
// adds a terminating 0 for use as pointer
std::vector<uint32_t> decode_utf8(const char * src) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
const char * pos = src;
std::vector<uint32_t> code_points;
while (*pos != 0) {
uint8_t first_byte = static_cast<uint8_t>(*pos);
uint8_t highbits = first_byte >> 4;
int len = lookup[highbits];
uint8_t mask = (1 << (8 - len)) - 1;
uint32_t value = first_byte & mask;
const char * end = pos + len; // may overrun!
++pos;
for ( ; pos < end && *pos != 0; ++pos) {
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
}
code_points.push_back(value);
}
code_points.push_back(0);
return code_points;
}
// returns true iff pos points to the end of one of the definitions of a rule
static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
switch (pos->type) {
case LLAMA_GRETYPE_END: return true;
case LLAMA_GRETYPE_ALT: return true;
default: return false;
}
}
// returns true iff chr satisfies the char range at pos (regular or inverse range)
// asserts that pos is pointing to a char range element
static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
const llama_grammar_element * pos,
const uint32_t chr) {
bool found = false;
bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
LLAMA_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
do {
if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
// inclusive range, e.g. [a-z]
found = found || (pos->value <= chr && chr <= pos[1].value);
pos += 2;
} else {
// exact char match, e.g. [a] or "a"
found = found || pos->value == chr;
pos += 1;
}
} while (pos->type == LLAMA_GRETYPE_CHAR_ALT);
return std::make_pair(found == is_positive_char, pos);
}
// transforms a grammar pushdown stack into N possible stacks, all ending
// at a character range (terminal element)
static void llama_grammar_advance_stack(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<const llama_grammar_element *> & stack,
std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
if (stack.empty()) {
new_stacks.push_back(stack);
return;
}
const llama_grammar_element * pos = stack.back();
switch (pos->type) {
case LLAMA_GRETYPE_RULE_REF: {
const size_t rule_id = static_cast<size_t>(pos->value);
const llama_grammar_element * subpos = rules[rule_id].data();
do {
// init new stack without the top (pos)
std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
if (!llama_grammar_is_end_of_sequence(pos + 1)) {
// if this rule ref is followed by another element, add that to stack
new_stack.push_back(pos + 1);
}
if (!llama_grammar_is_end_of_sequence(subpos)) {
// if alternate is nonempty, add to stack
new_stack.push_back(subpos);
}
llama_grammar_advance_stack(rules, new_stack, new_stacks);
while (!llama_grammar_is_end_of_sequence(subpos)) {
// scan to end of alternate def
subpos++;
}
if (subpos->type == LLAMA_GRETYPE_ALT) {
// there's another alternate def of this rule to process
subpos++;
} else {
break;
}
} while (true);
break;
}
case LLAMA_GRETYPE_CHAR:
case LLAMA_GRETYPE_CHAR_NOT:
new_stacks.push_back(stack);
break;
default:
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
// those
LLAMA_ASSERT(false);
}
}
// takes a set of possible pushdown stacks on a grammar, which are required to
// be positioned at a character range (see `llama_grammar_advance_stack`), and
// produces the N possible stacks if the given char is accepted at those
// positions
static std::vector<std::vector<const llama_grammar_element *>> llama_grammar_accept(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
const uint32_t chr) {
std::vector<std::vector<const llama_grammar_element *>> new_stacks;
for (const auto & stack : stacks) {
if (stack.empty()) {
continue;
}
auto match = llama_grammar_match_char(stack.back(), chr);
if (match.first) {
const llama_grammar_element * pos = match.second;
// update top of stack to next element, if any
std::vector<const llama_grammar_element *> new_stack(stack.begin(), stack.end() - 1);
if (!llama_grammar_is_end_of_sequence(pos)) {
new_stack.push_back(pos);
}
llama_grammar_advance_stack(rules, new_stack, new_stacks);
}
}
return new_stacks;
}
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
const std::vector<llama_grammar_candidate> & candidates);
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<const llama_grammar_element *> & stack,
const std::vector<llama_grammar_candidate> & candidates) {
std::vector<llama_grammar_candidate> rejects;
if (stack.empty()) {
// accept nothing; EOS is handled elsewhere
rejects.insert(rejects.end(), candidates.begin(), candidates.end());
return rejects;
}
const llama_grammar_element * stack_pos = stack.back();
std::vector<llama_grammar_candidate> next_candidates;
for (auto tok : candidates) {
if (llama_grammar_match_char(stack_pos, tok.code_points[0]).first) {
if (tok.code_points[1] != 0) {
next_candidates.push_back({ tok.index, tok.code_points + 1 });
}
} else {
rejects.push_back(tok);
}
}
auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
// update top of stack to next element, if any
std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
if (!llama_grammar_is_end_of_sequence(stack_pos_after)) {
stack_after.push_back(stack_pos_after);
}
std::vector<std::vector<const llama_grammar_element *>> next_stacks;
llama_grammar_advance_stack(rules, stack_after, next_stacks);
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
for (auto tok : next_rejects) {
rejects.push_back({ tok.index, tok.code_points - 1 });
}
return rejects;
}
static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
const std::vector<llama_grammar_candidate> & candidates) {
LLAMA_ASSERT(!stacks.empty()); // REVIEW
if (candidates.empty()) {
return std::vector<llama_grammar_candidate>();
}
auto rejects = llama_grammar_reject_candidates_for_stack(rules, stacks.front(), candidates);
for (size_t i = 1, size = stacks.size(); i < size; ++i) {
rejects = llama_grammar_reject_candidates_for_stack(rules, stacks[i], rejects);
}
return rejects;
}
//
// grammar - external
//
struct llama_grammar * llama_grammar_init(
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index) {
const llama_grammar_element * pos;
// copy rule definitions into vectors
std::vector<std::vector<llama_grammar_element>> vec_rules(n_rules);
for (size_t i = 0; i < n_rules; i++) {
for (pos = rules[i]; pos->type != LLAMA_GRETYPE_END; pos++) {
vec_rules[i].push_back(*pos);
}
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
}
// loop over alternates of start rule to build initial stacks
std::vector<std::vector<const llama_grammar_element *>> stacks;
pos = rules[start_rule_index];
do {
std::vector<const llama_grammar_element *> stack;
if (!llama_grammar_is_end_of_sequence(pos)) {
// if alternate is nonempty, add to stack
stack.push_back(pos);
}
llama_grammar_advance_stack(vec_rules, stack, stacks);
while (!llama_grammar_is_end_of_sequence(pos)) {
// scan to end of alternate def
pos++;
}
if (pos->type == LLAMA_GRETYPE_ALT) {
// there's another alternate def of this rule to process
pos++;
} else {
break;
}
} while (true);
return new llama_grammar{ std::move(vec_rules), std::move(stacks) };
}
void llama_grammar_free(struct llama_grammar * grammar) {
delete grammar;
}
// //
// sampling // sampling
// //
@ -2200,6 +2533,47 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
} }
} }
void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
assert(ctx);
const int64_t t_start_sample_us = ggml_time_us();
bool allow_eos = false;
for (const auto & stack : grammar->stacks) {
if (stack.empty()) {
allow_eos = true;
break;
}
}
const llama_token eos = llama_token_eos();
std::vector<std::vector<uint32_t>> candidates_decoded;
std::vector<llama_grammar_candidate> candidates_grammar;
for (size_t i = 0; i < candidates->size; ++i) {
const llama_token id = candidates->data[i].id;
const char * str = llama_token_to_str(ctx, id);
if (id == eos) {
if (!allow_eos) {
candidates->data[i].logit = -INFINITY;
}
} else if (*str == 0) {
candidates->data[i].logit = -INFINITY;
} else {
candidates_decoded.push_back(decode_utf8(str));
candidates_grammar.push_back({ i, candidates_decoded.back().data() });
}
}
const auto rejects =
llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
for (auto & reject : rejects) {
candidates->data[reject.index].logit = -INFINITY;
}
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
static void llama_log_softmax(float * array, size_t size) { static void llama_log_softmax(float * array, size_t size) {
float max_l = *std::max_element(array, array + size); float max_l = *std::max_element(array, array + size);
float sum = 0.f; float sum = 0.f;
@ -2218,8 +2592,7 @@ void llama_sample_classifier_free_guidance(
struct llama_context * ctx, struct llama_context * ctx,
llama_token_data_array * candidates, llama_token_data_array * candidates,
struct llama_context * guidance_ctx, struct llama_context * guidance_ctx,
float scale, float scale) {
float smooth_factor) {
int64_t t_start_sample_us = ggml_time_us(); int64_t t_start_sample_us = ggml_time_us();
assert(ctx); assert(ctx);
@ -2240,16 +2613,7 @@ void llama_sample_classifier_free_guidance(
for (int i = 0; i < n_vocab; ++i) { for (int i = 0; i < n_vocab; ++i) {
float logit_guidance = logits_guidance[i]; float logit_guidance = logits_guidance[i];
float logit_base = logits_base[i]; float logit_base = logits_base[i];
logits_guidance[i] = scale * (logit_base - logit_guidance) + logit_guidance; candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
}
llama_log_softmax(logits_guidance, n_vocab);
for (int i = 0; i < n_vocab; ++i) {
float logit_base = logits_base[i];
float logit_guidance = logits_guidance[i];
candidates->data[i].logit = smooth_factor * logit_guidance + (1.f - smooth_factor) * logit_base;
} }
if (ctx) { if (ctx) {
@ -2385,6 +2749,29 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
return result; return result;
} }
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
const int64_t t_start_sample_us = ggml_time_us();
if (token == llama_token_eos()) {
for (const auto & stack : grammar->stacks) {
if (stack.empty()) {
return;
}
}
LLAMA_ASSERT(false);
}
const char * str = llama_token_to_str(ctx, token);
// Note terminating 0 in decoded string
auto code_points = decode_utf8(str);
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
}
LLAMA_ASSERT(!grammar->stacks.empty());
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
// //
// quantization // quantization
// //
@ -2543,16 +2930,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else { } else {
new_type = quantized_type; new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS #ifdef GGML_USE_K_QUANTS
bool convert_incompatible_tensor = false;
if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
int nx = tensor.ne.at(0);
int ny = tensor.ne.at(1);
if (nx % QK_K != 0 || ny % QK_K != 0) {
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
convert_incompatible_tensor = true;
}
}
if (tensor.name == "output.weight") { if (tensor.name == "output.weight") {
int nx = tensor.ne.at(0); int nx = tensor.ne.at(0);
int ny = tensor.ne.at(1); int ny = tensor.ne.at(1);
@ -2578,6 +2955,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
} }
bool convert_incompatible_tensor = false;
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
int nx = tensor.ne.at(0);
int ny = tensor.ne.at(1);
if (nx % QK_K != 0 || ny % QK_K != 0) {
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
convert_incompatible_tensor = true;
}
}
if (convert_incompatible_tensor) { if (convert_incompatible_tensor) {
if (tensor.name == "output.weight") { if (tensor.name == "output.weight") {
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
@ -2604,7 +2991,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
f32_data = (float *) f32_conv_buf.addr; f32_data = (float *) f32_conv_buf.addr;
} }
printf("quantizing .. "); printf("quantizing to %s .. ", ggml_type_name(new_type));
fflush(stdout); fflush(stdout);
work.resize(nelements * 4); // upper bound on size work.resize(nelements * 4); // upper bound on size
@ -2707,7 +3094,7 @@ struct llama_model * llama_load_model_from_file(
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram, params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
params.progress_callback_user_data)) { params.progress_callback_user_data)) {
@ -2785,7 +3172,7 @@ struct llama_context * llama_new_context_with_model(
ctx->embedding.resize(hparams.n_embd); ctx->embedding.resize(hparams.n_embd);
} }
ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type)); ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
@ -2809,7 +3196,7 @@ struct llama_context * llama_new_context_with_model(
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
#define LLAMA_METAL_CHECK_BUF(result) \ #define LLAMA_METAL_CHECK_BUF(result) \
if (!(result)) { \ if (!(result)) { \

58
llama.h
View file

@ -86,9 +86,12 @@ extern "C" {
uint32_t seed; // RNG seed, -1 for random uint32_t seed; // RNG seed, -1 for random
int32_t n_ctx; // text context int32_t n_ctx; // text context
int32_t n_batch; // prompt processing batch size int32_t n_batch; // prompt processing batch size
int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
int32_t n_gpu_layers; // number of layers to store in VRAM int32_t n_gpu_layers; // number of layers to store in VRAM
int32_t main_gpu; // the GPU that is used for scratch and small tensors int32_t main_gpu; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
// ref: https://github.com/ggerganov/llama.cpp/pull/2054 // ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency float rope_freq_base; // RoPE base frequency
@ -139,6 +142,40 @@ extern "C" {
bool quantize_output_tensor; // quantize output.weight bool quantize_output_tensor; // quantize output.weight
} llama_model_quantize_params; } llama_model_quantize_params;
// grammar types
struct llama_grammar;
// grammar element type
enum llama_gretype {
// end of rule definition
LLAMA_GRETYPE_END = 0,
// start of alternate definition for rule
LLAMA_GRETYPE_ALT = 1,
// non-terminal element: reference to rule
LLAMA_GRETYPE_RULE_REF = 2,
// terminal element: character (code point)
LLAMA_GRETYPE_CHAR = 3,
// inverse char(s) ([^a], [^a-b] [^abc])
LLAMA_GRETYPE_CHAR_NOT = 4,
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
// be an inclusive range ([a-z])
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
// modifies a preceding LLAMA_GRETYPE_CHAR or
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
LLAMA_GRETYPE_CHAR_ALT = 6,
};
typedef struct llama_grammar_element {
enum llama_gretype type;
uint32_t value; // Unicode code point or rule ID
} llama_grammar_element;
// performance timing information // performance timing information
struct llama_timings { struct llama_timings {
double t_start_ms; double t_start_ms;
@ -331,6 +368,15 @@ extern "C" {
LLAMA_API llama_token llama_token_eos(); // end-of-sentence LLAMA_API llama_token llama_token_eos(); // end-of-sentence
LLAMA_API llama_token llama_token_nl(); // next-line LLAMA_API llama_token llama_token_nl(); // next-line
// Grammar
//
LLAMA_API struct llama_grammar * llama_grammar_init(
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index);
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
// Sampling functions // Sampling functions
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
@ -343,13 +389,11 @@ extern "C" {
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
/// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
/// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
LLAMA_API void llama_sample_classifier_free_guidance( LLAMA_API void llama_sample_classifier_free_guidance(
struct llama_context * ctx, struct llama_context * ctx,
llama_token_data_array * candidates, llama_token_data_array * candidates,
struct llama_context * guidance_ctx, struct llama_context * guidance_ctx,
float scale, float scale);
float smooth_factor);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
@ -367,6 +411,9 @@ extern "C" {
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
/// @details Apply constraints from grammar
LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@ -388,6 +435,9 @@ extern "C" {
/// @details Randomly selects a token from the candidates based on their probabilities. /// @details Randomly selects a token from the candidates based on their probabilities.
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
// Performance information // Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
LLAMA_API void llama_print_timings(struct llama_context * ctx); LLAMA_API void llama_print_timings(struct llama_context * ctx);

View file

@ -64,7 +64,7 @@ void get_random_dims(int64_t * dims, int ndims) {
} }
} }
struct ggml_tensor * get_random_tensor( struct ggml_tensor * get_random_tensor_f32(
struct ggml_context * ctx0, struct ggml_context * ctx0,
int ndims, int ndims,
int64_t ne[], int64_t ne[],
@ -112,7 +112,55 @@ struct ggml_tensor * get_random_tensor(
return result; return result;
} }
struct ggml_tensor * get_random_tensor_int( struct ggml_tensor * get_random_tensor_f16(
struct ggml_context * ctx0,
int ndims,
int64_t ne[],
float fmin,
float fmax) {
struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne);
switch (ndims) {
case 1:
for (int i0 = 0; i0 < ne[0]; i0++) {
((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
}
break;
case 2:
for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) {
((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
}
}
break;
case 3:
for (int i2 = 0; i2 < ne[2]; i2++) {
for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) {
((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
}
}
}
break;
case 4:
for (int i3 = 0; i3 < ne[3]; i3++) {
for (int i2 = 0; i2 < ne[2]; i2++) {
for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) {
((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin);
}
}
}
}
break;
default:
assert(false);
};
return result;
}
struct ggml_tensor * get_random_tensor_i32(
struct ggml_context * ctx0, struct ggml_context * ctx0,
int ndims, int ndims,
int64_t ne[], int64_t ne[],
@ -160,23 +208,6 @@ struct ggml_tensor * get_random_tensor_int(
return result; return result;
} }
float get_element(const struct ggml_tensor * t, int idx) {
if (t->type == GGML_TYPE_F32) {
return ((float *)t->data)[idx];
}
if (t->type == GGML_TYPE_I32) {
return ((int32_t *)t->data)[idx];
}
assert(false);
return INFINITY;
}
void set_element(struct ggml_tensor * t, int idx, float value) {
((float *)t->data)[idx] = value;
}
void print_elements(const char* label, const struct ggml_tensor * t) { void print_elements(const char* label, const struct ggml_tensor * t) {
if (!t) { if (!t) {
printf("%s: %s = null\n", __func__, label); printf("%s: %s = null\n", __func__, label);
@ -186,7 +217,7 @@ void print_elements(const char* label, const struct ggml_tensor * t) {
printf("%s: %s = [", __func__, label); printf("%s: %s = [", __func__, label);
for (int k = 0; k < nelements; ++k) { for (int k = 0; k < nelements; ++k) {
if (k > 0) { printf(", "); } if (k > 0) { printf(", "); }
printf("%.5f", get_element(t, k)); printf("%.5f", ggml_get_f32_1d(t, k));
} }
printf("] shape: ["); printf("] shape: [");
for (int k = 0; k < t->n_dims; ++k) { for (int k = 0; k < t->n_dims; ++k) {
@ -237,23 +268,23 @@ bool check_gradient(
const int nelements = ggml_nelements(x[i]); const int nelements = ggml_nelements(x[i]);
for (int k = 0; k < nelements; ++k) { for (int k = 0; k < nelements; ++k) {
// compute gradient using finite differences // compute gradient using finite differences
const float x0 = get_element(x[i], k); const float x0 = ggml_get_f32_1d(x[i], k);
const float xm = x0 - eps; const float xm = x0 - eps;
const float xp = x0 + eps; const float xp = x0 + eps;
set_element(x[i], k, xp); ggml_set_f32_1d(x[i], k, xp);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
const float f0 = ggml_get_f32_1d(f, 0); const float f0 = ggml_get_f32_1d(f, 0);
set_element(x[i], k, xm); ggml_set_f32_1d(x[i], k, xm);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
const float f1 = ggml_get_f32_1d(f, 0); const float f1 = ggml_get_f32_1d(f, 0);
const float g0 = (f0 - f1)/(2.0f*eps); const float g0 = (f0 - f1)/(2.0f*eps);
set_element(x[i], k, x0); ggml_set_f32_1d(x[i], k, x0);
// compute gradient using backward graph // compute gradient using backward graph
ggml_graph_reset (&gf); ggml_graph_reset (&gf);
@ -261,7 +292,7 @@ bool check_gradient(
ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
const float g1 = get_element(x[i]->grad, k); const float g1 = ggml_get_f32_1d(x[i]->grad, k);
const float error_abs = fabsf(g0 - g1); const float error_abs = fabsf(g0 - g1);
const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0; const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
@ -392,19 +423,35 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * x[MAX_NARGS]; struct ggml_tensor * x[MAX_NARGS];
// add // add f32
{ {
const int nargs = 2; const int nargs = 2;
for (int ndims = 1; ndims <= 4; ++ndims) { for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]); ggml_set_param(ctx0, x[i]);
} }
struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
check_gradient("add", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f); check_gradient("add f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f);
}
}
// add f16
{
const int nargs = 2;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1]));
check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f);
} }
} }
@ -414,7 +461,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 4; ++ndims) { for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]); ggml_set_param(ctx0, x[i]);
} }
@ -430,7 +477,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 4; ++ndims) { for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]); ggml_set_param(ctx0, x[i]);
} }
@ -446,7 +493,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 4; ++ndims) { for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, 0.5f, 1.0f); x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f);
ggml_set_param(ctx0, x[i]); ggml_set_param(ctx0, x[i]);
} }
@ -462,7 +509,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 2; ++ndims) { for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]); ggml_set_param(ctx0, x[i]);
} }
@ -478,7 +525,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 2; ++ndims) { for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f); x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
ggml_set_param(ctx0, x[i]); ggml_set_param(ctx0, x[i]);
} }
@ -494,7 +541,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 2; ++ndims) { for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f); x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f);
ggml_set_param(ctx0, x[i]); ggml_set_param(ctx0, x[i]);
} }
@ -510,7 +557,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 2; ++ndims) { for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]); ggml_set_param(ctx0, x[i]);
} }
@ -527,7 +574,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 4; ++ndims) { for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]); ggml_set_param(ctx0, x[i]);
} }
@ -537,6 +584,40 @@ int main(int argc, const char ** argv) {
} }
} }
// mean, not yet fully implemented
if(0)
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0]));
check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// argmax
if (0)
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0]));
check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// repeat // repeat
{ {
int64_t ne2[4]; int64_t ne2[4];
@ -549,15 +630,36 @@ int main(int argc, const char ** argv) {
const int nargs = 1; const int nargs = 1;
for (int ndims = 1; ndims <= 2; ++ndims) { for (int ndims = 1; ndims <= 2; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f); x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1])))); struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1]))));
check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
} }
}
// repeat back
{
int64_t ne2[4];
get_random_dims(ne2, 4);
ne2[0] = ne[0] * ne2[0];
ne2[1] = ne[1] * ne2[1];
ne2[2] = 1;
ne2[3] = 1;
const int nargs = 1;
for (int ndims = 1; ndims <= 2; ++ndims) {
x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0]))));
check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY);
}
} }
// abs (finite differences do not work) // abs (finite differences do not work)
@ -566,7 +668,7 @@ int main(int argc, const char ** argv) {
// for (int ndims = 1; ndims <= 2; ++ndims) { // for (int ndims = 1; ndims <= 2; ++ndims) {
// for (int i = 0; i < nargs; ++i) { // for (int i = 0; i < nargs; ++i) {
// x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); // x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
// ggml_set_param(ctx0, x[i]); // ggml_set_param(ctx0, x[i]);
// } // }
@ -576,17 +678,82 @@ int main(int argc, const char ** argv) {
// } // }
//} //}
// sgn
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0]));
check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// neg
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0]));
check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// step
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0]));
check_gradient("step", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// tanh, not yet fully implemented
if(0)
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0]));
check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// mul_mat // mul_mat
{ {
const int nargs = 2; const int nargs = 2;
for (int ndims = 2; ndims <= 2; ++ndims) { for (int ndims = 2; ndims <= 2; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
{ {
int64_t ne2[4]; int64_t ne2[4];
get_random_dims(ne2, 4); get_random_dims(ne2, 4);
ne2[0] = ne[0]; ne2[0] = ne[0];
x[1] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f); x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
} }
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
@ -602,13 +769,63 @@ int main(int argc, const char ** argv) {
} }
} }
// elu, not yet fully implemented
if(0)
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0]));
check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// relu
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0]));
check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
}
}
// gelu, not yet fully implemented
if(0)
{
const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0]));
check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f);
}
}
// silu // silu
{ {
const int nargs = 1; const int nargs = 1;
for (int ndims = 1; ndims <= 2; ++ndims) { for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]); ggml_set_param(ctx0, x[i]);
} }
@ -629,11 +846,11 @@ int main(int argc, const char ** argv) {
for (int ndims = 1; ndims <= 2; ++ndims) { for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]); ggml_set_param(ctx0, x[i]);
} }
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f));
check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY); check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY);
} }
@ -647,8 +864,8 @@ int main(int argc, const char ** argv) {
ne2[0] = 1; ne2[0] = 1;
for (int ndims = 1; ndims <= 2; ++ndims) { for (int ndims = 1; ndims <= 2; ++ndims) {
x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f); x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
ggml_set_param(ctx0, x[1]); ggml_set_param(ctx0, x[1]);
@ -659,20 +876,37 @@ int main(int argc, const char ** argv) {
} }
} }
// cpy // cpy f32
{ {
const int nargs = 2; const int nargs = 2;
for (int ndims = 1; ndims <= 2; ++ndims) { for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) { for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]); ggml_set_param(ctx0, x[i]);
} }
// x[1] is overwritten by x[0], so the gradients don't propagate to x[1] // x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
check_gradient("cpy", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
}
}
// cpy f16
{
const int nargs = 2;
for (int ndims = 1; ndims <= 2; ++ndims) {
for (int i = 0; i < nargs; ++i) {
x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[i]);
}
// x[1] is overwritten by x[0], so the gradients don't propagate to x[1]
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1]));
check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
} }
} }
@ -689,8 +923,8 @@ int main(int argc, const char ** argv) {
for (int i = 0; i < ndims; ++i) { for (int i = 0; i < ndims; ++i) {
ne2[0] *= ne[i]; ne2[0] *= ne[i];
} }
x[0] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
@ -712,8 +946,8 @@ int main(int argc, const char ** argv) {
for (int i = 0; i < ndims; ++i) { for (int i = 0; i < ndims; ++i) {
ne2[0] *= ne[i]; ne2[0] *= ne[i];
} }
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f); x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
@ -729,7 +963,7 @@ int main(int argc, const char ** argv) {
const int nargs = 2; const int nargs = 2;
for (int ndims = 1; ndims <= 4; ++ndims) { for (int ndims = 1; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
get_random_dims(ne2, 1); get_random_dims(ne2, 1);
@ -737,7 +971,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 1); get_random_dims(ne2, 1);
} }
x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f); x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[1]); ggml_set_param(ctx0, x[1]);
const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1])); const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
@ -758,7 +992,7 @@ int main(int argc, const char ** argv) {
const int nargs = 2; const int nargs = 2;
for (int ndims = 2; ndims <= 4; ++ndims) { for (int ndims = 2; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
get_random_dims(ne2, 2); get_random_dims(ne2, 2);
@ -766,7 +1000,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 2); get_random_dims(ne2, 2);
} }
x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f); x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[1]); ggml_set_param(ctx0, x[1]);
max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@ -790,7 +1024,7 @@ int main(int argc, const char ** argv) {
const int nargs = 2; const int nargs = 2;
for (int ndims = 3; ndims <= 4; ++ndims) { for (int ndims = 3; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
get_random_dims(ne2, 3); get_random_dims(ne2, 3);
@ -798,7 +1032,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 3); get_random_dims(ne2, 3);
} }
x[1] = get_random_tensor(ctx0, 3, ne2, -1.0f, 1.0f); x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[1]); ggml_set_param(ctx0, x[1]);
max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@ -824,7 +1058,7 @@ int main(int argc, const char ** argv) {
const int nargs = 2; const int nargs = 2;
for (int ndims = 4; ndims <= 4; ++ndims) { for (int ndims = 4; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
get_random_dims(ne2, 4); get_random_dims(ne2, 4);
@ -832,7 +1066,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 4); get_random_dims(ne2, 4);
} }
x[1] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f); x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[1]); ggml_set_param(ctx0, x[1]);
max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@ -858,7 +1092,7 @@ int main(int argc, const char ** argv) {
const int nargs = 2; const int nargs = 2;
for (int ndims = 1; ndims <= 4; ++ndims) { for (int ndims = 1; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
get_random_dims(ne2, 1); get_random_dims(ne2, 1);
@ -866,7 +1100,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 1); get_random_dims(ne2, 1);
} }
x[1] = get_random_tensor(ctx0, 1, ne2, -1.0f, 1.0f); x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[1]); ggml_set_param(ctx0, x[1]);
const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1])); const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1]));
@ -887,7 +1121,7 @@ int main(int argc, const char ** argv) {
const int nargs = 1; const int nargs = 1;
for (int ndims = 2; ndims <= 4; ++ndims) { for (int ndims = 2; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
get_random_dims(ne2, 2); get_random_dims(ne2, 2);
@ -895,7 +1129,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 2); get_random_dims(ne2, 2);
} }
x[1] = get_random_tensor(ctx0, 2, ne2, -1.0f, 1.0f); x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[1]); ggml_set_param(ctx0, x[1]);
max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]);
@ -915,7 +1149,7 @@ int main(int argc, const char ** argv) {
const int nargs = 1; const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) { for (int ndims = 1; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
@ -941,7 +1175,7 @@ int main(int argc, const char ** argv) {
const int nargs = 1; const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) { for (int ndims = 1; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
get_random_dims(ne2, 2); get_random_dims(ne2, 2);
while (ne2[0]*ne2[1] > ggml_nelements(x[0])) { while (ne2[0]*ne2[1] > ggml_nelements(x[0])) {
@ -971,7 +1205,7 @@ int main(int argc, const char ** argv) {
const int nargs = 1; const int nargs = 1;
for (int ndims = 1; ndims <= 4; ++ndims) { for (int ndims = 1; ndims <= 4; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
get_random_dims(ne2, 3); get_random_dims(ne2, 3);
while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) { while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) {
@ -1010,7 +1244,7 @@ int main(int argc, const char ** argv) {
for (int i=ndims; i<4; ++i) { for (int i=ndims; i<4; ++i) {
ne2[i] = 1; ne2[i] = 1;
} }
x[0] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
@ -1043,7 +1277,7 @@ int main(int argc, const char ** argv) {
for (int i=ndims; i<4; ++i) { for (int i=ndims; i<4; ++i) {
ne2[i] = 1; ne2[i] = 1;
} }
x[0] = get_random_tensor(ctx0, 4, ne2, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
@ -1060,8 +1294,8 @@ int main(int argc, const char ** argv) {
int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1}; int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
const int nargs = 1; const int nargs = 1;
const int ndims = 2; const int ndims = 2;
x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor_int(ctx0, 1, ne3, 0, ne2[1]); x[1] = get_random_tensor_i32(ctx0, 1, ne3, 0, ne2[1]);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
@ -1075,7 +1309,7 @@ int main(int argc, const char ** argv) {
const int nargs = 1; const int nargs = 1;
const int ndims = 2; const int ndims = 2;
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
int n_past = irand(ne[0]); int n_past = irand(ne[0]);
@ -1090,7 +1324,7 @@ int main(int argc, const char ** argv) {
const int nargs = 1; const int nargs = 1;
const int ndims = 2; const int ndims = 2;
x[0] = get_random_tensor(ctx0, ndims, ne, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
int n_past = irand(ne[0]); int n_past = irand(ne[0]);
@ -1108,7 +1342,7 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 4); get_random_dims(ne2, 4);
for (int ndims = 1; ndims <= 3; ++ndims) { for (int ndims = 1; ndims <= 3; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0]));
@ -1125,8 +1359,8 @@ int main(int argc, const char ** argv) {
get_random_dims(ne2, 4); get_random_dims(ne2, 4);
for (int ndims = 1; ndims <= 3; ++ndims) { for (int ndims = 1; ndims <= 3; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor(ctx0, ndims, ne2, 0.0f, 1.0f); x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1])); struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
@ -1136,7 +1370,7 @@ int main(int argc, const char ** argv) {
} }
} }
// rope // rope f32
{ {
const int nargs = 1; const int nargs = 1;
@ -1148,7 +1382,7 @@ int main(int argc, const char ** argv) {
for (int ndims = 3; ndims <= 4; ++ndims) { for (int ndims = 3; ndims <= 4; ++ndims) {
for (int mode = 0; mode < 4; ++mode) { for (int mode = 0; mode < 4; ++mode) {
for (int n_past = 1; n_past < ne2[2]; ++n_past) { for (int n_past = 1; n_past < ne2[2]; ++n_past) {
x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f); x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
@ -1163,14 +1397,48 @@ int main(int argc, const char ** argv) {
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0)); struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
GGML_PRINT_DEBUG("rope: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
check_gradient("rope", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY); check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
} }
} }
} }
} }
// flash_attn // rope f16
{
const int nargs = 1;
int64_t ne2[4];
get_random_dims(ne2, 4);
ne2[0] += ne2[0] % 2;
int n_rot = ne2[0];
for (int ndims = 3; ndims <= 4; ++ndims) {
for (int mode = 0; mode < 4; ++mode) {
for (int n_past = 1; n_past < ne2[2]; ++n_past) {
x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
const bool skip_past = (mode & 1);
if (skip_past) {
// we have no past, so this would have to work on uninitialized memory.
// we only test the gradients here;
// skip_past should have no influence on gradient computation.
// so when other modes work, we assume that this does as well.
continue;
}
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY);
}
}
}
}
// flash_attn f32
{ {
const int nargs = 3; const int nargs = 3;
@ -1196,16 +1464,57 @@ int main(int argc, const char ** argv) {
nek[3] = 1; nek[3] = 1;
nev[3] = 1; nev[3] = 1;
} }
x[0] = get_random_tensor(ctx0, ndims, neq, -0.1250f, 0.1250f); x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
x[1] = get_random_tensor(ctx0, ndims, nek, -0.1250f, 0.1250f); x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
x[2] = get_random_tensor(ctx0, ndims, nev, -0.1250f, 0.1250f); x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
ggml_set_param(ctx0, x[0]); ggml_set_param(ctx0, x[0]);
ggml_set_param(ctx0, x[1]); ggml_set_param(ctx0, x[1]);
ggml_set_param(ctx0, x[2]); ggml_set_param(ctx0, x[2]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
check_gradient("flash_attn", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f); check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
}
}
}
// flash_attn f16, not yet fully implemented
if(0)
{
const int nargs = 3;
int64_t ne2[4];
get_random_dims(ne2, 4);
int64_t D = ne2[0];
int64_t N = ne2[1];
int64_t M = ne2[2] + N;
int64_t B = ne2[3];
for (int masked = 0; masked <= 1; ++masked) {
for (int ndims = 2; ndims <= 4; ++ndims) {
int64_t neq[4] = { D, N, B, ne[3] };
int64_t nek[4] = { D, M, B, ne[3] };
int64_t nev[4] = { M, D, B, ne[3] };
if (ndims == 2) {
neq[2] = 1; neq[3] = 1;
nek[2] = 1; nek[3] = 1;
nev[2] = 1; nev[3] = 1;
} else if (ndims == 3) {
neq[3] = 1;
nek[3] = 1;
nev[3] = 1;
}
x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
ggml_set_param(ctx0, x[0]);
ggml_set_param(ctx0, x[1]);
ggml_set_param(ctx0, x[2]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
} }
} }
} }

View file

@ -125,9 +125,9 @@ int main(void) {
}; };
struct ggml_context * ctx = ggml_init(params); struct ggml_context * ctx = ggml_init(params);
int64_t ne1[4] = {4, 1024, 1, 1}; int64_t ne1[4] = {4, 128, 1, 1};
int64_t ne2[4] = {4, 2048, 1, 1};; int64_t ne2[4] = {4, 256, 1, 1};;
int64_t ne3[4] = {1024, 2048, 1, 1}; int64_t ne3[4] = {128, 256, 1, 1};
struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1); struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1); struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);