diff --git a/.devops/lamma-cpp-clblast.srpm.spec b/.devops/llama-cpp-clblast.srpm.spec similarity index 56% rename from .devops/lamma-cpp-clblast.srpm.spec rename to .devops/llama-cpp-clblast.srpm.spec index 739c68281..076f29695 100644 --- a/.devops/lamma-cpp-clblast.srpm.spec +++ b/.devops/llama-cpp-clblast.srpm.spec @@ -13,12 +13,13 @@ # It is up to the user to install the correct vendor-specific support. Name: llama.cpp-clblast -Version: master +Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} -Summary: OpenCL Inference of LLaMA model in pure C/C++ +Summary: OpenCL Inference of LLaMA model in C/C++ License: MIT Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz -BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel +BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel +Requires: clblast URL: https://github.com/ggerganov/llama.cpp %define debug_package %{nil} @@ -35,18 +36,43 @@ make -j LLAMA_CLBLAST=1 %install mkdir -p %{buildroot}%{_bindir}/ -cp -p main %{buildroot}%{_bindir}/llamacppclblast -cp -p server %{buildroot}%{_bindir}/llamacppclblastserver -cp -p simple %{buildroot}%{_bindir}/llamacppclblastsimple +cp -p main %{buildroot}%{_bindir}/llamaclblast +cp -p server %{buildroot}%{_bindir}/llamaclblastserver +cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple + +mkdir -p %{buildroot}/usr/lib/systemd/system +%{__cat} < %{buildroot}/usr/lib/systemd/system/llamaclblast.service +[Unit] +Description=Llama.cpp server, CPU only (no GPU support in this build). +After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target + +[Service] +Type=simple +EnvironmentFile=/etc/sysconfig/llama +ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS +ExecReload=/bin/kill -s HUP $MAINPID +Restart=never + +[Install] +WantedBy=default.target +EOF + +mkdir -p %{buildroot}/etc/sysconfig +%{__cat} < %{buildroot}/etc/sysconfig/llama +LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin" +EOF %clean rm -rf %{buildroot} rm -rf %{_builddir}/* %files -%{_bindir}/llamacppclblast -%{_bindir}/llamacppclblastserver -%{_bindir}/llamacppclblastsimple +%{_bindir}/llamaclblast +%{_bindir}/llamaclblastserver +%{_bindir}/llamaclblastsimple +/usr/lib/systemd/system/llamaclblast.service +%config /etc/sysconfig/llama + %pre diff --git a/.devops/lamma-cpp-cublas.srpm.spec b/.devops/llama-cpp-cublas.srpm.spec similarity index 71% rename from .devops/lamma-cpp-cublas.srpm.spec rename to .devops/llama-cpp-cublas.srpm.spec index 75d32fbe7..f847ebb1e 100644 --- a/.devops/lamma-cpp-cublas.srpm.spec +++ b/.devops/llama-cpp-cublas.srpm.spec @@ -13,7 +13,7 @@ # It is up to the user to install the correct vendor-specific support. Name: llama.cpp-cublas -Version: master +Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) License: MIT @@ -40,6 +40,28 @@ cp -p main %{buildroot}%{_bindir}/llamacppcublas cp -p server %{buildroot}%{_bindir}/llamacppcublasserver cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple +mkdir -p %{buildroot}/usr/lib/systemd/system +%{__cat} < %{buildroot}/usr/lib/systemd/system/llamacublas.service +[Unit] +Description=Llama.cpp server, CPU only (no GPU support in this build). 
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target + +[Service] +Type=simple +EnvironmentFile=/etc/sysconfig/llama +ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS +ExecReload=/bin/kill -s HUP $MAINPID +Restart=never + +[Install] +WantedBy=default.target +EOF + +mkdir -p %{buildroot}/etc/sysconfig +%{__cat} < %{buildroot}/etc/sysconfig/llama +LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin" +EOF + %clean rm -rf %{buildroot} rm -rf %{_builddir}/* @@ -48,6 +70,8 @@ rm -rf %{_builddir}/* %{_bindir}/llamacppcublas %{_bindir}/llamacppcublasserver %{_bindir}/llamacppcublassimple +/usr/lib/systemd/system/llamacublas.service +%config /etc/sysconfig/llama %pre diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec index c65251a5a..446213d69 100644 --- a/.devops/llama-cpp.srpm.spec +++ b/.devops/llama-cpp.srpm.spec @@ -6,6 +6,7 @@ # Notes for llama.cpp: # 1. Tags are currently based on hash - which will not sort asciibetically. # We need to declare standard versioning if people want to sort latest releases. +# In the meantime, YYYYMMDD format will be used. # 2. Builds for CUDA/OpenCL support are separate, with different depenedencies. # 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed. # Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo @@ -13,12 +14,13 @@ # It is up to the user to install the correct vendor-specific support. Name: llama.cpp -Version: master +Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) License: MIT Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz -BuildRequires: coreutils make gcc-c++ git +BuildRequires: coreutils make gcc-c++ git libstdc++-devel +Requires: libstdc++ URL: https://github.com/ggerganov/llama.cpp %define debug_package %{nil} @@ -26,27 +28,52 @@ URL: https://github.com/ggerganov/llama.cpp %description CPU inference for Meta's Lllama2 models using default options. +Models are not included in this package and must be downloaded separately. %prep -%autosetup +%setup -n llama.cpp-master %build make -j %install mkdir -p %{buildroot}%{_bindir}/ -cp -p main %{buildroot}%{_bindir}/llamacpp -cp -p server %{buildroot}%{_bindir}/llamacppserver -cp -p simple %{buildroot}%{_bindir}/llamacppsimple +cp -p main %{buildroot}%{_bindir}/llama +cp -p server %{buildroot}%{_bindir}/llamaserver +cp -p simple %{buildroot}%{_bindir}/llamasimple + +mkdir -p %{buildroot}/usr/lib/systemd/system +%{__cat} < %{buildroot}/usr/lib/systemd/system/llama.service +[Unit] +Description=Llama.cpp server, CPU only (no GPU support in this build). 
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target + +[Service] +Type=simple +EnvironmentFile=/etc/sysconfig/llama +ExecStart=/usr/bin/llamaserver $LLAMA_ARGS +ExecReload=/bin/kill -s HUP $MAINPID +Restart=never + +[Install] +WantedBy=default.target +EOF + +mkdir -p %{buildroot}/etc/sysconfig +%{__cat} < %{buildroot}/etc/sysconfig/llama +LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin" +EOF %clean rm -rf %{buildroot} rm -rf %{_builddir}/* %files -%{_bindir}/llamacpp -%{_bindir}/llamacppserver -%{_bindir}/llamacppsimple +%{_bindir}/llama +%{_bindir}/llamaserver +%{_bindir}/llamasimple +/usr/lib/systemd/system/llama.service +%config /etc/sysconfig/llama %pre diff --git a/.gitignore b/.gitignore index e5faab774..7a3f3fff4 100644 --- a/.gitignore +++ b/.gitignore @@ -63,10 +63,13 @@ poetry.toml # Test binaries tests/test-grammar-parser +tests/test-llama-grammar tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling -tests/test-tokenizer-0 +tests/test-tokenizer-0-llama +tests/test-tokenizer-0-falcon +tests/test-tokenizer-1 diff --git a/CMakeLists.txt b/CMakeLists.txt index ba008bcc6..1eae2d670 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -301,7 +301,7 @@ if (LLAMA_METAL) set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h) add_compile_definitions(GGML_USE_METAL) - add_compile_definitions(GGML_METAL_NDEBUG) + #add_compile_definitions(GGML_METAL_NDEBUG) # get full path to the file #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") diff --git a/Makefile b/Makefile index a3400e491..02ba3e36d 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf llama-bench +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam_search # Binaries only useful for tests -TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0 +TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1 default: $(BUILD_TARGETS) @@ -305,7 +305,7 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h endif # LLAMA_HIPBLAS ifdef LLAMA_METAL - CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG + CFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG CXXFLAGS += -DGGML_USE_METAL LDFLAGS += -framework Foundation -framework Metal -framework MetalKit OBJS += ggml-metal.o @@ -356,7 +356,7 @@ OBJS += ggml-alloc.o llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ -common.o: common/common.cpp common/common.h +common.o: common/common.cpp common/common.h build-info.h $(CXX) $(CXXFLAGS) -c $< -o $@ console.o: common/console.cpp common/console.h @@ -369,7 +369,7 @@ libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) clean: - rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state 
server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf llama-bench build-info.h $(TEST_TARGETS) + rm -vf *.o *.so *.dll benchmark-matmult build-info.h $(BUILD_TARGETS) $(TEST_TARGETS) # # Examples @@ -409,18 +409,33 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput -gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS) +gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o common.o $(OBJS) +train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS) +convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + +beam_search: examples/beam_search/beam_search.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + +ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))' +BUILD_TARGETS += metal +endif + +ifdef LLAMA_METAL +metal: examples/metal/metal.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +endif + build-info.h: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh > $@.tmp @if ! 
cmp -s $@.tmp $@; then \ @@ -442,29 +457,35 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) +tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) +tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-0: tests/test-tokenizer-0.cpp build-info.h ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS) +tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + +tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + +tests/test-tokenizer-1: tests/test-tokenizer-1.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/README.md b/README.md index 95471fdbb..a880fd29f 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,10 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ### Hot topics +- #### IMPORTANT: Tokenizer fixes and API change (developers and projects using `llama.cpp` built-in tokenization must read): https://github.com/ggerganov/llama.cpp/pull/2810 + +- GGUFv2 adds support for 64-bit sizes + backwards compatible: https://github.com/ggerganov/llama.cpp/pull/2821 + - Added support for Falcon models: https://github.com/ggerganov/llama.cpp/pull/2717 - A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398) @@ -109,6 +113,7 @@ as the main playground for developing new 
features for the [ggml](https://github - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) +- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) **UI:** @@ -724,8 +729,6 @@ python3 convert.py pygmalion-7b/ --outtype q4_1 - [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML) - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML) - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML) -- Specify `-eps 1e-5` for best generation quality -- Specify `-gqa 8` for 70B models to work ### Verifying the model files diff --git a/ci/run.sh b/ci/run.sh index e1486e7c1..942b2e00c 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -196,17 +196,17 @@ function gg_run_open_llama_3b_v2 { (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time 
./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log function check_ppl { qnt="$1" @@ -233,6 +233,48 @@ function gg_run_open_llama_3b_v2 { check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + # lora + function compare_ppl { + qnt="$1" + ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + + if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then + printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2" + return 20 + fi + + printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2" + return 0 + } + + path_lora="../models-mnt/open-llama/3B-v2/lora" + path_shakespeare="../models-mnt/shakespeare" + + shakespeare="${path_shakespeare}/shakespeare.txt" + lora_shakespeare="${path_lora}/ggml-adapter-model.bin" + + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin + gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt + + python3 ../convert-lora-to-ggml.py ${path_lora} + + # f16 + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log + compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + + # q8_0 + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log + compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + + # q8_0 + f16 lora-base + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log + compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + + set +e } @@ -242,6 +284,7 @@ function gg_sum_open_llama_3b_v2 { gg_printf 'OpenLLaMA 3B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: 
\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" @@ -253,6 +296,11 @@ function gg_sum_open_llama_3b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" + gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" + gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" + gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)" + gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)" } # open_llama_7b_v2 @@ -310,17 +358,17 @@ function gg_run_open_llama_7b_v2 { ./bin/quantize ${model_f16} ${model_q5_k} q5_k ./bin/quantize ${model_f16} ${model_q6_k} q6_k - (time ./bin/main --model ${model_f16} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/main --model ${model_q8_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/main --model ${model_q4_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/main --model ${model_q4_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/main --model ${model_q5_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/main --model ${model_q5_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/main --model ${model_q2_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/main --model ${model_q3_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/main --model ${model_q4_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/main --model ${model_q5_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/main --model ${model_q6_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 
--ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log @@ -359,6 +407,48 @@ function gg_run_open_llama_7b_v2 { check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + # lora + function compare_ppl { + qnt="$1" + ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1) + + if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then + printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2" + return 20 + fi + + printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2" + return 0 + } + + path_lora="../models-mnt/open-llama/7B-v2/lora" + path_shakespeare="../models-mnt/shakespeare" + + shakespeare="${path_shakespeare}/shakespeare.txt" + lora_shakespeare="${path_lora}/ggml-adapter-model.bin" + + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin + gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt + + python3 ../convert-lora-to-ggml.py ${path_lora} + + # f16 + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log + compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + + # currently not supported by the CUDA backend + # q8_0 + #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log + #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log + #compare_ppl "q8_0 shakespeare" 
"$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + + # q8_0 + f16 lora-base + #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log + #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + set +e } @@ -368,6 +458,7 @@ function gg_sum_open_llama_7b_v2 { gg_printf 'OpenLLaMA 7B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)" @@ -379,6 +470,11 @@ function gg_sum_open_llama_7b_v2 { gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)" gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)" gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)" + gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)" + gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)" + #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)" + #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)" + #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)" } ## main diff --git a/common/common.cpp b/common/common.cpp index ff19ec4e5..90fe2e84e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1,15 +1,21 @@ #include "common.h" +#include "build-info.h" +#include "llama.h" -#include -#include -#include -#include -#include -#include #include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -19,11 +25,14 @@ #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN #define NOMINMAX +#include +#include #include #include #include #else #include +#include #include #endif @@ -93,7 +102,6 @@ void process_escapes(std::string& input) { bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; - bool escape_prompt = false; std::string arg; gpt_params default_params; const std::string arg_prefix = "--"; @@ -125,8 +133,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.prompt = argv[i]; - } else if (arg == "-e") { - escape_prompt = true; + } else if (arg == "-e" || arg == "--escape") { + params.escape = true; } else if (arg == "--prompt-cache") { if (++i >= argc) { invalid_param = true; @@ -415,6 +423,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.antiprompt.push_back(argv[i]); + } else if (arg == "-ld" || arg == "--logdir") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.logdir = argv[i]; + + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; + } } else if (arg == "--perplexity") { 
params.perplexity = true; } else if (arg == "--ppl-stride") { @@ -520,7 +538,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { exit(1); } - if (escape_prompt) { + if (params.escape) { process_escapes(params.prompt); process_escapes(params.input_prefix); process_escapes(params.input_suffix); @@ -546,7 +564,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stdout, " -p PROMPT, --prompt PROMPT\n"); fprintf(stdout, " prompt to start generation with (default: empty)\n"); - fprintf(stdout, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); + fprintf(stdout, " -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); fprintf(stdout, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); fprintf(stdout, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); fprintf(stdout, " not supported with --interactive or other interactive options\n"); @@ -627,6 +645,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); fprintf(stdout, " -m FNAME, --model FNAME\n"); fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); + fprintf(stdout, " -ld LOGDIR, --logdir LOGDIR\n"); + fprintf(stdout, " path under which to save YAML logs (no logging if unset)\n"); fprintf(stdout, "\n"); } @@ -733,12 +753,12 @@ std::vector llama_tokenize( return result; } -std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { +std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); - const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_str(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(ctx, token, result.data(), result.size()); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -746,3 +766,322 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok return std::string(result.data(), result.size()); } + +std::string llama_detokenize_spm(llama_context * ctx, const std::vector & tokens) { + const llama_token bos_id = llama_token_bos(ctx); + + std::string piece; + std::string result; + + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + // remove the leading space of the first non-BOS token + if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') { + piece = piece.substr(1); + } + + result += piece; + } + + return result; +} + +std::string llama_detokenize_bpe(llama_context * ctx, const std::vector & tokens) { + std::string piece; + std::string result; + + for (size_t i = 0; i < tokens.size(); ++i) { + piece = llama_token_to_piece(ctx, tokens[i]); + + result += piece; + } + + return result; +} + +// returns true if successful, false otherwise +bool create_directory_with_parents(const std::string & path) { +#ifdef _WIN32 + std::wstring_convert> converter; + std::wstring wpath = converter.from_bytes(path); + + // if the path already exists, check 
whether it's a directory + const DWORD attributes = GetFileAttributesW(wpath.c_str()); + if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) { + return true; + } + + size_t pos_slash = 0; + + // process path from front to back, procedurally creating directories + while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) { + const std::wstring subpath = wpath.substr(0, pos_slash); + const wchar_t * test = subpath.c_str(); + + const bool success = CreateDirectoryW(test, NULL); + if (!success) { + const DWORD error = GetLastError(); + + // if the path already exists, ensure that it's a directory + if (error == ERROR_ALREADY_EXISTS) { + const DWORD attributes = GetFileAttributesW(subpath.c_str()); + if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) { + return false; + } + } else { + return false; + } + } + + pos_slash += 1; + } + + return true; +#else + // if the path already exists, check whether it's a directory + struct stat info; + if (stat(path.c_str(), &info) == 0) { + return S_ISDIR(info.st_mode); + } + + size_t pos_slash = 1; // skip leading slashes for directory creation + + // process path from front to back, procedurally creating directories + while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) { + const std::string subpath = path.substr(0, pos_slash); + struct stat info; + + // if the path already exists, ensure that it's a directory + if (stat(subpath.c_str(), &info) == 0) { + if (!S_ISDIR(info.st_mode)) { + return false; + } + } else { + // create parent directories + const int ret = mkdir(subpath.c_str(), 0755); + if (ret != 0) { + return false; + } + } + + pos_slash += 1; + } + + return true; +#endif // _WIN32 +} + +void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector & data) { + if (data.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + fprintf(stream, "%s: [", prop_name); + for (size_t i = 0; i < data.size() - 1; ++i) { + fprintf(stream, "%e, ", data[i]); + } + fprintf(stream, "%e]\n", data.back()); +} + +void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector & data) { + if (data.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + fprintf(stream, "%s: [", prop_name); + for (size_t i = 0; i < data.size() - 1; ++i) { + fprintf(stream, "%d, ", data[i]); + } + fprintf(stream, "%d]\n", data.back()); +} + +void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) { + std::string data_str(data == NULL ? 
"" : data); + + if (data_str.empty()) { + fprintf(stream, "%s:\n", prop_name); + return; + } + + size_t pos_start = 0; + size_t pos_found = 0; + + if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) { + data_str = std::regex_replace(data_str, std::regex("\n"), "\\n"); + data_str = std::regex_replace(data_str, std::regex("\""), "\\\""); + data_str = "\"" + data_str + "\""; + fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); + return; + } + + if (data_str.find('\n') == std::string::npos) { + fprintf(stream, "%s: %s\n", prop_name, data_str.c_str()); + return; + } + + fprintf(stream, "%s: |\n", prop_name); + while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) { + fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str()); + pos_start = pos_found + 1; + } +} + +std::string get_sortable_timestamp() { + using clock = std::chrono::system_clock; + + const clock::time_point current_time = clock::now(); + const time_t as_time_t = clock::to_time_t(current_time); + char timestamp_no_ns[100]; + std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t)); + + const int64_t ns = std::chrono::duration_cast( + current_time.time_since_epoch() % 1000000000).count(); + char timestamp_ns[11]; + snprintf(timestamp_ns, 11, "%09" PRId64, ns); + + return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); +} + +void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx, + const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { + fprintf(stream, "build_commit: %s\n", BUILD_COMMIT); + fprintf(stream, "build_number: %d\n", BUILD_NUMBER); + fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false"); + fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); + fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); + fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); + fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); + fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); + fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); + fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); + fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); + fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); + fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); + fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); + fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? 
"true" : "false"); + +#ifdef NDEBUG + fprintf(stream, "debug: false\n"); +#else + fprintf(stream, "debug: true\n"); +#endif // NDEBUG + + fprintf(stream, "model_desc: %s\n", model_desc); + fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx)); + +#ifdef __OPTIMIZE__ + fprintf(stream, "optimize: true\n"); +#else + fprintf(stream, "optimize: false\n"); +#endif // __OPTIMIZE__ + + fprintf(stream, "time: %s\n", timestamp.c_str()); + + fprintf(stream, "\n"); + fprintf(stream, "###############\n"); + fprintf(stream, "# User Inputs #\n"); + fprintf(stream, "###############\n"); + fprintf(stream, "\n"); + + fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str()); + fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch); + dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str()); + fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale); + fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); + fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); + fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); + fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); + fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false"); + fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); + fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty); + dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); + fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); + fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); + fprintf(stream, "hellaswag_tasks: %ld # default: 400\n", params.hellaswag_tasks); + + const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx)); + const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY; + fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); + + dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str()); + fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); + dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str()); + fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false"); + fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); + fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); + fprintf(stream, "keep: %d # default: 0\n", params.n_keep); + fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); + + fprintf(stream, "logit_bias:\n"); + for (std::pair lb : params.logit_bias) { + if (ignore_eos && lb.first == logit_bias_eos->first) { + continue; + } + fprintf(stream, " %d: %f", lb.first, lb.second); + } + + fprintf(stream, "lora: %s\n", params.lora_adapter.c_str()); + fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); + fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false"); + fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); + fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? 
"true" : "false"); + fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat); + fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau); + fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta); + fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); + fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str()); + fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false"); + fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); + fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers); + fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); + fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs); + fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); + fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false"); + fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false"); + fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false"); + fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); + fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); + fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty); + dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str()); + fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); + fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); + fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); + dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens); + fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false"); + fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty); + + fprintf(stream, "reverse_prompt:\n"); + for (std::string ap : params.antiprompt) { + size_t pos = 0; + while ((pos = ap.find('\n', pos)) != std::string::npos) { + ap.replace(pos, 1, "\\n"); + pos += 1; + } + + fprintf(stream, " - %s\n", ap.c_str()); + } + + fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); + fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); + fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed); + fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); + fprintf(stream, "temp: %f # default: 0.8\n", params.temp); + + const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES); + dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); + + fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z); + fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "top_k: %d # default: 40\n", params.top_k); + fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p); + fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p); + fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? 
"true" : "false"); +} diff --git a/common/common.h b/common/common.h index ce61265f8..c15373144 100644 --- a/common/common.h +++ b/common/common.h @@ -11,6 +11,12 @@ #include #include +#ifdef _WIN32 +#define DIRECTORY_SEPARATOR '\\' +#else +#define DIRECTORY_SEPARATOR '/' +#endif // _WIN32 + // // CLI argument parsing // @@ -61,6 +67,7 @@ struct gpt_params { std::string input_suffix = ""; // string to suffix user inputs with std::string grammar = ""; // optional BNF-like grammar to constrain sampling std::vector antiprompt; // string upon seeing which more user input is prompted + std::string logdir = ""; // directory in which to save YAML log files std::string lora_adapter = ""; // lora adapter path std::string lora_base = ""; // base model path for the lora adapter @@ -82,6 +89,7 @@ struct gpt_params { bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it bool embedding = false; // get only sentence embedding + bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\" bool interactive_first = false; // wait for user input immediately bool multiline_input = false; // reverse the usage of `\` bool simple_io = false; // improves compatibility with subprocesses and limited consoles @@ -116,11 +124,41 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param // Vocab utils // +// tokenizes a string into a vector of tokens +// should work similar to Python's `tokenizer.encode` std::vector llama_tokenize( struct llama_context * ctx, const std::string & text, bool add_bos); -std::string llama_token_to_str( +// tokenizes a token into a piece +// should work similar to Python's `tokenizer.id_to_piece` +std::string llama_token_to_piece( const struct llama_context * ctx, llama_token token); + +// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function +// that takes into account the tokenizer type and decides how to handle the leading space +// +// detokenizes a vector of tokens into a string +// should work similar to Python's `tokenizer.decode` +// removes the leading space from the first non-BOS token +std::string llama_detokenize_spm( + llama_context * ctx, + const std::vector & tokens); + +// detokenizes a vector of tokens into a string +// should work similar to Python's `tokenizer.decode` +std::string llama_detokenize_bpe( + llama_context * ctx, + const std::vector & tokens); + +bool create_directory_with_parents(const std::string & path); +void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector & data); +void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector & data); +void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data); +std::string get_sortable_timestamp(); + +void dump_non_result_info_yaml( + FILE * stream, const gpt_params & params, const llama_context * lctx, + const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py index 411cbf682..168bcf17f 100755 --- a/convert-falcon-hf-to-gguf.py +++ b/convert-falcon-hf-to-gguf.py @@ -48,7 +48,7 @@ def count_model_parts(dir_model: str) -> int: if len(sys.argv) < 3: - print("Usage: convert-h5-to-ggml.py dir-model ftype\n") + print(f"Usage: python {sys.argv[0]} dir-model ftype\n") print(" ftype == 0 -> float32") print(" ftype == 1 -> float16") sys.exit(1) diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py index 
6eeff5bb1..d9c42d76b 100755 --- a/convert-gptneox-hf-to-gguf.py +++ b/convert-gptneox-hf-to-gguf.py @@ -50,7 +50,7 @@ def count_model_parts(dir_model: str) -> int: if len(sys.argv) < 3: - print("Usage: convert-h5-to-ggml.py dir-model ftype\n") + print(f"Usage: python {sys.argv[0]} dir-model ftype\n") print(" ftype == 0 -> float32") print(" ftype == 1 -> float16") sys.exit(1) diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py index f103f5f61..2ab082383 100755 --- a/convert-llama-7b-pth-to-gguf.py +++ b/convert-llama-7b-pth-to-gguf.py @@ -32,7 +32,7 @@ def count_model_parts(dir_model: str) -> int: if len(sys.argv) < 3: - print("Usage: convert-h5-to-ggml.py dir-model ftype\n") + print(f"Usage: python {sys.argv[0]} dir-model ftype\n") print(" ftype == 0 -> float32") print(" ftype == 1 -> float16") diff --git a/convert-llama-hf-to-gguf.py b/convert-llama-hf-to-gguf.py index 08fde238b..b00810dbb 100755 --- a/convert-llama-hf-to-gguf.py +++ b/convert-llama-hf-to-gguf.py @@ -44,7 +44,7 @@ def count_model_parts(dir_model: str) -> int: if len(sys.argv) < 3: - print("Usage: convert-h5-to-ggml.py dir-model ftype\n") + print(f"Usage: python {sys.argv[0]} dir-model ftype\n") print(" ftype == 0 -> float32") print(" ftype == 1 -> float16") diff --git a/convert.py b/convert.py index 4f3e92798..3f0a1c932 100755 --- a/convert.py +++ b/convert.py @@ -3,6 +3,7 @@ import gguf import argparse import concurrent.futures +from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor import copy import enum import faulthandler @@ -17,13 +18,14 @@ import re import signal import struct import sys +import time import zipfile import numpy as np from abc import ABCMeta, abstractmethod from dataclasses import dataclass from pathlib import Path -from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Literal, Optional, Sequence, Tuple, TypeVar, Union) +from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, TypeVar, Union) from sentencepiece import SentencePieceProcessor # type: ignore if TYPE_CHECKING: @@ -37,30 +39,70 @@ NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]' ARCH=gguf.MODEL_ARCH.LLAMA NAMES=gguf.MODEL_TENSOR_NAMES[ARCH] +DEFAULT_CONCURRENCY = 8 # # data types # @dataclass(frozen=True) -class UnquantizedDataType: +class DataType: name: str + dtype: 'np.dtype[Any]' + valid_conversions: List[str] -DT_F16 = UnquantizedDataType('F16') -DT_F32 = UnquantizedDataType('F32') -DT_I32 = UnquantizedDataType('I32') -DT_BF16 = UnquantizedDataType('BF16') + def elements_to_bytes(self, n_elements: int) -> int: + return n_elements * self.dtype.itemsize -DataType = Union[UnquantizedDataType] +@dataclass(frozen=True) +class UnquantizedDataType(DataType): + pass -DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = { - DT_BF16: np.dtype(np.uint16), - DT_F16: np.dtype(np.float16), - DT_F32: np.dtype(np.float32), - DT_I32: np.dtype(np.int32), -} +DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0']) +DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0']) +DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = []) +DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0']) -NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = \ - {dtype: data_type for (data_type, dtype) in DATA_TYPE_TO_NUMPY.items()} +@dataclass(frozen=True) 
+class QuantizedDataType(DataType): + block_size: int + quantized_dtype: 'np.dtype[Any]' + ggml_type: gguf.GGMLQuantizationType + + def quantize(self, arr: NDArray) -> NDArray: + raise NotImplementedError(f'Quantization for {self.name} not implemented') + + def elements_to_bytes(self, n_elements: int) -> int: + assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}' + return self.quantized_dtype.itemsize * (n_elements // self.block_size) + +@dataclass(frozen=True) +class Q8_0QuantizedDataType(QuantizedDataType): + # Mini Q8_0 quantization in Python! + def quantize(self, arr: NDArray) -> NDArray: + assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}' + assert arr.dtype == np.float32, f'Bad array type {arr.dtype}' + n_blocks = arr.size // self.block_size + blocks = arr.reshape((n_blocks, self.block_size)) + # Much faster implementation of block quantization contributed by @Cebtenzzre + def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[Tuple[Any, Any]]: + d = abs(blocks).max(axis = 1) / np.float32(127) + with np.errstate(divide = 'ignore'): + qs = (blocks / d[:, None]).round() + qs[d == 0] = 0 + yield from zip(d, qs) + return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype) + +DT_Q8_0 = Q8_0QuantizedDataType('Q8_0', + dtype = np.dtype(np.float32), valid_conversions = [], + ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32, + quantized_dtype = np.dtype([('d', ' DataType: - if len(tensor.shape) == 1: - # 1D tensors are always F32. - return DT_F32 - elif self == GGMLFileType.AllF32: - return DT_F32 - elif self == GGMLFileType.MostlyF16: - return DT_F16 - else: + dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self) + if dt is None: raise ValueError(self) + # 1D tensors are always F32. 
+ return dt if len(tensor.shape) > 1 else DT_F32 +GGML_FILE_TYPE_TO_DATA_TYPE: Dict[GGMLFileType, DataType] = { + GGMLFileType.AllF32 : DT_F32, + GGMLFileType.MostlyF16 : DT_F16, + GGMLFileType.MostlyQ8_0: DT_Q8_0, +} # # hparams loading @@ -170,7 +214,8 @@ class Params: f_norm_eps = config["rms_norm_eps"] f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None - if "rope_scaling" in config and config["rope_scaling"].get("type") == "linear": + rope_scaling = config.get("rope_scaling") + if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear": f_rope_scale = config["rope_scaling"].get("factor") else: f_rope_scale = None @@ -414,7 +459,7 @@ class UnquantizedTensor(Tensor): self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype] def astype(self, data_type: DataType) -> Tensor: - dtype = DATA_TYPE_TO_NUMPY[data_type] + dtype = data_type.dtype if self.data_type == DT_BF16: self.ndarray = bf16_to_fp32(self.ndarray) return UnquantizedTensor(self.ndarray.astype(dtype)) @@ -424,7 +469,7 @@ class UnquantizedTensor(Tensor): def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': r = self.ndarray.shape[0] // 3 - return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head)) + return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head)) def part(self, n_part: int) -> 'UnquantizedTensor': r = self.ndarray.shape[0] // 3 @@ -453,22 +498,6 @@ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, conv GGMLCompatibleTensor = Union[UnquantizedTensor] -class DeferredPermutedTensor(Tensor): - def __init__(self, base: Tensor, n_head: int, n_head_kv: int) -> None: - self.base = base - self.n_head = n_head - self.data_type = self.base.data_type - - def astype(self, data_type: DataType) -> Tensor: - return self.base.astype(data_type).permute(self.n_head, self.n_head_kv) - - def to_ggml(self) -> GGMLCompatibleTensor: - return self.base.to_ggml().permute(self.n_head, self.n_head_kv) - - def permute(self, n_head: int, n_head_kv: int) -> Tensor: - raise Exception("shouldn't permute twice") - - @dataclass class LazyTensor: _load: Callable[[], Tensor] @@ -478,7 +507,9 @@ class LazyTensor: def load(self) -> Tensor: ret = self._load() - assert ret.data_type == self.data_type, (self.data_type, ret.data_type, self.description) + # Should be okay if it maps to the same numpy type? 
+ assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \ + (self.data_type, ret.data_type, self.description) return ret def astype(self, data_type: DataType) -> 'LazyTensor': @@ -489,8 +520,8 @@ class LazyTensor: return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}') def validate_conversion_to(self, data_type: DataType) -> None: - if data_type == self.data_type: - return + if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions: + raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.') LazyModel = Dict[str, LazyTensor] @@ -616,9 +647,7 @@ class LazyUnpickler(pickle.Unpickler): info = self.zip_file.getinfo(filename) def load(offset: int, elm_count: int) -> NDArray: - dtype = DATA_TYPE_TO_NUMPY.get(data_type) - if dtype is None: - raise Exception("tensor stored in unsupported format") + dtype = data_type.dtype fp = self.zip_file.open(info) fp.seek(offset * dtype.itemsize) size = elm_count * dtype.itemsize @@ -682,7 +711,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: def convert(info: Dict[str, Any]) -> LazyTensor: data_type = SAFETENSORS_DATA_TYPES[info['dtype']] - numpy_dtype = DATA_TYPE_TO_NUMPY[data_type] + numpy_dtype = data_type.dtype shape: List[int] = info['shape'] begin, end = info['data_offsets'] assert 0 <= begin <= end <= len(byte_buf) @@ -722,23 +751,35 @@ def lazy_load_file(path: Path) -> ModelPlus: In = TypeVar('In') Out = TypeVar('Out') -def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]: +def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, factory: Callable = ThreadPoolExecutor) -> Iterable[Out]: '''Parallel map, but with backpressure. If the caller doesn't call `next` fast enough, this will stop calling `func` at some point rather than letting results pile up in memory. Specifically, there is a max of one output value buffered per thread.''' - with concurrent.futures.ThreadPoolExecutor() as executor: + if concurrency < 2: + yield from map(func, iterable) + # Not reached. 
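(Illustration, not part of the patch.) The `bounded_parallel_map` rework in this hunk pulls work lazily from an iterator and keeps at most `concurrency` futures in flight; for `--outtype q8_0` it is handed a `ProcessPoolExecutor`, presumably so the pure-Python quantization is not serialized by the interpreter lock. A standalone toy of the same backpressure pattern:

```python
# Toy of the backpressure pattern: at most `concurrency` tasks are pending at any
# time, and the pool is only topped up after a result has been consumed, so
# finished outputs cannot pile up faster than the caller drains them.
import time
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Callable, Iterable, Iterator, List, TypeVar

In = TypeVar('In')
Out = TypeVar('Out')

def backpressure_map(func: Callable[[In], Out], items: Iterable[In], concurrency: int) -> Iterator[Out]:
    it = iter(items)
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        futures: List[Future] = []
        done = False
        while not done and len(futures) < concurrency:
            try:
                futures.append(pool.submit(func, next(it)))
            except StopIteration:
                done = True
        while futures:
            yield futures.pop(0).result()       # consumer sets the pace
            if not done:
                try:
                    futures.append(pool.submit(func, next(it)))
                except StopIteration:
                    done = True

def slow_square(x: int) -> int:
    time.sleep(0.05)                            # stand-in for loading one tensor
    return x * x

print(list(backpressure_map(slow_square, range(8), concurrency=4)))
```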
+ iterable = iter(iterable) + with factory(max_workers = max_workers) as executor: futures: List[concurrent.futures.Future[Out]] = [] - items_rev = list(iterable)[::-1] - for i in range(min(concurrency, len(items_rev))): - futures.append(executor.submit(func, items_rev.pop())) + done = False + for _ in range(concurrency): + try: + futures.append(executor.submit(func, next(iterable))) + except StopIteration: + done = True + break + while futures: result = futures.pop(0).result() - if items_rev: - futures.append(executor.submit(func, items_rev.pop())) + while not done and len(futures) < concurrency: + try: + futures.append(executor.submit(func, next(iterable))) + except StopIteration: + done = True + break yield result - def check_vocab_size(params: Params, vocab: Vocab) -> None: if params.n_vocab != vocab.vocab_size: assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab) @@ -803,12 +844,11 @@ class OutputFile: self.gguf.add_token_types(toktypes) def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: - n_elements = 1 - for dim in tensor.shape: - n_elements *= dim - data_type = DATA_TYPE_TO_NUMPY[tensor.data_type] - data_nbytes = n_elements * data_type.itemsize - self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes) + n_elements = int(np.prod(tensor.shape)) + raw_dtype = getattr(tensor.data_type, 'ggml_type', None) + data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype + data_nbytes = tensor.data_type.elements_to_bytes(n_elements) + self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype) def write_meta(self) -> None: self.gguf.write_header_to_file() @@ -834,7 +874,20 @@ class OutputFile: of.close() @staticmethod - def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None: + def do_item(item: Tuple[str, LazyTensor]) -> Tuple[DataType, NDArray]: + name, lazy_tensor = item + tensor = lazy_tensor.load().to_ggml() + return (lazy_tensor.data_type, tensor.ndarray) + + @staticmethod + def maybe_do_quantize(item: Tuple[DataType, NDArray]) -> NDArray: + dt, arr = item + if not isinstance(dt, QuantizedDataType): + return arr + return dt.quantize(arr) + + @staticmethod + def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, concurrency: int = DEFAULT_CONCURRENCY) -> None: check_vocab_size(params, vocab) of = OutputFile(fname_out) @@ -850,16 +903,19 @@ class OutputFile: of.write_meta() of.write_tensor_info() - def do_item(item: Tuple[str, LazyTensor]) -> NDArray: - name, lazy_tensor = item - return lazy_tensor.load().to_ggml().ndarray - # tensor data - ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8) + ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) + if ftype == GGMLFileType.MostlyQ8_0: + ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, factory = ProcessPoolExecutor) + else: + ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) + + start = time.time() for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): + elapsed = time.time() - start size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) padi = len(str(len(model))) - print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}") + print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type 
{lazy_tensor.data_type.name:4} | T+{int(elapsed):4}") of.gguf.write_tensor_data(ndarray) of.close() @@ -871,6 +927,8 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi return GGMLFileType.AllF32 if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)): return GGMLFileType.MostlyF16 + if output_type_str == "q8_0": + return GGMLFileType.MostlyQ8_0 name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()} @@ -894,9 +952,10 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] elif f"model.layers.{i}.self_attn.W_pack.weight" in model: print(f"Unpacking and permuting layer {i}") - tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) - tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv) + tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head) + tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head) tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) + del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] else: break @@ -917,7 +976,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: print(f"skipping tensor {name_new}") continue else: - print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type} | {lazy_tensor.shape}") + print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") out[name_new] = lazy_tensor return out @@ -1022,6 +1081,7 @@ def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path: namestr = { GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", + GGMLFileType.MostlyQ8_0:"q8_0", }[file_type] ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf" if ret in model_paths: @@ -1045,12 +1105,13 @@ def main(args_in: Optional[List[str]] = None) -> None: parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outtype", choices=["f32", "f16"], help="output format (default: based on input)") + parser.add_argument("--outtype", choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") + parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: 
{DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) args = parser.parse_args(args_in) if args.dump_single: @@ -1072,6 +1133,7 @@ def main(args_in: Optional[List[str]] = None) -> None: params.ftype = { "f32": GGMLFileType.AllF32, "f16": GGMLFileType.MostlyF16, + "q8_0": GGMLFileType.MostlyQ8_0, }[args.outtype] print(f"params = {params}") @@ -1103,7 +1165,7 @@ def main(args_in: Optional[List[str]] = None) -> None: params.ftype = ftype print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all(outfile, params, model, vocab) + OutputFile.write_all(outfile, ftype, params, model, vocab, concurrency = args.concurrency) print(f"Wrote {outfile}") diff --git a/examples/beam_search/beam_search.cpp b/examples/beam_search/beam_search.cpp index 1c04fabc2..42c7c7254 100644 --- a/examples/beam_search/beam_search.cpp +++ b/examples/beam_search/beam_search.cpp @@ -35,7 +35,7 @@ struct ostream_beam_view { std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) { os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { - os << llama_token_to_str(obv.ctx, obv.beam_view.tokens[i]); + os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); } return os << ')'; } @@ -156,7 +156,7 @@ int main(int argc, char ** argv) for( auto id : tokens_list ) { - std::cout << llama_token_to_str(ctx, id); + std::cout << llama_token_to_piece(ctx, id); } std::cout << std::flush; @@ -175,7 +175,7 @@ int main(int argc, char ** argv) std::cout << "\n\n"; for (llama_token const token_id : callback_data.response) { - std::cout << llama_token_to_str(ctx,token_id); + std::cout << llama_token_to_piece(ctx,token_id); } std::cout << std::endl; diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md index fd561fcbc..0f37d295b 100644 --- a/examples/convert-llama2c-to-ggml/README.md +++ b/examples/convert-llama2c-to-ggml/README.md @@ -12,18 +12,14 @@ usage: ./convert-llama2c-to-ggml [options] options: -h, --help show this help message and exit - --copy-vocab-from-model FNAME model path from which to copy vocab (default 'tokenizer.bin') + --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf') --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model --llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin') ``` An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows: -`$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin` - -For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually: - -`$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin` +`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` Now you can use the model with a command like: diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index f8a58dc7a..e9e070b1f 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ 
-10,9 +10,48 @@ #include #include #include +#include #include #include +// GGUF keys & tensor names. + +#define KV_GENERAL_ARCHITECTURE "general.architecture" +#define KV_GENERAL_NAME "general.name" + +#define KV_TOKENIZER_MODEL "tokenizer.ggml.model" +#define KV_TOKENIZER_LIST "tokenizer.ggml.tokens" +#define KV_TOKENIZER_TOKEN_TYPE "tokenizer.ggml.token_type" +#define KV_TOKENIZER_SCORES "tokenizer.ggml.scores" +#define KV_TOKENIZER_BOS_ID "tokenizer.ggml.bos_token_id" +#define KV_TOKENIZER_EOS_ID "tokenizer.ggml.eos_token_id" +#define KV_TOKENIZER_UNK_ID "tokenizer.ggml.unknown_token_id" +#define KV_TOKENIZER_SEP_ID "tokenizer.ggml.seperator_token_id" +#define KV_TOKENIZER_PAD_ID "tokenizer.ggml.padding_token_id" +#define KV_TOKENIZER_HF_JSON "tokenizer.huggingface.json" + +#define KV_CONTEXT_LENGTH "llama.context_length" +#define KV_EMBEDDING_LENGTH "llama.embedding_length" +#define KV_BLOCK_COUNT "llama.block_count" +#define KV_FEED_FORWARD_LENGTH "llama.feed_forward_length" +#define KV_ATTENTION_HEAD_COUNT "llama.attention.head_count" +#define KV_ATTENTION_HEAD_COUNT_KV "llama.attention.head_count_kv" +#define KV_ATTENTION_LAYERNORM_RMS_EPS "llama.attention.layer_norm_rms_epsilon" +#define KV_ROPE_DIMENSION_COUNT "llama.rope.dimension_count" + +#define TN_TOKEN_EMBD "token_embd.weight" +#define TN_OUTPUT_NORM "output_norm.weight" +#define TN_OUTPUT "output.weight" +#define TN_ATTN_NORM "blk.%d.attn_norm.weight" +#define TN_ATTN_Q "blk.%d.attn_q.weight" +#define TN_ATTN_K "blk.%d.attn_k.weight" +#define TN_ATTN_V "blk.%d.attn_v.weight" +#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight" +#define TN_FFN_NORM "blk.%d.ffn_norm.weight" +#define TN_FFN_GATE "blk.%d.ffn_gate.weight" +#define TN_FFN_DOWN "blk.%d.ffn_down.weight" +#define TN_FFN_UP "blk.%d.ffn_up.weight" + #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif @@ -20,6 +59,11 @@ #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' #define LLAMA_FILE_VERSION_GGJT_V3 3 +#define TOKENIZER_NAME "llama" +#define UNKNOWN_TOKEN_ID 0 +#define BOS_TOKEN_ID 1 +#define EOS_TOKEN_ID 2 + //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc. typedef struct { int dim; // transformer dimension @@ -183,6 +227,7 @@ struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; // this is provided as user input? 
uint32_t n_embd = 4096; + uint32_t n_ff = 11008; uint32_t n_mult = 4; uint32_t n_head = 32; uint32_t n_layer = 32; @@ -214,6 +259,8 @@ struct my_llama_layer { struct my_llama_model { struct ggml_context * ctx = NULL; + std::string name; + my_llama_hparams hparams; struct ggml_tensor * tok_embeddings; @@ -276,18 +323,13 @@ struct train_params { int mem_compute1_gb; }; -uint32_t get_n_ff(const struct my_llama_hparams* hparams) { - const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; - return n_ff; -} - void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_embd: %d\n", __func__, params->n_embd); printf("%s: n_mult: %d\n", __func__, params->n_mult); printf("%s: n_head: %d\n", __func__, params->n_head); - printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_ff: %d\n", __func__, params->n_ff); printf("%s: n_layer: %d\n", __func__, params->n_layer); printf("%s: n_rot: %d\n", __func__, params->n_rot); } @@ -299,7 +341,7 @@ void init_model(struct my_llama_model * model) { const uint32_t n_layer = hparams.n_layer; const uint32_t n_vocab = hparams.n_vocab; - const uint32_t n_ff = get_n_ff(&hparams); + const uint32_t n_ff = hparams.n_ff; struct ggml_context * ctx = model->ctx; model->train_its = 0; @@ -481,21 +523,6 @@ struct llama_file { return std::string(chars.data(), len); } - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); - } - } - - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - ~llama_file() { if (fp) { std::fclose(fp); @@ -503,30 +530,6 @@ struct llama_file { } }; -void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - if (tensor == NULL) { - file->write_u32(0); - file->write_u32(0); - file->write_u32(GGML_TYPE_F32); - file->seek((0-file->tell()) & 31, SEEK_CUR); - return; - } - const char * name = ggml_get_name(tensor); - uint32_t name_len = strlen(name); - uint32_t nd = tensor->n_dims; - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - (uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - file->write_u32(nd); - file->write_u32(name_len); - file->write_u32(tensor->type); - file->write_raw(ne, sizeof(ne[0]) * nd); - file->write_raw(name, name_len); - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->write_raw(tensor->data, ggml_nbytes(tensor)); -} - bool is_ggml_file(const char *filename) { llama_file file(filename, "rb"); if (file.size < 4) { @@ -536,48 +539,96 @@ bool is_ggml_file(const char *filename) { return magic == GGUF_MAGIC; } +static std::string llama_escape_whitespaces(const std::string& text) { + std::ostringstream out; + for (char c : text) { + if (c == ' ') out << "\xe2\x96\x81"; + else out << c; + } + return out.str(); +} + void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { -#pragma message("TODO: implement reading vocabulary using gguf") -// // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary -// if (is_ggml_file(filename)) { -// -// struct llama_context_params llama_params = llama_context_default_params(); -// llama_params.vocab_only = true; -// -// struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params); -// struct llama_context * lctx = 
llama_new_context_with_model(lmodel, llama_params); -// -// const int n_vocab = llama_n_vocab(lctx); -// vocab->id_to_token.resize(n_vocab); -// for (int i=0; iid_to_token[i].text = llama_token_get_text(lctx, i); -// vocab->id_to_token[i].score = llama_token_get_score(lctx, i); -// vocab->id_to_token[i].type = llama_token_get_type(lctx, i); -// vocab->token_to_id.emplace(vocab->id_to_token[i].text, i); -// } -// llama_free(lctx); -// llama_free_model(lmodel); -// } else - { // assume llama2.c vocabulary - printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename); + if (is_ggml_file(filename)) { + struct ggml_context * ctx_data = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx_data, + }; + + struct gguf_context * ctx = gguf_init_from_file(filename, params); + GGML_ASSERT(ctx != NULL); + + const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL); + GGML_ASSERT(model_idx >= 0); + std::string tokenizer_name = gguf_get_val_str(ctx, model_idx); + GGML_ASSERT(tokenizer_name == TOKENIZER_NAME); + + const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST); + GGML_ASSERT(token_idx >= 0); + + const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES); + GGML_ASSERT(score_idx >= 0); + const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx); + + const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE); + GGML_ASSERT(toktype_idx >= 0); + const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); + + const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); + + vocab->id_to_token.resize(n_vocab); + + for (uint32_t i = 0; i < n_vocab; i++) { + std::string word = gguf_get_arr_str(ctx, token_idx, i); + + vocab->token_to_id[word] = i; + + auto & token_data = vocab->id_to_token[i]; + token_data.text = std::move(word); + token_data.score = scores[i]; + token_data.type = (llama_token_type) toktypes[i]; + } + ggml_free(ctx_data); + gguf_free(ctx); + } else { + // assume llama2.c vocabulary + printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename); llama_file file(filename, "rb"); const int n_vocab = config->vocab_size; /* uint32_t max_token_length = */ file.read_u32(); // unused vocab->id_to_token.resize(n_vocab); - for (int i=0; i single byte tokens. - char byte_val; - if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) { - char cstr[2] = { byte_val, 0 }; - text = cstr; + + unsigned char byte_val; + llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL; + if (id == UNKNOWN_TOKEN_ID) { + text = "<unk>"; + type = LLAMA_TOKEN_TYPE_UNKNOWN; + } else if (id == BOS_TOKEN_ID) { + text = "<s>"; + type = LLAMA_TOKEN_TYPE_CONTROL; + } else if (id == EOS_TOKEN_ID) { + text = "</s>"; + type = LLAMA_TOKEN_TYPE_CONTROL; + } else if (text.empty()) { + type = LLAMA_TOKEN_TYPE_CONTROL; + } else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) { + // Text of byte tokens is already in the expected format. 
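(Illustration, not part of the patch.) The llama2.c branch of `load_vocab` in this hunk maps ids 0/1/2 to the `<unk>`/`<s>`/`</s>` specials, tags `<0xXX>` entries as byte tokens, and escapes spaces to U+2581 the way SentencePiece does. The same classification, mirrored in a few lines of Python under those assumptions:

```python
# Sketch of the token mapping done by the C++ loader above; ids follow the
# llama2.c convention assumed by the converter (0 = unknown, 1 = BOS, 2 = EOS).
import re

UNKNOWN_TOKEN_ID, BOS_TOKEN_ID, EOS_TOKEN_ID = 0, 1, 2

def escape_whitespaces(text: str) -> str:
    return text.replace(' ', '\u2581')    # U+2581 LOWER ONE EIGHTH BLOCK

def convert_token(token_id: int, text: str) -> tuple[str, str]:
    if token_id == UNKNOWN_TOKEN_ID:
        text, kind = '<unk>', 'UNKNOWN'
    elif token_id == BOS_TOKEN_ID:
        text, kind = '<s>', 'CONTROL'
    elif token_id == EOS_TOKEN_ID:
        text, kind = '</s>', 'CONTROL'
    elif not text:
        kind = 'CONTROL'
    elif re.fullmatch(r'<0x[0-9A-Fa-f]{2}>', text):
        kind = 'BYTE'                     # byte tokens already look like <0xXX>
    else:
        kind = 'NORMAL'
    return escape_whitespaces(text), kind

print(convert_token(1, ''))            # ('<s>', 'CONTROL')
print(convert_token(50, ' hello'))     # ('▁hello', 'NORMAL')
print(convert_token(35, '<0x0A>'))     # ('<0x0A>', 'BYTE')
```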
+ type = LLAMA_TOKEN_TYPE_BYTE; + } else { + type = LLAMA_TOKEN_TYPE_NORMAL; } - vocab->id_to_token[i].text = text; - vocab->id_to_token[i].score = score; - vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED; - vocab->token_to_id.emplace(text, i); + text = llama_escape_whitespaces(text); + + vocab->id_to_token[id].text = text; + vocab->id_to_token[id].score = score; + vocab->id_to_token[id].type = type; + vocab->token_to_id.emplace(text, id); } } } @@ -619,33 +670,6 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * kar } void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) { - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { - return; - } - -#pragma message("TODO: implement file saving using gguf") - // write_magic - file.write_u32(LLAMA_FILE_MAGIC_GGJT); // magic - file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version - // write_hparams - file.write_u32(model->hparams.n_vocab); - file.write_u32(model->hparams.n_embd); - file.write_u32(model->hparams.n_mult); - file.write_u32(model->hparams.n_head); - file.write_u32(model->hparams.n_layer); - file.write_u32(model->hparams.n_rot); - file.write_u32(LLAMA_FTYPE_ALL_F32); - - // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk. - uint32_t n_vocab = model->hparams.n_vocab; - for (uint32_t i = 0; i < n_vocab; i++) { - const auto & token_data = vocab->id_to_token.at(i); - file.write_u32((uint32_t) token_data.text.size()); - file.write_raw(token_data.text.data(), token_data.text.size()); - file.write_raw(&token_data.score, sizeof(token_data.score)); - } - // stuff AK weights into GG weights one by one. // w->token_embedding_table -> model->tok_embeddings // float* -> struct ggml_tensor @@ -657,9 +681,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod // for rms-att-weight int row_length = model->hparams.n_embd; - const auto & hparams = model->hparams; - //int n_ff = model->hparams.n_embd; - int n_ff = get_n_ff(&hparams); + int n_ff = model->hparams.n_ff; for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ auto & layer = model->layers[i]; @@ -677,28 +699,91 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]); stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]); } + + struct gguf_context * ctx = gguf_init_empty(); + + std::vector tokens; + std::vector scores; + std::vector token_types; + for (const llama_vocab::token_data & token_data : vocab->id_to_token) { + tokens.push_back(token_data.text.c_str()); + scores.push_back(token_data.score); + token_types.push_back(token_data.type); + } + gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size()); + gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size()); + gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size()); + + gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME); + + gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama"); + gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama"); + + // special tokens + gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID); + gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID); + gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID); + gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1); + gguf_set_val_u32(ctx, 
KV_TOKENIZER_PAD_ID, -1); + + gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx); + gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); + gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff); + gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head); + // n_head_kv is optional, default to n_head + // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...); + gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer); + gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot); + gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f); + // write tensors - write_tensor(&file, model->tok_embeddings); - write_tensor(&file, model->norm); - write_tensor(&file, model->output); // ? + ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD); + gguf_add_tensor(ctx, model->tok_embeddings); + + ggml_set_name(model->norm, TN_OUTPUT_NORM); + gguf_add_tensor(ctx, model->norm); + + ggml_set_name(model->output, TN_OUTPUT); + gguf_add_tensor(ctx, model->output); + for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; - write_tensor(&file, layer.attention_norm); - write_tensor(&file, layer.wq); - write_tensor(&file, layer.wk); - write_tensor(&file, layer.wv); - write_tensor(&file, layer.wo); - write_tensor(&file, layer.ffn_norm); - write_tensor(&file, layer.w1); - write_tensor(&file, layer.w2); - write_tensor(&file, layer.w3); + ggml_format_name(layer.wq, TN_ATTN_Q, i); + gguf_add_tensor(ctx, layer.wq); + + ggml_format_name(layer.wk, TN_ATTN_K, i); + gguf_add_tensor(ctx, layer.wk); + + ggml_format_name(layer.wv, TN_ATTN_V, i); + gguf_add_tensor(ctx, layer.wv); + + ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i); + gguf_add_tensor(ctx, layer.wo); + + ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i); + gguf_add_tensor(ctx, layer.attention_norm); + + ggml_format_name(layer.w1, TN_FFN_GATE, i); + gguf_add_tensor(ctx, layer.w1); + + ggml_format_name(layer.w2, TN_FFN_DOWN, i); + gguf_add_tensor(ctx, layer.w2); + + ggml_format_name(layer.w3, TN_FFN_UP, i); + gguf_add_tensor(ctx, layer.w3); + + ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i); + gguf_add_tensor(ctx, layer.ffn_norm); } + + gguf_write_to_file(ctx, filename, false); + gguf_free(ctx); } struct train_params get_default_train_params() { struct train_params params; - params.fn_vocab_model = "tokenizer.bin"; + params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; params.fn_llama2c_output_model = "ak_llama_model.bin"; params.fn_train_data = "shakespeare.txt"; params.fn_checkpoint_in = "checkpoint.bin"; @@ -751,7 +836,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params) fprintf(stderr, "\n"); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model); + fprintf(stderr, " --copy-vocab-from-model FNAME path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model); fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n"); fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model); fprintf(stderr, "\n"); @@ -812,6 +897,14 @@ bool params_parse(int argc, char ** argv, struct train_params * params) { return true; } +std::string 
basename(const std::string &path) { + size_t pos = path.find_last_of("/"); + if (pos == std::string::npos) { + return path; + } + return path.substr(pos + 1); +} + int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); if (!params_parse(argc, argv, ¶ms)) { @@ -840,6 +933,7 @@ int main(int argc, char ** argv) { model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); model.hparams.n_ctx = params.n_ctx; model.hparams.n_embd = config.dim; //params.n_embd; + model.hparams.n_ff = config.hidden_dim; model.hparams.n_mult = 32;//params.n_mult; model.hparams.n_head = config.n_heads; //params.n_head; model.hparams.n_layer = config.n_layers; //params.n_layer; @@ -853,6 +947,7 @@ int main(int argc, char ** argv) { model.ctx = ggml_init(lcparams); init_model(&model); + model.name = basename(params.fn_llama2c_model); save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model); diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 8a6ad882e..036bdb398 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -214,7 +214,7 @@ const char * sampling(struct MyModel * mymodel) { if (id == llama_token_eos(ctx)) { ret = ""; } else { - ret = llama_token_to_str(ctx, id); + ret = llama_token_to_piece(ctx, id); } eval_id(mymodel, id); return ret.c_str(); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 38395c75b..93d583b5c 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -56,9 +56,6 @@ int main(int argc, char ** argv) { int n_past = 0; - // Add a space in front of the first character to match OG llama tokenizer behavior - params.prompt.insert(0, 1, ' '); - // tokenize the prompt auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); @@ -67,7 +64,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } fprintf(stderr, "\n"); } diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt new file mode 100644 index 000000000..7d1806af3 --- /dev/null +++ b/examples/gguf/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET gguf) +add_executable(${TARGET} gguf.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index dee00df87..cda517bde 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -30,6 +30,9 @@ bool gguf_ex_write(const std::string & fname) { gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678); gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679); gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f); + gguf_set_val_u64 (ctx, "some.parameter.uint64", 0x123456789abcdef0ull); + gguf_set_val_i64 (ctx, "some.parameter.int64", -0x123456789abcdef1ll); + gguf_set_val_f64 (ctx, "some.parameter.float64", 0.1234567890123456789); gguf_set_val_bool(ctx, "some.parameter.bool", true); 
gguf_set_val_str (ctx, "some.parameter.string", "hello world"); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d0fe6d90d..bf3a487ab 100755 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -3,6 +3,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -10,7 +13,6 @@ #include #include #include -#include #include #include @@ -916,6 +918,9 @@ static void llama_null_log_callback(enum llama_log_level level, const char * tex } int main(int argc, char ** argv) { + // try to set locale for unicode characters in markdown + setlocale(LC_CTYPE, ".UTF-8"); + #if !defined(NDEBUG) fprintf(stderr, "warning: asserts enabled, performance may be affected\n"); #endif diff --git a/examples/main/main.cpp b/examples/main/main.cpp index cb8747c2b..89cc4f602 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -36,9 +37,57 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static llama_context ** g_ctx; +static llama_context ** g_ctx; +static llama_model ** g_model; +static gpt_params * g_params; +static std::vector * g_input_tokens; +static std::ostringstream * g_output_ss; +static std::vector * g_output_tokens; static bool is_interacting = false; +void write_logfile( + const llama_context * ctx, const gpt_params & params, const llama_model * model, + const std::vector input_tokens, const std::string output, const std::vector output_tokens) { + + if (params.logdir.empty()) { + return; + } + + const std::string timestamp = get_sortable_timestamp(); + + const bool success = create_directory_with_parents(params.logdir); + if (!success) { + fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + __func__, params.logdir.c_str()); + return; + } + + const std::string logfile_path = params.logdir + timestamp + ".yml"; + FILE * logfile = fopen(logfile_path.c_str(), "w"); + + if (logfile == NULL) { + fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + return; + } + + fprintf(logfile, "binary: main\n"); + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); + + fprintf(logfile, "\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "# Generation Results #\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "\n"); + + dump_string_yaml_multiline(logfile, "output", output.c_str()); + dump_vector_int_yaml(logfile, "output_tokens", output_tokens); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); +} + #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) void sigint_handler(int signo) { if (signo == SIGINT) { @@ -48,6 +97,7 @@ void sigint_handler(int signo) { console::cleanup(); printf("\n"); llama_print_timings(*g_ctx); + write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); _exit(130); } } @@ -56,6 +106,7 @@ void sigint_handler(int signo) { int main(int argc, char ** argv) { gpt_params params; + g_params = ¶ms; if (gpt_params_parse(argc, argv, params) == false) { return 1; @@ -116,6 +167,7 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; llama_context * ctx_guidance = NULL; + g_model = &model; g_ctx = &ctx; // load the model and apply lora adapter, if any @@ 
-189,12 +241,14 @@ int main(int argc, char ** argv) { } } - const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + // Add BOS if SPM tokenizer + const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; // tokenize the prompt std::vector embd_inp; + if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { - embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm); + embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos); } else { embd_inp = session_tokens; } @@ -209,10 +263,9 @@ int main(int argc, char ** argv) { int guidance_offset = 0; int original_prompt_len = 0; if (ctx_guidance) { - params.cfg_negative_prompt.insert(0, 1, ' '); - guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm); + guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, is_spm); + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); original_prompt_len = original_inp.size(); guidance_offset = (int)guidance_inp.size() - original_prompt_len; } @@ -259,7 +312,7 @@ int main(int argc, char ** argv) { } // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm); + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos); const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); // in instruct mode, we inject a prefix and a suffix to each input by the user @@ -278,7 +331,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (ctx_guidance) { @@ -286,14 +339,14 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); for (int i = 0; i < (int) guidance_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); } } if (params.n_keep > 0) { fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str()); + fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } fprintf(stderr, "'\n"); } @@ -396,6 +449,10 @@ int main(int argc, char ** argv) { int n_session_consumed = 0; int n_past_guidance = 0; + std::vector input_tokens; g_input_tokens = &input_tokens; + std::vector output_tokens; g_output_tokens = &output_tokens; + std::ostringstream output_ss; g_output_ss = &output_ss; + // the first thing we will do is to output the prompt, so set color accordingly console::set_display(console::prompt); @@ -449,7 +506,7 @@ int main(int argc, char ** argv) { //printf("\n---\n"); //printf("resetting: '"); //for (int i = 0; i < (int) embd.size(); i++) { - // printf("%s", llama_token_to_str(ctx, embd[i])); + // printf("%s", llama_token_to_piece(ctx, embd[i])); //} //printf("'\n"); 
//printf("\n---\n"); @@ -502,7 +559,7 @@ int main(int argc, char ** argv) { input_size = embd_guidance.size(); //fprintf(stderr, "\n---------------------\n"); //for (int i = 0; i < (int) embd_guidance.size(); i++) { - //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i])); + //fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i])); //} //fprintf(stderr, "\n---------------------\n"); } else { @@ -597,7 +654,12 @@ int main(int argc, char ** argv) { last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, last_n_repeat, alpha_frequency, alpha_presence); if (!penalize_nl) { - logits[llama_token_nl(ctx)] = nl_logit; + for (size_t idx = 0; idx < candidates_p.size; idx++) { + if (candidates_p.data[idx].id == llama_token_nl(ctx)) { + candidates_p.data[idx].logit = nl_logit; + break; + } + } } if (grammar != NULL) { @@ -661,7 +723,15 @@ int main(int argc, char ** argv) { // display text if (input_echo) { for (auto id : embd) { - printf("%s", llama_token_to_str(ctx, id).c_str()); + const std::string token_str = llama_token_to_piece(ctx, id); + printf("%s", token_str.c_str()); + + if (embd.size() > 1) { + input_tokens.push_back(id); + } else { + output_tokens.push_back(id); + output_ss << token_str; + } } fflush(stdout); } @@ -677,7 +747,7 @@ int main(int argc, char ** argv) { if (params.antiprompt.size()) { std::string last_output; for (auto id : last_n_tokens) { - last_output += llama_token_to_str(ctx, id); + last_output += llama_token_to_piece(ctx, id); } is_antiprompt = false; @@ -755,6 +825,8 @@ int main(int argc, char ** argv) { printf("%s", params.input_suffix.c_str()); } + const size_t original_size = embd_inp.size(); + // instruct mode: insert instruction prefix if (params.instruct && !is_antiprompt) { n_consumed = embd_inp.size(); @@ -769,6 +841,12 @@ int main(int argc, char ** argv) { embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); } + for (size_t i = original_size; i < embd_inp.size(); ++i) { + const llama_token token = embd_inp[i]; + output_tokens.push_back(token); + output_ss << llama_token_to_piece(ctx, token); + } + n_remain -= line_inp.size(); } @@ -811,6 +889,8 @@ int main(int argc, char ** argv) { } llama_print_timings(ctx); + write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); + if (ctx_guidance) { llama_free(ctx_guidance); } llama_free(ctx); llama_free_model(model); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 18635932b..7c02b6d40 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -3,16 +3,79 @@ #include "build-info.h" #include +#include +#include #include #include -#include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif +struct results_perplexity { + std::vector tokens; + double ppl_value; + std::vector logits; + std::vector probs; +}; + +struct results_log_softmax { + double log_softmax; + float logit; + float prob; +}; + +void write_logfile(const llama_context * ctx, const gpt_params & params, + const llama_model * model, const struct results_perplexity & results) { + + if (params.logdir.empty()) { + return; + } + + if (params.hellaswag) { + fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. 
No files will be written.\n", __func__); + return; + } + + const std::string timestamp = get_sortable_timestamp(); + + const bool success = create_directory_with_parents(params.logdir); + if (!success) { + fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + __func__, params.logdir.c_str()); + return; + } + + const std::string logfile_path = params.logdir + timestamp + ".yml"; + FILE * logfile = fopen(logfile_path.c_str(), "w"); + + if (logfile == NULL) { + fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + return; + } + + fprintf(logfile, "binary: main\n"); + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc); + + fprintf(logfile, "\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "# Perplexity Results #\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "\n"); + + dump_vector_float_yaml(logfile, "logits", results.logits); + fprintf(logfile, "ppl_value: %f\n", results.ppl_value); + dump_vector_float_yaml(logfile, "probs", results.probs); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); +} + std::vector softmax(const std::vector& logits) { std::vector probs(logits.size()); float max_logit = logits[0]; @@ -29,20 +92,20 @@ std::vector softmax(const std::vector& logits) { return probs; } -float log_softmax(int n_vocab, const float * logits, int tok) { +results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { float max_logit = logits[0]; for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]); double sum_exp = 0.0; for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit); - return logits[tok] - max_logit - log(sum_exp); + return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; } -void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector& workers, - double& nll, double& nll2) { +void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, + double & nll, double & nll2, float * logit_history, float * prob_history) { std::mutex mutex; int counter = 0; - auto compute = [&mutex, &counter, &nll, &nll2, n_vocab, logits, tokens, n_token] () { + auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { double local_nll = 0, local_nll2 = 0; while (true) { std::unique_lock lock(mutex); @@ -52,34 +115,51 @@ void process_logits(int n_vocab, const float * logits, const int * tokens, int n break; } lock.unlock(); - double v = -log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); + const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); + const double v = -results.log_softmax; local_nll += v; local_nll2 += v*v; + + logit_history[i] = results.logit; + prob_history[i] = results.prob; } }; - for (auto& w : workers) w = std::thread(compute); + for (auto & w : workers) w = std::thread(compute); compute(); - for (auto& w : workers) w.join(); + for (auto & w : workers) w.join(); } -void perplexity_v2(llama_context * ctx, const gpt_params & params) { +results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Run `./perplexity -m 
models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Output: `perplexity: 13.5106 [114/114]` // BOS tokens will be added for each chunk before eval - if (params.ppl_stride <= 0) { - fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); - return; - } - const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; const bool add_bos = is_spm; fprintf(stderr, "%s: tokenizing the input ..\n", __func__); - auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + + if (int(tokens.size()) < 2*params.n_ctx) { + fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx, + params.n_ctx); + fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + return {std::move(tokens), 0., {}, {}}; + } + + std::vector logit_history; + std::vector prob_history; + + logit_history.resize(tokens.size()); + prob_history.resize(tokens.size()); + + if (params.ppl_stride <= 0) { + fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); + return {tokens, -1, logit_history, prob_history}; + } const int calc_chunk = params.n_ctx; @@ -88,7 +168,7 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) { if (int(tokens.size()) <= calc_chunk) { fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, tokens.size(), params.n_ctx, params.ppl_stride); - return; + return {tokens, -1, logit_history, prob_history}; } const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride; @@ -120,7 +200,7 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) { //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { //fprintf(stderr, "%s : failed to eval\n", __func__); - return; + return {tokens, -1, logit_history, prob_history}; } // save original token and restore it after eval @@ -161,6 +241,8 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) { logits.begin() + (j + 1) * n_vocab); const float prob = softmax(tok_logits)[tokens[start + j + 1]]; + logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]]; + prob_history[start + j + 1] = prob; nll += -std::log(prob); ++count; @@ -174,12 +256,14 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) { fflush(stdout); } printf("\n"); + + return {tokens, std::exp(nll / count), logit_history, prob_history}; } -void perplexity(llama_context * ctx, const gpt_params & params) { +results_perplexity perplexity(llama_context * ctx, const gpt_params & params) { + if (params.ppl_stride > 0) { - perplexity_v2(ctx, params); - return; + return perplexity_v2(ctx, params); } // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research @@ -190,9 +274,26 @@ void perplexity(llama_context * ctx, const gpt_params & params) { const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; const bool add_bos = is_spm; + auto tim1 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenizing the input ..\n", __func__); - auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + + auto tim2 = 
std::chrono::high_resolution_clock::now(); + fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + + if (int(tokens.size()) < 2*params.n_ctx) { + fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx, + params.n_ctx); + fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + return {std::move(tokens), 0., {}, {}}; + } + + std::vector logit_history; + logit_history.resize(tokens.size()); + + std::vector prob_history; + prob_history.resize(tokens.size()); const int n_chunk_max = tokens.size() / params.n_ctx; @@ -232,7 +333,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) { if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); - return; + return {tokens, -1, logit_history, prob_history}; } // restore the original token in case it was set to BOS @@ -268,7 +369,8 @@ void perplexity(llama_context * ctx, const gpt_params & params) { // last 256 tokens. Then, we split the input up into context window size chunks to // process the entire prompt. const int first = std::min(512, params.n_ctx/2); - process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, workers, nll, nll2); + process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, + workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); count += params.n_ctx - first - 1; // perplexity is e^(average negative log-likelihood) @@ -283,16 +385,19 @@ void perplexity(llama_context * ctx, const gpt_params & params) { fflush(stdout); } printf("\n"); + nll2 /= count; nll /= count; + const double ppl = exp(nll); nll2 -= nll * nll; if (nll2 > 0) { nll2 = sqrt(nll2/(count-1)); - double ppl = exp(nll); printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); } else { printf("Unexpected negative standard deviation of log(prob)\n"); } + + return {tokens, ppl, logit_history, prob_history}; } std::vector hellaswag_evaluate_tokens(llama_context * ctx, const std::vector& tokens, int n_past, int n_batch, @@ -351,6 +456,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; + fprintf(stderr, "================================= is_spm = %d\n", is_spm); // This is needed as usual for LLaMA models const bool add_bos = is_spm; @@ -391,7 +497,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { hs_data[i].context = prompt_lines[idx*6]; hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] ); for (size_t j=0; j < 4; j++) { - hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j]; + hs_data[i].ending[j] = prompt_lines[idx*6+2+j]; } // Delete the selected random example from the prompt @@ -406,6 +512,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { double acc = 0.0f; const int n_vocab = llama_n_vocab(ctx); + std::vector> ending_tokens(4); + std::vector tok_logits(n_vocab); for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) { @@ -413,11 +521,21 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { std::vector context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos); 
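(Illustration, not part of the patch.) The perplexity path in this file reduces to a numerically stable log-softmax of each target logit, `PPL = exp(mean NLL)`, and an error bar derived from the spread of the per-token NLL values. A small NumPy sketch of that arithmetic, with random logits standing in for model output:

```python
# Stable log-softmax of the target logit, mean negative log-likelihood, and
# PPL = exp(mean NLL) with an uncertainty analogous to the one printed above
# (standard error of the mean NLL, scaled into perplexity units).
import numpy as np

def nll_of(logits: np.ndarray, tok: int) -> float:
    m = logits.max()
    return float(-(logits[tok] - m - np.log(np.exp(logits - m).sum())))

rng = np.random.default_rng(0)
n_tokens, n_vocab = 512, 32000
logits = rng.normal(size=(n_tokens, n_vocab)).astype(np.float32)
targets = rng.integers(0, n_vocab, size=n_tokens)

nll = np.array([nll_of(logits[i], targets[i]) for i in range(n_tokens)])
mean_nll = nll.mean()
sem = nll.std(ddof=1) / np.sqrt(n_tokens)    # standard error of the mean NLL
ppl = np.exp(mean_nll)
print(f"Final estimate: PPL = {ppl:.4f} +/- {sem * ppl:.5f}")
```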
size_t context_size = context_embd.size(); + for (int i = 0; i < 4; ++i) { + ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos); + for (int k = 0; k < int(context_size); ++k) { + if (ending_tokens[i][k] != context_embd[k]) { + fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k); + break; + } + } + } + // Do the 1st ending // In this case we include the context when evaluating - auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos); + //auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos); + auto query_embd = ending_tokens[0]; auto query_size = query_embd.size(); - //printf("First query: %d\n",(int)query_size); // Stop if query wont fit the ctx window if (query_size > (size_t)params.n_ctx) { @@ -462,7 +580,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) { for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) { // Tokenize the query - query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false); + query_embd.resize(ending_tokens[ending_idx].size() - context_size); + std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int)); query_size = query_embd.size(); // Stop if query wont fit the ctx window @@ -586,13 +705,16 @@ int main(int argc, char ** argv) { params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } + struct results_perplexity results; if (params.hellaswag) { hellaswag_score(ctx, params); } else { - perplexity(ctx, params); + results = perplexity(ctx, params); } llama_print_timings(ctx); + write_logfile(ctx, params, model, results); + llama_free(ctx); llama_free_model(model); diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index d172f645a..df9a214fc 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -100,7 +100,7 @@ int main(int argc, char ** argv) { } } - if (argc - arg_idx < 3) { + if (argc - arg_idx < 2) { usage(argv[0]); } @@ -114,7 +114,7 @@ int main(int argc, char ** argv) { std::string ftype_str; if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { std::string fpath; - const size_t pos = fname_inp.find_last_of('/'); + const size_t pos = fname_inp.find_last_of("/\\"); if (pos != std::string::npos) { fpath = fname_inp.substr(0, pos + 1); } diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 3db61b754..573bc4ef9 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -87,7 +87,7 @@ int main(int argc, char ** argv) { } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx, &candidates_p); - auto next_token_str = llama_token_to_str(ctx, next_token); + auto next_token_str = llama_token_to_piece(ctx, next_token); last_n_tokens_data.push_back(next_token); printf("%s", next_token_str.c_str()); @@ -147,7 +147,7 @@ int main(int argc, char ** argv) { } llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; auto next_token = llama_sample_token(ctx2, &candidates_p); - auto next_token_str = llama_token_to_str(ctx2, next_token); + auto next_token_str = llama_token_to_piece(ctx2, next_token); last_n_tokens_data.push_back(next_token); printf("%s", 
next_token_str.c_str()); diff --git a/examples/server/README.md b/examples/server/README.md index 77997f98d..517608046 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -77,34 +77,31 @@ You need to have [Node.js](https://nodejs.org/en) installed. ```bash mkdir llama-client cd llama-client -npm init -npm install axios ``` Create a index.js file and put inside this: ```javascript -const axios = require("axios"); - const prompt = `Building a website can be done in 10 simple steps:`; async function Test() { - let result = await axios.post("http://127.0.0.1:8080/completion", { - prompt, - n_predict: 512, - }); - - // the response is received until completion finish - console.log(result.data.content); + let response = await fetch("http://127.0.0.1:8080/completion", { + method: 'POST', + body: JSON.stringify({ + prompt, + n_predict: 512, + }) + }) + console.log((await response.json()).content) } -Test(); +Test() ``` And run it: ```bash -node . +node index.js ``` ## API Endpoints @@ -167,6 +164,12 @@ node . Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`. +- **POST** `/detokenize`: Convert tokens to text. + + *Options:* + + `tokens`: Set the tokens to detokenize. + - **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. *Options:* diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3300553f9..b485a5ead 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -94,7 +94,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) std::string ret; for (; begin != end; ++begin) { - ret += llama_token_to_str(ctx, *begin); + ret += llama_token_to_piece(ctx, *begin); } return ret; } @@ -123,7 +123,7 @@ static void server_log(const char *level, const char *function, int line, // format incomplete utf-8 multibyte character for output static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) { - std::string out = token == -1 ? "" : llama_token_to_str(ctx, token); + std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); // if the size is 1 and first bit is 1, meaning it's a partial character // (size > 1 meaning it's already a known token) if (out.size() == 1 && (out[0] & 0x80) == 0x80) @@ -286,7 +286,6 @@ struct llama_server_context std::vector p; if (first) { - s.insert(0, 1, ' '); // add a space if it's the first p = ::llama_tokenize(ctx, s, add_bos); first = false; } @@ -309,7 +308,6 @@ struct llama_server_context else { auto s = json_prompt.template get(); - s.insert(0, 1, ' '); // always add a first space prompt_tokens = ::llama_tokenize(ctx, s, add_bos); } @@ -566,7 +564,7 @@ struct llama_server_context if (!embd.empty() && embd.back() == llama_token_eos(ctx)) { - // stopping_word = llama_token_to_str(ctx, embd.back()); + // stopping_word = llama_token_to_piece(ctx, embd.back()); has_next_token = false; stopped_eos = true; LOG_VERBOSE("eos token found", {}); @@ -613,7 +611,7 @@ struct llama_server_context { const completion_token_output token_with_probs = nextToken(); - const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok); + const std::string token_text = token_with_probs.tok == -1 ? 
"" : llama_token_to_piece(ctx, token_with_probs.tok); generated_text += token_text; if (params.n_probs > 0) @@ -721,7 +719,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); - fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); + fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); fprintf(stdout, " -nommq, --no-mul-mat-q\n"); fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n"); @@ -1104,6 +1102,12 @@ static json format_tokenizer_response(const std::vector &tokens) {"tokens", tokens}}; } +static json format_detokenized_response(std::string content) +{ + return json{ + {"content", content}}; +} + template static T json_value(const json &body, const std::string &key, const T &default_value) { @@ -1248,7 +1252,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) { struct token_translator { llama_context * ctx; - std::string operator()(llama_token tok) const { return llama_token_to_str(ctx, tok); } + std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); } }; @@ -1358,7 +1362,7 @@ int main(int argc, char **argv) while (llama.has_next_token) { const completion_token_output token_with_probs = llama.doCompletion(); - const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok); + const std::string token_text = token_with_probs.tok == -1 ? 
"" : llama_token_to_piece(llama.ctx, token_with_probs.tok); stop_pos = llama.findStoppingStrings(llama.generated_text, token_text.size(), STOP_FULL); @@ -1389,7 +1393,7 @@ int main(int argc, char **argv) if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) { continue; } - const std::string token_text = llama_token_to_str(llama.ctx, token_with_probs.tok); + const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok); size_t pos = std::min(sent_count, llama.generated_text.size()); @@ -1501,6 +1505,21 @@ int main(int argc, char **argv) const json data = format_tokenizer_response(tokens); return res.set_content(data.dump(), "application/json"); }); + svr.Post("/detokenize", [&llama](const Request &req, Response &res) + { + auto lock = llama.lock(); + + const json body = json::parse(req.body); + std::string content; + if (body.count("tokens") != 0) + { + const std::vector tokens = body["tokens"]; + content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend()); + } + + const json data = format_detokenized_response(content); + return res.set_content(data.dump(), "application/json"); }); + svr.Post("/embedding", [&llama](const Request &req, Response &res) { auto lock = llama.lock(); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 132f7fbf9..4ee85faca 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -63,7 +63,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str()); + fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -112,7 +112,7 @@ int main(int argc, char ** argv) { } // print the new token : - printf("%s", llama_token_to_str(ctx, new_token_id).c_str()); + printf("%s", llama_token_to_piece(ctx, new_token_id).c_str()); fflush(stdout); // push this new token for next evaluation diff --git a/examples/train-text-from-scratch/README.md b/examples/train-text-from-scratch/README.md index 726ec47c0..f4ffcd987 100644 --- a/examples/train-text-from-scratch/README.md +++ b/examples/train-text-from-scratch/README.md @@ -8,15 +8,15 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s # train ./bin/train-text-from-scratch \ - --vocab-model ../models/ggml-vocab.bin \ + --vocab-model ../models/ggml-vocab-llama.gguf \ --ctx 64 --embd 256 --head 8 --layer 16 \ - --checkpoint-in chk-shakespeare-256x16.bin \ - --checkpoint-out chk-shakespeare-256x16.bin \ - --model-out ggml-shakespeare-256x16-f32.bin \ + --checkpoint-in chk-shakespeare-256x16.gguf \ + --checkpoint-out chk-shakespeare-256x16.gguf \ + --model-out ggml-shakespeare-256x16-f32.gguf \ --train-data "shakespeare.txt" \ - -t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \ - --print-details-interval 0 --predict 16 --use-flash + -t 6 -b 16 --seed 1 --adam-iter 256 \ + --no-checkpointing # predict -./bin/main -m ggml-shakespeare-256x16-f32.bin +./bin/main -m ggml-shakespeare-256x16-f32.gguf ``` diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py new file mode 100644 index 000000000..01b3ee92a --- /dev/null +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -0,0 +1,492 @@ +#!/usr/bin/env python3 +# train-text-from-scratch checkpoint --> gguf conversion + +import argparse +import gguf +import os +import struct +import sys +import numpy as np +from pathlib import Path + +# gguf constants 
+LLM_KV_OPTIMIZER_TYPE = "optimizer.type" +LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" +LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs" +LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version" +LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count" +LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count" +LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count" +LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized" +LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss" +LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss" +LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count" +LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count" +LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k" +LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end" +LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count" + +LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments" +LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments" +LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values" + +LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters" +LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters" +LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients" +LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients" +LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction" +LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s" +LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y" + +LLM_KV_TRAINING_FILE_VERSION = "training.file_version" +LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" +LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" +LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" + +class Tensor: + def __init__(self, dtype='f', ne=None): + if ne is None: + ne = [] + self.dtype = dtype + self.ne = ne + self.nbytes = 0 + if self.dtype == 'f': + if len(self.ne) == 0: + self.nbytes = 0 + else: + self.nbytes = int(np.product(self.ne)) * 4 + else: + raise ValueError(f"Unhandled data type '{self.dtype}'") + + def load(self, data, offset): + nd = struct.unpack(' 0 else []) + + self.lbfgs_x = Tensor('f', [self.nx]) + self.lbfgs_xp = Tensor('f', [self.nx]) + self.lbfgs_g = Tensor('f', [self.nx]) + self.lbfgs_gp = Tensor('f', [self.nx]) + self.lbfgs_d = Tensor('f', [self.nx]) + self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) + self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) + self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m]) + + if self.type == 0: + # these tensors are stored, but we don't need their data + x = Tensor('f', [self.nx]) + g = Tensor('f', [self.nx]) + g2 = Tensor('f', [self.nx]) + mh = Tensor('f', [self.nx]) + vh = Tensor('f', [self.nx]) + + 
offset = x.load(data, offset) + offset = g.load(data, offset) + offset = g2.load(data, offset) + offset = self.adam_m.load(data, offset) + offset = self.adam_v.load(data, offset) + offset = mh.load(data, offset) + offset = vh.load(data, offset) + offset = self.adam_pf.load(data, offset) + + self.adam_fx_best = struct.unpack(' 0 else []) + + self.lbfgs_x = Tensor('f', [self.nx]) + self.lbfgs_xp = Tensor('f', [self.nx]) + self.lbfgs_g = Tensor('f', [self.nx]) + self.lbfgs_gp = Tensor('f', [self.nx]) + self.lbfgs_d = Tensor('f', [self.nx]) + self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) + self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) + self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) + self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m]) + + # forgot to save type in version 1: + # guess self.type from number of remaining bytes + size_type_0 = 12 + sum([t.max_storage_size() for t in + [self.adam_m, self.adam_v] + +([self.adam_pf] if (self.past > 0) else [])]) + size_type_1 = 24 + sum([t.max_storage_size() for t in + [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g, + self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf, + self.lbfgs_lmal, self.lbfgs_lmys, + self.lbfgs_lms, self.lbfgs_lmy] + +([self.lbfgs_pf] if (self.past > 0) else [])]) + # due to alignment padding the size might not by exact + # but the difference in size for both types is significant, + # so we can just use whichever is closest + remaining = len(data) - offset + if abs(remaining - size_type_0) < abs(remaining - size_type_1): + self.type = 0 + else: + self.type = 1 + + if self.type == 0: + offset = self.adam_m.load(data, offset) + offset = self.adam_v.load(data, offset) + offset = self.adam_pf.load(data,offset) + + self.adam_fx_best = struct.unpack(' 0: + self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES) + + elif self.type == 1: + gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS) + gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m) + gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best) + gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k) + gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end) + gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement) + + self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS) + self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS) + self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS) + self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS) + self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION) + if self.past > 0: + self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES) + self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA) + self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS) + self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S) + self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y) + else: + raise ValueError('Unknown optimizer type') + +class ModelParams: + def 
__init__(self): + pass + + def load(self, data, offset): + self.n_vocab = struct.unpack(' @@ -17,8 +18,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static const float rms_norm_eps = 1e-5f; - struct random_normal_distribution { std::mt19937 gen; std::normal_distribution rd; @@ -63,17 +62,6 @@ float frand_uniform(struct random_uniform_distribution * rnd) { return rnd->rd(rnd->gen); } -void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier switch (tensor->n_dims) { @@ -167,29 +155,20 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc return tensor; } -struct llama_vocab { - using id = int32_t; - using token = std::string; - using ttype = llama_token_type; - - struct token_data { - token text; - float score; - ttype type; - }; - - std::unordered_map token_to_id; - std::vector id_to_token; -}; - struct my_llama_hparams { uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_ctx = 512; uint32_t n_embd = 4096; - uint32_t n_mult = 4; uint32_t n_head = 32; uint32_t n_layer = 32; uint32_t n_rot = 64; + uint32_t n_ff = 11008; + + // float f_norm_eps = 1e-5; // falcon + float f_norm_rms_eps = 1e-5; // llama + + float rope_freq_base = 10000.0f; + float rope_freq_scale = 1.0f; bool operator!=(const my_llama_hparams& other) const { return memcmp(this, &other, sizeof(my_llama_hparams)); @@ -215,17 +194,6 @@ struct my_llama_layer { struct ggml_tensor * w3; }; -struct my_llama_kv_cache { - struct ggml_context * ctx = NULL; - - struct ggml_tensor * k; - struct ggml_tensor * v; - - // llama_ctx_buffer buf; - - int n; // number of tokens currently in the cache -}; - struct my_llama_model { struct ggml_context * ctx = NULL; @@ -243,18 +211,91 @@ struct my_llama_model { uint32_t train_tokens = 0; }; -uint32_t get_n_ff(const struct my_llama_hparams* hparams) { - const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult; - return n_ff; -} +// gguf constants +const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type"; +const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"; +const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"; +const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"; +const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"; +const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"; +const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"; +const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"; +const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"; +const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"; +const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"; +const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"; +const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = 
"optimizer.lbfgs.line_search_j"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"; +const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"; +const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"; + +const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"; +const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"; +const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"; + +const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"; +const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"; + +const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version"; +const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"; +const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"; +const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"; + +// gguf constants (sync with gguf.py) + +const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; +const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; + +const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; +const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; +const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; +const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; +const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; +const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; +const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; +const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp +const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; + +const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model"; +const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens"; +const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"; +const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores"; +const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges"; +const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"; +const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"; +const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"; +const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"; +const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"; + +const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; +const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; +const char * LLM_TENSOR_OUTPUT = "output"; +const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; +const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; 
+const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; +const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; +const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; +const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; +const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; +const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; +const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; void print_params(struct my_llama_hparams * params) { printf("%s: n_vocab: %d\n", __func__, params->n_vocab); printf("%s: n_ctx: %d\n", __func__, params->n_ctx); printf("%s: n_embd: %d\n", __func__, params->n_embd); - printf("%s: n_mult: %d\n", __func__, params->n_mult); printf("%s: n_head: %d\n", __func__, params->n_head); - printf("%s: n_ff: %d\n", __func__, get_n_ff(params)); + printf("%s: n_ff: %d\n", __func__, params->n_ff); printf("%s: n_layer: %d\n", __func__, params->n_layer); printf("%s: n_rot: %d\n", __func__, params->n_rot); } @@ -265,8 +306,7 @@ void init_model(struct my_llama_model * model) { const uint32_t n_embd = hparams.n_embd; const uint32_t n_layer = hparams.n_layer; const uint32_t n_vocab = hparams.n_vocab; - - const uint32_t n_ff = get_n_ff(&hparams); + const uint32_t n_ff = hparams.n_ff; struct ggml_context * ctx = model->ctx; @@ -274,20 +314,31 @@ void init_model(struct my_llama_model * model) { model->train_samples = 0; model->train_tokens = 0; + std::vector tn_buf; + tn_buf.resize(GGML_MAX_NAME); + auto tn = [&tn_buf](const char * key) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); + return tn_buf.data(); + }; + auto tni = [&tn_buf](const char * key, int bid) -> const char * { + snprintf(tn_buf.data(), tn_buf.size(), key, bid); + std::string s = tn_buf.data(); + snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); + return tn_buf.data(); + }; + model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); - ggml_set_name(model->norm, "norm.weight"); - ggml_set_name(model->output, "output.weight"); + ggml_set_name(model->tok_embeddings, tn(LLM_TENSOR_TOKEN_EMBD)); + ggml_set_name(model->norm, tn(LLM_TENSOR_OUTPUT_NORM)); + ggml_set_name(model->output, tn(LLM_TENSOR_OUTPUT)); model->layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { auto & layer = model->layers[i]; - std::string layers_i = "layers." 
+ std::to_string(i); - layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); @@ -301,18 +352,18 @@ void init_model(struct my_llama_model * model) { layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str()); + ggml_set_name(layer.attention_norm, tni(LLM_TENSOR_ATTN_NORM, i)); - ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str()); - ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str()); - ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str()); - ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str()); + ggml_set_name(layer.wq, tni(LLM_TENSOR_ATTN_Q, i)); + ggml_set_name(layer.wk, tni(LLM_TENSOR_ATTN_K, i)); + ggml_set_name(layer.wv, tni(LLM_TENSOR_ATTN_V, i)); + ggml_set_name(layer.wo, tni(LLM_TENSOR_ATTN_OUT, i)); - ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str()); + ggml_set_name(layer.ffn_norm, tni(LLM_TENSOR_FFN_NORM, i)); - ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str()); - ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str()); - ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str()); + ggml_set_name(layer.w1, tni(LLM_TENSOR_FFN_GATE, i)); + ggml_set_name(layer.w2, tni(LLM_TENSOR_FFN_DOWN, i)); + ggml_set_name(layer.w3, tni(LLM_TENSOR_FFN_UP, i)); } } @@ -371,267 +422,6 @@ void randomize_model(struct my_llama_model * model, int seed, float mean, float } } -bool init_kv_cache(struct my_llama_kv_cache* cache, struct my_llama_model * model, int n_batch) { - const auto & hparams = model->hparams; - - const uint32_t n_ctx = hparams.n_ctx; - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_layer = hparams.n_layer; - - const int64_t n_mem = n_layer*n_ctx*n_batch; - const int64_t n_elements = n_embd*n_mem; - - // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); - - // struct ggml_init_params params; - // params.mem_size = cache.buf.size; - // params.mem_buffer = cache.buf.addr; - // params.no_alloc = false; - if (!cache->ctx) { - struct ggml_init_params params; - params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; - params.mem_buffer = NULL; - params.no_alloc = false; - - cache->ctx = ggml_init(params); - - if (!cache->ctx) { - fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); - return false; - } - } - - cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - - return true; -} - -struct ggml_tensor * forward( - struct my_llama_model * model, - struct my_llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_past) { - - const int N = n_tokens; - - struct my_llama_kv_cache& kv_self = *cache; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); - - struct ggml_tensor * kc = kv_self.k; - struct ggml_tensor * vc = kv_self.v; - - // inpL shape 
[n_embd,N,1,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, 1] - // Kcur shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0); - - // store key and value to memory - { - // compute the transposed [N, n_embd] V matrix - // wv shape [n_embd, n_embd, 1, 1] - // Vcur shape [n_embd, N, 1, 1] - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N))); - - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // kv_self.v shape [n_embd * n_ctx * n_layer, 1] - // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] - // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] - - /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - - // important: storing RoPE-ed version of K in the KV cache! 
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } //*/ - - kc = ggml_set_1d_inplace(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - vc = ggml_set_2d_inplace(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - } - - // Qcur shape [n_embd/n_head, n_head, N, 1] - // Q shape [n_embd/n_head, N, n_head, 1] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // K shape [n_embd/n_head, n_past + N, n_head, 1] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); - - // K * Q - // KQ shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - - // split cached V into n_head heads - //// V shape [n_past + N, n_embd/n_head, n_head, 1] - // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] - struct ggml_tensor * V = - ggml_view_3d(ctx0, vc, - n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(vc), - n_ctx*ggml_element_size(vc)*n_embd/n_head, - il*n_ctx*ggml_element_size(vc)*n_embd); - - // KQV shape [n_embd/n_head, N, n_head, 1] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - // KQV_merged shape - - // cur = KQV_merged.contiguous().view(n_embd, N) - // cur shape [n_embd,N,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); - // cur = ggml_cpy(ctx0, - // KQV_merged, - // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection (no bias) - // cur shape [n_embd,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N,1,1] - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - - // cur = ffn_norm*cur - // cur shape [n_embd,N,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - } - - // tmp shape [n_ff,N,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - - // cur shape [n_ff,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - - // SILU activation - // cur shape [n_ff,N,1,1] - cur = ggml_silu(ctx0, cur); - - // cur shape [n_ff,N,1,1] - cur = ggml_mul(ctx0, cur, tmp); - - // cur shape [n_embd,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - } - - // cur shape [n_embd,N,1,1] - cur = ggml_add(ctx0, cur, inpFF); - - // input for next layer - // 
inpL shape [n_embd,N,1,1] - inpL = cur; - } - - // norm - { - - // inpL shape [n_embd,N,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - - // inpL = norm*inpL - // inpL shape [n_embd,N,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - //embeddings = inpL; - } - - // lm_head - // inpL shape [n_vocab,N,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - - // run the computation - ggml_build_forward_expand(gf, inpL); - - return inpL; -} - void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { GGML_ASSERT(tensor->n_dims == 1); GGML_ASSERT(tensor->ne[0] == ne0); @@ -658,786 +448,222 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6 GGML_ASSERT(tensor->ne[3] == ne3); } -struct ggml_tensor * forward_batch( - struct my_llama_model * model, - struct my_llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_past, - const int n_batch) { - - const int N = n_tokens; - - struct my_llama_kv_cache& kv_self = *cache; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); - memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); - - struct ggml_tensor * kc = kv_self.k; - struct ggml_tensor * vc = kv_self.v; - - // inpL shape [n_embd,N*n_batch,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - // store key and value to memory - { - // compute the transposed [N, n_embd] V matrix - // wv shape [n_embd, n_embd, 1, 1] - // Vcur shape [N, n_embd, n_batch, 1] - struct ggml_tensor * Vcur = ggml_cont(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_mul_mat(ctx0, - model->layers[il].wv, - cur), - n_embd, N, n_batch), - 1, 0, 2, 3)); - assert_shape_3d(Vcur, N, n_embd, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] - // k shape [n_embd * N, n_batch] == 
kv_self.k[:,n_past:n_past+N,:,il] - // v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il] - - /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } //*/ - - kc = ggml_set_2d_inplace(ctx0, kc, - ggml_reshape_2d(ctx0, Kcur, n_embd*N, n_batch), - ggml_element_size(kc)*n_embd*n_ctx, - (ggml_element_size(kc)*n_embd)*(il*n_batch*n_ctx + n_past)); - vc = ggml_set_2d_inplace(ctx0, vc, - ggml_reshape_2d(ctx0, Vcur, N*n_embd, n_batch), - ggml_element_size(vc)*n_ctx*n_embd, - ggml_element_size(vc)*(n_past + il*n_embd*n_batch*n_ctx)); - - assert_shape_1d(kc, n_embd * n_ctx * n_batch * n_layer); - assert_shape_1d(vc, n_embd * n_ctx * n_batch * n_layer); - } - - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Q shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // K shape [n_embd/n_head, n_past + N, n_head, n_batch] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_4d(ctx0, - ggml_view_3d(ctx0, - kc, - n_embd, - (n_past + N), - n_batch, - n_embd*ggml_element_size(kc), - n_ctx*n_embd*ggml_element_size(kc), - il*n_batch*n_ctx*n_embd*ggml_element_size(kc)), - n_embd/n_head, n_head, n_past + N, n_batch), - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, n_past + N, n_head, n_batch); - - // K * Q - // KQ shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - assert_shape_4d(KQ, n_past + N, N, n_head, n_batch); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - assert_shape_4d(KQ_masked, n_past + N, N, n_head, n_batch); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [n_past + N, N, n_head, n_batch] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch); - - // split cached V into n_head heads - // kv_self.v shape [n_ctx * n_embd * n_batch * n_layer] - // V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il] - struct ggml_tensor * V = - ggml_view_4d(ctx0, vc, - n_past + N, n_embd/n_head, n_head, n_batch, - ggml_element_size(vc)*n_ctx, - ggml_element_size(vc)*n_ctx*n_embd/n_head, - ggml_element_size(vc)*n_ctx*n_embd, - il*n_batch*n_ctx*n_embd*ggml_element_size(vc)); - assert_shape_4d(V, n_past + N, n_embd/n_head, n_head, n_batch); - - // KQV shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape 
[n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - // KQV_merged shape - - // cur = KQV_merged.contiguous().view(n_embd, N) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - // cur = ggml_cpy(ctx0, - // KQV_merged, - // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection (no bias) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N*n_batch,1,1] - struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // tmp shape [n_ff,N*n_batch,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_add_inplace(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - // inpL shape [n_embd,N*n_batch,1,1] - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // norm - { - - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); - - //embeddings = inpL; - } - - // lm_head - // inpL shape [n_vocab,N*n_batch,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - // inpL shape [n_vocab,N,n_batch,1] - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); - } - - // run the computation - ggml_build_forward_expand(gf, inpL); - - return inpL; +static size_t hash(void * p) { + return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE; } -struct ggml_tensor * forward_batch_wo_cache( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_batch) { +static size_t hash_find(void * hash_table[], void * p) { + size_t h = hash(p); - const int n_past = 0; - const int N = n_tokens; - - const auto & hparams = model->hparams; - //const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = 
hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); - memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); - - // inpL shape [n_embd,N*n_batch,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); + // linear probing + size_t i = h; + while (hash_table[i] != NULL && hash_table[i] != p) { + i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE; + if (i == h) { + // visited all hash table entries -> not found + return GGML_GRAPH_HASHTABLE_SIZE; } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - // Vcur shape [N, n_batch, n_embd/n_head, n_head] - struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); - assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); - - // Qcur shape [n_embd/n_head, n_head, N, n_batch] - // Q shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - // kv_self.k shape [n_embd * n_ctx * n_batch * n_layer] - // K shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * K = - ggml_permute(ctx0, - Kcur, - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); - - // K * Q - // KQ shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - assert_shape_4d(KQ, N, N, n_head, n_batch); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - assert_shape_4d(KQ_scaled, N, N, n_head, n_batch); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); - assert_shape_4d(KQ_masked, N, N, n_head, n_batch); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [N, N, n_head, n_batch] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - assert_shape_4d(KQ_soft_max, N, N, n_head, n_batch); - - // Vcur shape [N, n_batch, n_embd/n_head, n_head] - // V shape [N, n_embd/n_head, n_head, n_batch] - struct 
ggml_tensor * V = - ggml_permute(ctx0, - Vcur, - 0, 3, 1, 2); - assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); - - // KQV shape [n_embd/n_head, N, n_head, n_batch] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - // KQV_merged shape - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - - // projection (no bias) - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N*n_batch,1,1] - struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // tmp shape [n_ff,N*n_batch,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_ff,N*n_batch,1,1] - cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // cur shape [n_embd,N*n_batch,1,1] - cur = ggml_add_inplace(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - // inpL shape [n_embd,N*n_batch,1,1] - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); } - - // norm - { - - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - // inpL shape [n_embd,N*n_batch,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); - - //embeddings = inpL; - } - - // lm_head - // inpL shape [n_vocab,N*n_batch,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - // inpL shape [n_vocab,N,n_batch,1] - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); - } - - // run the computation - ggml_build_forward_expand(gf, inpL); - - return inpL; + return i; } -struct ggml_tensor * forward_batch_wo_cache_flash_attn( - struct my_llama_model * model, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_batch) { +static bool hash_insert(void * hash_table[], void * p) { + //size_t h = hash(p); + size_t i = hash_find(hash_table, p); - const int n_past = 0; - const int N = 
n_tokens; + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full - const auto & hparams = model->hparams; - //const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); - memcpy(tokens->data, tokens_input->data, ggml_element_size(tokens)*N*n_batch); - - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - assert_shape_2d(inpL, n_embd, N*n_batch); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // norm - { - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0); - assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); - assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); - - struct ggml_tensor * Vcur = ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, cur, model->layers[il].wv), N, n_batch, n_embd/n_head, n_head); - assert_shape_4d(Vcur, N, n_batch, n_embd/n_head, n_head); - - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * K = - ggml_permute(ctx0, - Kcur, - 0, 2, 1, 3); - assert_shape_4d(K, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * V = - ggml_permute(ctx0, - Vcur, - 0, 3, 1, 2); - assert_shape_4d(V, N, n_embd/n_head, n_head, n_batch); - - bool masked = true; - struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, masked); - assert_shape_4d(KQV, n_embd/n_head, N, n_head, n_batch); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - assert_shape_4d(KQV_merged, n_embd/n_head, n_head, N, n_batch); - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N*n_batch); - assert_shape_2d(cur, n_embd, N*n_batch); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - struct ggml_tensor * inpFF = ggml_add_inplace(ctx0, cur, inpSA); - assert_shape_2d(inpFF, n_embd, N*n_batch); - - // feed-forward network - { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - assert_shape_2d(cur, n_embd, N*n_batch); - - // cur = ffn_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - assert_shape_2d(tmp, n_ff, N*n_batch); - - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - // SILU activation - cur = ggml_silu(ctx0, cur); - assert_shape_2d(cur, n_ff, N*n_batch); - - 
cur = ggml_mul(ctx0, cur, tmp); - assert_shape_2d(cur, n_ff, N*n_batch); - - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - assert_shape_2d(cur, n_embd, N*n_batch); - } - - cur = ggml_add_inplace(ctx0, cur, inpFF); - assert_shape_2d(cur, n_embd, N*n_batch); - - // input for next layer - inpL = cur; - assert_shape_2d(inpL, n_embd, N*n_batch); + if (hash_table[i] == p) { + return true; } - // norm - { - - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - assert_shape_2d(inpL, n_embd, N*n_batch); - - // inpL = norm*inpL - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - assert_shape_2d(inpL, n_embd, N*n_batch); - } - - // lm_head - inpL = ggml_mul_mat(ctx0, model->output, inpL); - assert_shape_2d(inpL, n_vocab, N*n_batch); - - { - inpL = ggml_reshape_3d(ctx0, - inpL, - n_vocab, N, n_batch); - assert_shape_3d(inpL, n_vocab, N, n_batch); - } - - // run the computation - ggml_build_forward_expand(gf, inpL); - - return inpL; + // insert + GGML_ASSERT(hash_table[i] == NULL); + hash_table[i] = p; + return false; } -// expand the graph nodes without creating leafs. -struct ggml_tensor * expand(struct ggml_cgraph * g, struct ggml_tensor * t) { - // check if already visited - for (int i = 0; i < g->n_nodes; i++) { - if (g->nodes[i] == t) { - return t; - } - } - - for (int i = 0; i < g->n_leafs; i++) { - if (g->leafs[i] == t) { - return t; - } - } - - for (int i = 0; i < GGML_MAX_SRC; ++i) { - if (t->src[i]) { - expand(g, t->src[i]); - } - } - - GGML_ASSERT(g->n_nodes < GGML_MAX_NODES); - - if (strlen(t->name) == 0) { - snprintf(t->name, sizeof(t->name), "node_%d", g->n_nodes); - } - - g->nodes[g->n_nodes] = t; - g->grads[g->n_nodes] = t->grad; - g->n_nodes++; - return t; +static bool hash_contains(void * hash_table[], void * p) { + size_t i = hash_find(hash_table, p); + return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p); } -void graph_set_leafs_grads(struct ggml_cgraph * g) { - // moves leaf nodes to g->leafs. - // i.e. g->n_nodes might change. 
- int n_nodes = 0; - for (int i = 0; i < g->n_nodes; ++i) { - struct ggml_tensor * node = g->nodes[i]; - const bool is_leaf = node->op == GGML_OP_NONE && node->grad == NULL; - if (is_leaf) { - GGML_ASSERT(g->n_leafs < GGML_MAX_NODES); +struct hash_map { + void * keys[GGML_GRAPH_HASHTABLE_SIZE]; + void * vals[GGML_GRAPH_HASHTABLE_SIZE]; +}; +//static const size_t HASH_MAP_SIZE = sizeof(struct hash_map); - if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "leaf_%d", g->n_leafs); - } - - g->leafs[g->n_leafs] = node; - g->n_leafs++; - } else { - GGML_ASSERT(n_nodes < GGML_MAX_NODES); - - if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "node_%d", n_nodes); - } - - g->nodes[n_nodes] = node; - g->grads[n_nodes] = node->grad; - n_nodes++; - } +struct hash_map * new_hash_map() { + struct hash_map * result = new struct hash_map; + for (int i=0; ikeys[i] = NULL; + result->vals[i] = NULL; } - for (int i=n_nodes; i < g->n_nodes; ++i) { - g->nodes[n_nodes] = NULL; - g->grads[n_nodes] = NULL; - } - g->n_nodes = n_nodes; + return result; +}; + +void free_hash_map(struct hash_map * map) { + delete map; } -struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( - struct my_llama_model * model, - struct ggml_context * ctx0, +static bool ggml_is_view(struct ggml_tensor * t) { + return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || + t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY; +} + +static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) { + switch (t->op) { + case GGML_OP_PERMUTE: + case GGML_OP_RESHAPE: + case GGML_OP_TRANSPOSE: + case GGML_OP_VIEW: + return t->src[0]; + case GGML_OP_CPY: + return t->src[1]; + default: + return NULL; + } +} + +static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { + struct ggml_tensor * parent = t; + do { + parent = get_view_parent(parent); + } while (ggml_is_view(parent)); + return parent; +} + +struct ggml_tensor * ggml_recompute_graph_node( + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct hash_map * replacements, + struct ggml_tensor * node) { + + if (node == NULL) { + return NULL; + } + + if (node->is_param) { + return node; + } + + if (!hash_contains(graph->visited_hash_table, node)) { + return node; + } + + int count_children = 0; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + if (node->src[k]) { + ++count_children; + } + } + + if (count_children == 0) { + return node; + } + + size_t i = hash_find(replacements->keys, node); + GGML_ASSERT(i < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + if (replacements->keys[i] == node) { + return (struct ggml_tensor *) replacements->vals[i]; + } + + struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); + + // insert clone into replacements + GGML_ASSERT(replacements->keys[i] == NULL); // assert that we don't overwrite + replacements->keys[i] = node; + replacements->vals[i] = clone; + + clone->op = node->op; + clone->grad = node->grad; + clone->is_param = node->is_param; + clone->extra = node->extra; + for (int k = 0; k < GGML_MAX_DIMS; ++k) { + clone->nb[k] = node->nb[k]; + } + for (int k = 0; k < GGML_MAX_SRC; ++k) { + clone->src[k] = ggml_recompute_graph_node(ctx, graph, replacements, node->src[k]); + } + if (ggml_is_view(clone)) { + struct ggml_tensor * source = get_view_source(clone); + GGML_ASSERT(source != NULL); + clone->data = source->data; + } + + GGML_ASSERT(sizeof(node->op_params) == sizeof(int32_t) * (GGML_MAX_OP_PARAMS / sizeof(int32_t))); + 
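// ggml_recompute_graph_node() clones every non-checkpoint tensor it has to recompute, but a
// clone of a view op (reshape/view/transpose/permute/cpy) must alias the storage of whatever
// it views rather than receive its own buffer; get_view_parent()/get_view_source() walk the
// view chain to find that owner. A hedged sketch with hypothetical tensors, assuming a valid
// ggml_context * ctx is at hand:
struct ggml_tensor * a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 6); // owns its data
struct ggml_tensor * v   = ggml_reshape_1d(ctx, a, 24);                  // view of a
struct ggml_tensor * src = get_view_source(v);                           // walks v -> a
// src == a here, which is why a clone of v ends up with clone->data = a->data
// instead of a freshly allocated buffer.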
GGML_ASSERT(sizeof(node->name) == GGML_MAX_NAME); + memcpy(clone->op_params, node->op_params, sizeof(node->op_params)); + ggml_format_name(clone, "%s (clone)", ggml_get_name(node)); + + return clone; +}; + +void ggml_build_backward_gradient_checkpointing( + struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, + struct ggml_tensor * * checkpoints, + int n_checkpoints) { + *gb_tmp = *gf; + ggml_build_backward_expand(ctx, gf, gb_tmp, true); + + if (n_checkpoints <= 0) { + *gb = *gb_tmp; + return; + } + + struct hash_map * replacements = new_hash_map(); + + // insert checkpoints in replacements + for (int i = 0; i < n_checkpoints; ++i) { + size_t k = hash_find(replacements->keys, checkpoints[i]); + GGML_ASSERT(k < GGML_GRAPH_HASHTABLE_SIZE); // assert that not full + GGML_ASSERT(replacements->keys[k] == NULL); // assert that we don't overwrite + replacements->keys[k] = checkpoints[i]; + replacements->vals[k] = checkpoints[i]; + } + + *gb = *gf; + // rewrite gb_tmp->nodes[gf->n_nodes:gb_tmp->n_nodes], + // replacing references to gb_tmp->nodes[0:gf->n_nodes] ( == gf->nodes[0:gf->n_nodes]), + // by recomputing them from checkpoints + for (int i = gf->n_nodes; i < gb_tmp->n_nodes; ++i) { + struct ggml_tensor * node = gb_tmp->nodes[i]; + for (int k = 0; k < GGML_MAX_SRC; ++k) { + // insert new tensors recomputing src, reusing already made replacements, + // remember replacements: remember new tensors with mapping from corresponding gf nodes + // recurse for input tensors, + // unless (i.e. terminating when) input tensors are checkpoints + node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]); + } + // insert rewritten backward node with replacements made into resulting backward graph gb + ggml_build_forward_expand(gb, node); + } + + free_hash_map(replacements); +} + +struct ggml_tensor * llama_build_train_graphs( + struct my_llama_model * model, + struct ggml_allocr * alloc, + struct ggml_context * ctx, + struct ggml_cgraph * gf, + struct ggml_cgraph * gb, + struct ggml_cgraph * gb_tmp, struct ggml_tensor * * logits, struct ggml_tensor * tokens_input, struct ggml_tensor * targets, - void * compute_buf_0, - void * compute_buf_1, - size_t size_buf_0, - size_t size_buf_1, const int n_tokens, - const int n_batch) { - - ggml_set_scratch(ctx0, { 0, 0, nullptr, }); + const int n_batch, + const bool enable_flash_attn, + const bool enable_checkpointing) { + ggml_set_scratch(ctx, { 0, 0, nullptr, }); const int n_past = 0; const int N = n_tokens; - - gf->n_nodes = 0; - gf->n_leafs = 0; - gf->perf_runs = 0; - gf->perf_cycles = 0; - gf->perf_time_us = 0; - const auto & hparams = model->hparams; const int n_ctx = hparams.n_ctx; const int n_vocab = hparams.n_vocab; @@ -1445,476 +671,162 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( const int n_layer = hparams.n_layer; const int n_head = hparams.n_head; const int n_rot = hparams.n_rot; - const int n_ff = get_n_ff(&hparams); - const int rope_mode = 0; + const int n_ff = hparams.n_ff; + const float f_norm_rms_eps = hparams.f_norm_rms_eps; + const float rope_freq_base = hparams.rope_freq_base; + const float rope_freq_scale = hparams.rope_freq_scale; - int last_buf = -1; - size_t buf_offs[2] = { 0, 0 }; - size_t buf_size[2] = { size_buf_0, - size_buf_1 }; - void * buf_data[2] = { compute_buf_0, - compute_buf_1 }; - auto use_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data] (int buf) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if 
(last_buf >= 0) { - buf_offs[last_buf] = last_offs; - } - if (buf >= 0) { - size_t offs = buf_offs[buf]; - size_t size = buf_size[buf]; - void * data = buf_data[buf]; - ggml_set_scratch(ctx0, { offs, size, data, }); - } - last_buf = buf; - }; - - bool track_max_mem = false; - size_t buf_maxs[2] = { 0, 0 }; - - auto clr_buf = [ctx0, &last_buf, &buf_offs, &buf_size, &buf_data, &buf_maxs, track_max_mem] (int buf) { - if (buf < 0) return; - if (track_max_mem) { - size_t last_offs = 0; - last_offs = ggml_set_scratch(ctx0, { 0, 0, nullptr, }); - if (last_buf >= 0) { - buf_offs[last_buf] = last_offs; - buf_maxs[last_buf] = std::max(buf_maxs[last_buf], buf_offs[last_buf]); - } - } - buf_offs[buf] = 0; - if (track_max_mem && last_buf >= 0) { - size_t offs = buf_offs[last_buf]; - size_t size = buf_size[last_buf]; - void * data = buf_data[last_buf]; - ggml_set_scratch(ctx0, { offs, size, data, }); + auto set_name = [](struct ggml_tensor * t, const char * n) { + ggml_set_name(t, n); + if (t->grad) { + ggml_format_name(t->grad, "%s->grad", n); } }; + // rope has so much parameters that we make a custom function for it + auto rope = [ctx, n_rot, n_ctx, rope_freq_base, rope_freq_scale] + (struct ggml_tensor * t) -> struct ggml_tensor * { + // not capturing these, to silcence warnings + const int n_past = 0; + const int rope_mode = 0; - auto view__q = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = n_embd/n_head; - int64_t ne1 = N; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = 0; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); + return ggml_rope_custom(ctx, + t, n_past, n_rot, rope_mode, n_ctx, + rope_freq_base, rope_freq_scale); }; - auto view__k = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = n_embd/n_head; - int64_t ne1 = N; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = nb3*ne3; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; + set_name(tokens_input, "tokens_input"); + set_name(targets, "targets"); - auto view__v = [ctx0, n_embd, n_head, N, n_batch] (struct ggml_tensor * t) -> struct ggml_tensor * { - int64_t ne0 = N; - int64_t ne1 = n_embd/n_head; - int64_t ne2 = n_head; - int64_t ne3 = n_batch; - size_t nb0 = ggml_element_size(t); - size_t nb1 = nb0*ne0; - size_t nb2 = nb1*ne1; - size_t nb3 = nb2*ne2; - size_t offset = 2*nb3*ne3; - return ggml_view_4d(ctx0, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, offset); - }; - - auto add_or_set = [ctx0] (struct ggml_tensor * a, struct ggml_tensor * b) -> struct ggml_tensor * { - if (a == NULL) { - return b; - } else { - return ggml_add_inplace(ctx0, a, b); - } - }; - - use_buf(-1); - - model->tok_embeddings->grad = NULL; - model->norm->grad = NULL; - model->output->grad = NULL; - - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - layer.attention_norm->grad = NULL; - layer.wq->grad = NULL; - layer.wk->grad = NULL; - layer.wv->grad = NULL; - layer.wo->grad = NULL; - layer.ffn_norm->grad = NULL; - layer.w1->grad = NULL; - layer.w2->grad = NULL; - layer.w3->grad = NULL; - } - - clr_buf(0); - clr_buf(1); - - use_buf(-1); - - struct ggml_tensor * t00 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N*n_batch); 
assert_shape_1d(t00, N*n_batch); - memcpy(t00->data, tokens_input->data, ggml_element_size(t00)*N*n_batch); - - use_buf(-1); - - struct ggml_tensor * t01 = expand(gf, ggml_get_rows(ctx0, model->tok_embeddings, t00)); assert_shape_2d(t01, n_embd, N*n_batch); - - // need to remember these for the backward pass - std::vector t02L; t02L.resize(n_layer, NULL); - std::vector t03L; t03L.resize(n_layer, NULL); - std::vector t04L; t04L.resize(n_layer, NULL); - std::vector t05L; t05L.resize(n_layer, NULL); - std::vector t06L; t06L.resize(n_layer, NULL); - std::vector t07L; t07L.resize(n_layer, NULL); - std::vector t08L; t08L.resize(n_layer, NULL); - std::vector t09L; t09L.resize(n_layer, NULL); - std::vector t10L; t10L.resize(n_layer, NULL); - std::vector t11L; t11L.resize(n_layer, NULL); - std::vector t12L; t12L.resize(n_layer, NULL); - std::vector t13L; t13L.resize(n_layer, NULL); - std::vector t14L; t14L.resize(n_layer, NULL); - std::vector t15L; t15L.resize(n_layer, NULL); - std::vector t16L; t16L.resize(n_layer, NULL); - std::vector t17L; t17L.resize(n_layer, NULL); - std::vector t18L; t18L.resize(n_layer, NULL); - std::vector t19L; t19L.resize(n_layer, NULL); - std::vector t20L; t20L.resize(n_layer, NULL); - std::vector t21L; t21L.resize(n_layer, NULL); - std::vector t22L; t22L.resize(n_layer, NULL); - std::vector t23L; t23L.resize(n_layer, NULL); - std::vector t24L; t24L.resize(n_layer, NULL); - std::vector t25L; t25L.resize(n_layer, NULL); - std::vector t26L; t26L.resize(n_layer, NULL); - std::vector t27L; t27L.resize(n_layer, NULL); - std::vector t28L; t28L.resize(n_layer, NULL); - std::vector t29L; t29L.resize(n_layer, NULL); - std::vector t30L; t30L.resize(n_layer, NULL); + GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); + struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch); + struct ggml_tensor * t01 = ggml_get_rows(ctx, model->tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch); struct ggml_tensor * cur = t01; + std::vector checkpoints; + checkpoints.push_back(tokens_input); + checkpoints.push_back(targets); + checkpoints.push_back(t00); + checkpoints.push_back(t01); + + struct ggml_tensor * kv_scale; + if (!enable_flash_attn) { + kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head)); + } + for (int il = 0; il < n_layer; ++il) { - clr_buf(0); struct my_llama_layer & layer = model->layers[il]; - // tensors with values necessary for backward pass are in persistent buf(-1) - // other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed. 
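// Why these activations have to survive the forward pass at all (whether kept in the old
// persistent buf(-1) described above, or recomputed from checkpoints in the new code):
// the backward ops re-read forward inputs. Worked out for SiLU, with s(x) = sigmoid(x):
//
//     y     = x * s(x)
//     dy/dx = s(x) + x * s(x) * (1 - s(x)) = s(x) * (1 + x * (1 - s(x)))
//
// so evaluating ggml_silu_back(t26, grad_t27) needs the original t26; the same holds for the
// rms_norm, mul and mul_mat inputs referenced in the gradient table further below.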
- use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t02, n_embd, N*n_batch); - use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat (ctx0, layer.attention_norm, t02)); assert_shape_2d(t03, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul (ctx0, t02, t03)); assert_shape_2d(t04, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat (ctx0, layer.wq, t04)); assert_shape_2d(t05, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t06 = expand(gf, ggml_reshape_4d (ctx0, t05, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t07 = expand(gf, ggml_rope_inplace (ctx0, t06, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t08 = expand(gf, ggml_mul_mat (ctx0, layer.wk, t04)); assert_shape_2d(t08, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t09 = expand(gf, ggml_reshape_4d (ctx0, t08, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t10 = expand(gf, ggml_rope_inplace (ctx0, t09, n_past, n_rot, rope_mode, 0)); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t11 = expand(gf, ggml_mul_mat (ctx0, t04, layer.wv)); assert_shape_2d(t11, N*n_batch, n_embd); - use_buf(-1); struct ggml_tensor * t12 = expand(gf, ggml_reshape_4d (ctx0, t11, N, n_batch, n_embd/n_head, n_head)); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); - use_buf(-1); struct ggml_tensor * t13 = expand(gf, ggml_permute (ctx0, t07, 0, 2, 1, 3)); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); - use_buf(-1); struct ggml_tensor * t14 = expand(gf, ggml_permute (ctx0, t10, 0, 2, 1, 3)); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); - use_buf(-1); struct ggml_tensor * t15 = expand(gf, ggml_permute (ctx0, t12, 0, 3, 1, 2)); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); - use_buf(-1); struct ggml_tensor * t16 = expand(gf, ggml_flash_attn (ctx0, t13, t14, t15, true)); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); - use_buf( 0); struct ggml_tensor * t17 = expand(gf, ggml_permute (ctx0, t16, 0, 2, 1, 3)); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t18 = expand(gf, ggml_cont (ctx0, t17)); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); - use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d (ctx0, t18, n_embd, N*n_batch)); assert_shape_2d(t19, n_embd, N*n_batch); - use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat (ctx0, layer.wo, t19)); assert_shape_2d(t20, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add (ctx0, t20, cur)); assert_shape_2d(t21, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm (ctx0, t21, rms_norm_eps)); assert_shape_2d(t22, n_embd, N*n_batch); - use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat (ctx0, layer.ffn_norm, t22)); assert_shape_2d(t23, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul (ctx0, t23, t22)); assert_shape_2d(t24, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat (ctx0, layer.w3, t24)); assert_shape_2d(t25, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t26 = expand(gf, ggml_mul_mat (ctx0, layer.w1, t24)); 
assert_shape_2d(t26, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t27 = expand(gf, ggml_silu (ctx0, t26)); assert_shape_2d(t27, n_ff, N*n_batch); - use_buf(-1); struct ggml_tensor * t28 = expand(gf, ggml_mul (ctx0, t27, t25)); assert_shape_2d(t28, n_ff, N*n_batch); - use_buf( 0); struct ggml_tensor * t29 = expand(gf, ggml_mul_mat (ctx0, layer.w2, t28)); assert_shape_2d(t29, n_embd, N*n_batch); - use_buf(-1); struct ggml_tensor * t30 = expand(gf, ggml_add (ctx0, t21, t29)); assert_shape_2d(t30, n_embd, N*n_batch); - t02L[il] = t02; - t03L[il] = t03; - t04L[il] = t04; - t05L[il] = t05; - t06L[il] = t06; - t07L[il] = t07; - t08L[il] = t08; - t09L[il] = t09; - t10L[il] = t10; - t11L[il] = t11; - t12L[il] = t12; - t13L[il] = t13; - t14L[il] = t14; - t15L[il] = t15; - t16L[il] = t16; - t17L[il] = t17; - t18L[il] = t18; - t19L[il] = t19; - t20L[il] = t20; - t21L[il] = t21; - t22L[il] = t22; - t23L[il] = t23; - t24L[il] = t24; - t25L[il] = t25; - t26L[il] = t26; - t27L[il] = t27; - t28L[il] = t28; - t29L[il] = t29; - t30L[il] = t30; - - cur = t30; - } - clr_buf(0); - use_buf(0); - struct ggml_tensor * t31 = expand(gf, ggml_rms_norm (ctx0, cur, rms_norm_eps)); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = expand(gf, ggml_repeat (ctx0, model->norm, t31)); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = expand(gf, ggml_mul (ctx0, t32, t31)); assert_shape_2d(t33, n_embd, N*n_batch); - use_buf(-1); - struct ggml_tensor * t34 = expand(gf, ggml_mul_mat (ctx0, model->output, t33)); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = expand(gf, ggml_reshape_3d(ctx0, t34, n_vocab, N, n_batch)); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = expand(gf, ggml_cross_entropy_loss(ctx0, t35, targets)); assert_shape_1d(t36, 1); - - { - /* - tok_embeddings | grad_tok_embeddings = ggml_get_rows_back(grad_t01, t00) - L0_att_norm | grad_L0_att_norm = ggml_repeat_back(grad_t03L0, L0_att_norm.shape) - L0_wq | grad_L0_wq = ggml_out_prod(t04L0, grad_t05L0) - L0_wk | grad_L0_wk = ggml_out_prod(t04L0, grad_t08L0) - L0_wv | grad_L0_wv = ggml_out_prod(t04L0, ggml_transpose(grad_t11L0)) - L0_wo | grad_L0_wo = ggml_out_prod(t19L0, grad_t20L0) - L0_ffn_norm | grad_L0_ffn_norm = ggml_repeat_back(grad_t23L0, L0_ffn_norm.shape) - L0_w1 | grad_L0_w1 = ggml_out_prod(t24L0, grad_t26L0) - L0_w2 | grad_L0_w2 = ggml_out_prod(t28L0, grad_t29L0) - L0_w3 | grad_L0_w3 = ggml_out_prod(t24L0, grad_t25L0) - L1_att_norm | grad_L1_att_norm = ggml_repeat_back(grad_t03L1, L1_att_norm.shape) - L1_wq | grad_L1_wq = ggml_out_prod(t04L1, grad_t05L1) - L1_wk | grad_L1_wk = ggml_out_prod(t04L1, grad_t08L1) - L1_wv | grad_L1_wv = ggml_out_prod(t04L1, ggml_transpose(grad_t11L1)) - L1_wo | grad_L1_wo = ggml_out_prod(t19L1, grad_t20L1) - L1_ffn_norm | grad_L1_ffn_norm = ggml_repeat_back(grad_t23L1, L1_ffn_norm.shape) - L1_w1 | grad_L1_w1 = ggml_out_prod(t24L1, grad_t26L1) - L1_w2 | grad_L1_w2 = ggml_out_prod(t28L1, grad_t29L1) - L1_w3 | grad_L1_w3 = ggml_out_prod(t24L1, grad_t25L1) - norm | grad_norm = ggml_repeat_back(grad_t32, norm.shape) - output | grad_output = ggml_out_prod(t33, grad_t34) - | - t01 = ggml_get_rows(tok_embeddings, t00) | grad_t01 = grad_t21L0 + ggml_rms_norm_back(t01, grad_t02L0) - for layer: | - t02L0*= ggml_rms_norm (t01) | grad_t02L0 = ggml_mul(grad_t04L0, t03L0) - t03L0 = ggml_repeat (L0_att_norm, t02L0_shape) | grad_t03L0 = ggml_mul(grad_t04L0, t02L0) - t04L0*= ggml_mul (t02L0, t03L0) | grad_t04L0 = ggml_out_prod(L0_wv, grad_t11L0) + 
ggml_out_prod(L0_wk, ggml_transpose(grad_t08L0)) + ggml_out_prod(L0_wq, ggml_transpose(grad_t05L0)) - t05L0 = ggml_mul_mat (L0_wq, t04L0) | grad_t05L0 = ggml_reshape(grad_t06L0, t05L0_shape) - t06L0 = ggml_reshape_4d (t05L0, n_embd/n_head, n_head, N, n_batch) | grad_t06L0 = ggml_rope_back(grad_t07L0) - t07L0 = ggml_rope_inplace (t06L0) | grad_t07L0 = ggml_permute_back(grad_t13L0, 0, 2, 1, 3) = ggml_permute(grad_t13L0, 0, 2, 1, 3) - t08L0 = ggml_mul_mat (L0_wk, t04L0) | grad_t08L0 = ggml_reshape(grad_t09L0, t08L0_shape) - t09L0 = ggml_reshape_4d (t08L0, n_embd/n_head, n_head, N, n_batch) | grad_t09L0 = ggml_rope_back(grad_t10L0) - t10L0 = ggml_rope_inplace (t09L0) | grad_t10L0 = ggml_permute_back(grad_t14L0, 0, 2, 1, 3) = ggml_permute(grad_t14L0, 0, 2, 1, 3) - t11L0 = ggml_mul_mat (t04L0, L0_wv) | grad_t11L0 = ggml_reshape(grad_t12L0, t11L0_shape) - t12L0 = ggml_reshape_4d (t11L0, N, n_batch, n_embd/n_head, n_head) | grad_t12L0 = ggml_permute_back(grad_t15L0, 0, 3, 1, 2) = ggml_permute(grad_t15L0, 0, 2, 3, 1) - t13L0*= ggml_permute (t07L0, 0, 2, 1, 3) | grad_t13L0 = view__q(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t14L0*= ggml_permute (t10L0, 0, 2, 1, 3) | grad_t14L0 = view__k(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t15L0*= ggml_permute (t12L0, 0, 3, 1, 2) | grad_t15L0 = view__v(ggml_flash_attn_back(t13L0, t14L0, t15L0, grad_t16L0)) - t16L0 = ggml_flash_attn (t13L0, t14L0, t15L0) | grad_t16L0 = ggml_permute_back(grad_t17L0, 0, 2, 1, 3) = ggml_permute(grad_t17L0, 0, 2, 1, 3) - t17L0 = ggml_permute (t16L0, 0, 2, 1, 3) | grad_t17L0 = grad_t18L0 - t18L0 = ggml_cont (t17L0) | grad_t18L0 = ggml_reshape(grad_t19L0, t18L0_shape) - t19L0*= ggml_reshape_2d (t18L0, n_embd, N*n_batch) | grad_t19L0 = ggml_out_prod(L0_wo, ggml_transpose(grad_t20L0)) - t20L0 = ggml_mul_mat (L0_wo, t19L0) | grad_t20L0 = grad_t21L0 - t21L0*= ggml_add (t20L0, t01) | grad_t21L0 = grad_t30L0 + ggml_rms_norm_back(t21L0, grad_t22L0) - t22L0*= ggml_rms_norm (t21L0) | grad_t22L0 = ggml_mul(grad_t24L0, t23L0) - t23L0 = ggml_repeat (L0_ffn_norm, t22L0_shape) | grad_t23L0 = ggml_mul(grad_t24L0, t22L0) - t24L0*= ggml_mul (t23L0, t22L0) | grad_t24L0 = ggml_out_prod(L0_w1, ggml_transpose(grad_t26L0)) + ggml_out_prod(L0_w3, ggml_transpose(grad_t25L0)) - t25L0*= ggml_mul_mat (L0_w3, t24L0) | grad_t25L0 = ggml_mul(grad_t28L0, t27L0) - t26L0*= ggml_mul_mat (L0_w1, t24L0) | grad_t26L0 = ggml_silu_back(t26L0, grad_t27L0) - t27L0*= ggml_silu (t26L0) | grad_t27L0 = ggml_mul(grad_t28L0, t25L0) - t28L0*= ggml_mul (t27L0, t25L0) | grad_t28L0 = ggml_out_prod(L0_w2, ggml_transpose(grad_t29L0)) - t29L0 = ggml_mul_mat (L0_w2, t28L0) | grad_t29L0 = grad_t30L0 - t30L0*= ggml_add (t21L0, t29L0) | grad_t30L0 = ggml_rms_norm_back(t30L0, grad_t02L1) + grad_t21L1 - ^ - t02L1*= ggml_rms_norm (t30L0) | grad_t02L1 = ggml_mul(grad_t04L1, t03L1) - t03L1 = ggml_repeat (L1_att_norm, t02L1_shape) | grad_t03L1 = ggml_mul(grad_t04L1, t02L1) - t04L1*= ggml_mul (t02L1, t03L1) | grad_t04L1 = ggml_out_prod(L1_wv, grad_t11L1) + ggml_out_prod(L1_wk, ggml_transpose(grad_t08L1)) + ggml_out_prod(L1_wq, ggml_transpose(grad_t05L1)) - t05L1 = ggml_mul_mat (L1_wq, t04L1) | grad_t05L1 = ggml_reshape(grad_t06L1, t05L1_shape) - t06L1 = ggml_reshape_4d (t05L1, n_embd/n_head, n_head, N, n_batch) | grad_t06L1 = ggml_rope_back(grad_t07L1) - t07L1 = ggml_rope_inplace (t06L1) | grad_t07L1 = ggml_permute_back(grad_t13L1, 0, 2, 1, 3) = ggml_permute(grad_t13L1, 0, 2, 1, 3) - t08L1 = ggml_mul_mat (L1_wk, t04L1) | grad_t08L1 = ggml_reshape(grad_t09L1, 
t08L1_shape) - t09L1 = ggml_reshape_4d (t08L1, n_embd/n_head, n_head, N, n_batch) | grad_t09L1 = ggml_rope_back(grad_t10L1) - t10L1 = ggml_rope_inplace (t09L1) | grad_t10L1 = ggml_permute_back(grad_t14L1, 0, 2, 1, 3) = ggml_permute(grad_t14L1, 0, 2, 1, 3) - t11L1 = ggml_mul_mat (t04L1, L1_wv) | grad_t11L1 = ggml_reshape(grad_t12L1, t11L1_shape) - t12L1 = ggml_reshape_4d (t11L1, N, n_batch, n_embd/n_head, n_head) | grad_t12L1 = ggml_permute_back(grad_t15L1, 0, 3, 1, 2) = ggml_permute(grad_t15L1, 0, 2, 3, 1) - t13L1*= ggml_permute (t07L1, 0, 2, 1, 3) | grad_t13L1 = view__q(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t14L1*= ggml_permute (t10L1, 0, 2, 1, 3) | grad_t14L1 = view__k(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t15L1*= ggml_permute (t12L1, 0, 3, 1, 2) | grad_t15L1 = view__v(ggml_flash_attn_back(t13L1, t14L1, t15L1, grad_t16L1)) - t16L1 = ggml_flash_attn (t13L1, t14L1, t15L1) | grad_t16L1 = ggml_permute_back(grad_t17L1, 0, 2, 1, 3) = ggml_permute(grad_t17L1, 0, 2, 1, 3) - t17L1 = ggml_permute (t16L1, 0, 2, 1, 3) | grad_t17L1 = grad_t18L1 - t18L1 = ggml_cont (t17L1) | grad_t18L1 = ggml_reshape(grad_t19L1, t18L1_shape) - t19L1*= ggml_reshape_2d (t18L1, n_embd, N*n_batch) | grad_t19L1 = ggml_out_prod(L1_wo, ggml_transpose(grad_t20L1)) - t20L1 = ggml_mul_mat (L1_wo, t19L1) | grad_t20L1 = grad_t21L1 - t21L1*= ggml_add (t20L1, t30L0) | grad_t21L1 = grad_t30L1 + ggml_rms_norm_back(t21L1, grad_t22L1) - t22L1*= ggml_rms_norm (t21L1) | grad_t22L1 = ggml_mul(grad_t24L1, t23L1) - t23L1 = ggml_repeat (L1_ffn_norm, t22L1_shape) | grad_t23L1 = ggml_mul(grad_t24L1, t22L1) - t24L1*= ggml_mul (t23L1, t22L1) | grad_t24L1 = ggml_out_prod(L1_w1, ggml_transpose(grad_t26L1)) + ggml_out_prod(L1_w3, ggml_transpose(grad_t25L1)) - t25L1*= ggml_mul_mat (L1_w3, t24L1) | grad_t25L1 = ggml_mul(grad_t28L1, t27L1) - t26L1*= ggml_mul_mat (L1_w1, t24L1) | grad_t26L1 = ggml_silu_back(t26L1, grad_t27L1) - t27L1*= ggml_silu (t26L1) | grad_t27L1 = ggml_mul(grad_t28L1, t25L1) - t28L1*= ggml_mul (t27L1, t25L1) | grad_t28L1 = ggml_out_prod(L1_w2, ggml_transpose(grad_t29L1)) - t29L1 = ggml_mul_mat (L1_w2, t28L1) | grad_t29L1 = grad_t30L1 - t30L1*= ggml_add (t21L1, t29L1) | grad_t30L1 = ggml_rms_norm_back(t30L1, grad_t31) - ^ - t31 = ggml_rms_norm (t30L1) | grad_t31 = ggml_mul(grad_t33, t32) - t32 = ggml_repeat (norm, t31.shape) | grad_t32 = ggml_mul(grad_t33, t31) - t33 = ggml_mul (t32, t31) | grad_t33 = ggml_out_prod(output, ggml_transpose(grad_t34)) - t34 = ggml_mul_mat (output, t33) | grad_t34 = ggml_reshape(grad_t35, t34.shape) - t35 = ggml_reshape_3d (t34, n_vocab, N, n_batch) | grad_t35 = ggml_cross_entropy_loss_back(t35, targets, grad_t36) - t36 = ggml_cross_entropy_loss(t35, targets) | grad_t36 = 1 (optimizer) - tensors marked with * need to be stored until grad computation - tensors during grad computation are all temporary - */ - } - - *gb = *gf; - - // t36->grad gets set to one by optimizer, so we need the tensor. - // initialize it with 1.0f to make sure. 
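// The gradient table above follows directly from ggml's mul_mat convention. For
// t05 = ggml_mul_mat(wq, t04), the result is t05[j,c] = sum_i wq[i,j] * t04[i,c], so
//
//     dL/dwq[i,j] = sum_c t04[i,c] * grad_t05[j,c]
//
// which is exactly the outer product ggml_out_prod(t04, grad_t05). Every weight gradient in
// the table (wq, wk, wv, wo, w1, w2, w3, output) is this same pattern: an out_prod of a stored
// forward activation with the incoming gradient of the corresponding matmul, transposed where
// the operand order differs (as for wv).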
- use_buf(-1); - t36->grad = expand(gb, ggml_new_f32(ctx0, 1.0f)); - - use_buf(0); - t35->grad = expand(gb, ggml_cross_entropy_loss_back(ctx0, t35, targets, t36->grad)); assert_shape_3d(t35->grad, n_vocab, N, n_batch); - t34->grad = expand(gb, ggml_reshape_2d (ctx0, t35->grad, n_vocab, N*n_batch)); assert_shape_2d(t34->grad, n_vocab, N*n_batch); - t33->grad = expand(gb, ggml_out_prod (ctx0, model->output, ggml_transpose(ctx0, t34->grad))); assert_shape_2d(t33->grad, n_embd, N*n_batch); - t32->grad = expand(gb, ggml_mul (ctx0, t33->grad, t31)); assert_shape_2d(t32->grad, n_embd, N*n_batch); - - use_buf(-1); - - model->norm->grad = expand(gb, add_or_set(model->norm->grad, ggml_repeat_back(ctx0, t32->grad, model->norm))); assert_shape_1d(model->norm->grad, n_embd); - model->output->grad = expand(gb, add_or_set(model->output->grad, ggml_out_prod(ctx0, t33, t34->grad))); assert_shape_2d(model->output->grad, n_embd, n_vocab); - - clr_buf(1); - use_buf(1); - t31->grad = expand(gb, ggml_mul(ctx0, t33->grad, t32)); assert_shape_2d(t31->grad, n_embd, N*n_batch); - - struct ggml_tensor * back_layer_inp = t31; - struct ggml_tensor * grad_layer_inp = NULL; - - for (int k = 0; k < n_layer; ++k) { - int il = n_layer-1-k; - struct my_llama_layer & layer = model->layers[il]; - - struct ggml_tensor * t02 = t02L[il]; - struct ggml_tensor * t03 = t03L[il]; - struct ggml_tensor * t04 = t04L[il]; - struct ggml_tensor * t05 = t05L[il]; - struct ggml_tensor * t06 = t06L[il]; - struct ggml_tensor * t07 = t07L[il]; - struct ggml_tensor * t08 = t08L[il]; - struct ggml_tensor * t09 = t09L[il]; - struct ggml_tensor * t10 = t10L[il]; - struct ggml_tensor * t11 = t11L[il]; - struct ggml_tensor * t12 = t12L[il]; - struct ggml_tensor * t13 = t13L[il]; - struct ggml_tensor * t14 = t14L[il]; - struct ggml_tensor * t15 = t15L[il]; - struct ggml_tensor * t16 = t16L[il]; - struct ggml_tensor * t17 = t17L[il]; - struct ggml_tensor * t18 = t18L[il]; - struct ggml_tensor * t19 = t19L[il]; - struct ggml_tensor * t20 = t20L[il]; - struct ggml_tensor * t21 = t21L[il]; - struct ggml_tensor * t22 = t22L[il]; - struct ggml_tensor * t23 = t23L[il]; - struct ggml_tensor * t24 = t24L[il]; - struct ggml_tensor * t25 = t25L[il]; - struct ggml_tensor * t26 = t26L[il]; - struct ggml_tensor * t27 = t27L[il]; - struct ggml_tensor * t28 = t28L[il]; - struct ggml_tensor * t29 = t29L[il]; - struct ggml_tensor * t30 = t30L[il]; - - clr_buf(0); - use_buf(0); - t30->grad = expand(gb, ggml_rms_norm_back(ctx0, t30, back_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); - if (grad_layer_inp) { - t30->grad = expand(gb, ggml_add(ctx0, t30->grad, grad_layer_inp->grad)); assert_shape_2d(t30->grad, n_embd, N*n_batch); + struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, f_norm_rms_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); + struct ggml_tensor * t03 = ggml_repeat (ctx, layer.attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); + struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch); + struct ggml_tensor * t05 = ggml_mul_mat (ctx, layer.wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch); + struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd/n_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t08 = 
ggml_mul_mat (ctx, layer.wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd, N*n_batch); + struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd/n_head, n_head, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t11 = ggml_mul_mat (ctx, t04, layer.wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd); + struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd/n_head, n_head); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd/n_head, n_head); + struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd/n_head, N, n_head, n_batch); + struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch); + struct ggml_tensor * t16; + if (enable_flash_attn) { + t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); + } else { + struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); + struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); + struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch); + struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch); + t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch); } - clr_buf(1); - t29->grad = t30->grad; assert_shape_2d(t29->grad, n_embd, N*n_batch); - t28->grad = expand(gb, ggml_out_prod(ctx0, layer.w2, ggml_transpose(ctx0, t29->grad))); assert_shape_2d(t28->grad, n_ff, N*n_batch); - t27->grad = expand(gb, ggml_mul(ctx0, t28->grad, t25)); assert_shape_2d(t27->grad, n_ff, N*n_batch); - t26->grad = expand(gb, ggml_silu_back(ctx0, t26, t27->grad)); assert_shape_2d(t26->grad, n_ff, N*n_batch); - t25->grad = expand(gb, ggml_mul(ctx0, t28->grad, t27)); assert_shape_2d(t25->grad, n_ff, N*n_batch); - t24->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.w1, ggml_transpose(ctx0, t26->grad)), - ggml_out_prod(ctx0, layer.w3, ggml_transpose(ctx0, t25->grad)))); assert_shape_2d(t24->grad, n_embd, N*n_batch); - t23->grad = expand(gb, ggml_mul(ctx0, t24->grad, t22)); assert_shape_2d(t23->grad, n_embd, N*n_batch); - t22->grad = expand(gb, ggml_mul(ctx0, t24->grad, ggml_repeat(ctx0, layer.ffn_norm, t24->grad))); assert_shape_2d(t22->grad, n_embd, N*n_batch); - use_buf(1); - t21->grad = expand(gb, ggml_add(ctx0, t30->grad, ggml_rms_norm_back(ctx0, t21, t22->grad))); assert_shape_2d(t21->grad, n_embd, N*n_batch); - grad_layer_inp = t21; - use_buf(0); - t20->grad = t21->grad; assert_shape_2d(t20->grad, n_embd, N*n_batch); - t19->grad = expand(gb, ggml_out_prod(ctx0, layer.wo, ggml_transpose(ctx0, t20->grad))); assert_shape_2d(t19->grad, n_embd, N*n_batch); - t18->grad = expand(gb, ggml_reshape_4d(ctx0, t19->grad, n_embd/n_head, n_head, N, n_batch)); assert_shape_4d(t18->grad, 
n_embd/n_head, n_head, N, n_batch); - t17->grad = t18->grad; assert_shape_4d(t17->grad, n_embd/n_head, n_head, N, n_batch); - t16->grad = expand(gb, ggml_permute(ctx0, t17->grad, 0, 2, 1, 3)); assert_shape_4d(t16->grad, n_embd/n_head, N, n_head, n_batch); - struct ggml_tensor * flash_attn = expand(gb, ggml_flash_attn_back(ctx0, t13, t14, t15, t16->grad, true)); assert_shape_4d(flash_attn, n_embd/n_head, N*3, n_head, n_batch); - t15->grad = expand(gb, view__v(flash_attn)); assert_shape_4d(t15->grad, N, n_embd/n_head, n_head, n_batch); - t14->grad = expand(gb, view__k(flash_attn)); assert_shape_4d(t14->grad, n_embd/n_head, N, n_head, n_batch); - t13->grad = expand(gb, view__q(flash_attn)); assert_shape_4d(t13->grad, n_embd/n_head, N, n_head, n_batch); - t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head); - t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd); - t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch); - t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx, 10000.0f, 1.0f, 0.0f, false)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch); - t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch); - t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch); - t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx, 10000.0f, 1.0f, 0.0f, false)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch); - t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch); - t04->grad = expand(gb, ggml_add_inplace(ctx0, - ggml_add_inplace(ctx0, - ggml_out_prod(ctx0, layer.wv, t11->grad), - ggml_out_prod(ctx0, layer.wk, ggml_transpose(ctx0, t08->grad))), - ggml_out_prod(ctx0, layer.wq, ggml_transpose(ctx0, t05->grad)))); assert_shape_2d(t04->grad, n_embd, N*n_batch); - t03->grad = expand(gb, ggml_mul(ctx0, t04->grad, t02)); assert_shape_2d(t04->grad, n_embd, N*n_batch); - use_buf(1); - t02->grad = expand(gb, ggml_mul(ctx0, t04->grad, ggml_repeat(ctx0, layer.attention_norm, t02))); assert_shape_2d(t02->grad, n_embd, N*n_batch); - back_layer_inp = t02; - // use_buf(0); - - use_buf(-1); - layer.attention_norm->grad = expand(gb, add_or_set(layer.attention_norm->grad, ggml_repeat_back(ctx0, t03->grad, layer.attention_norm))); assert_shape_1d(layer.attention_norm->grad, n_embd); - layer.wq->grad = expand(gb, add_or_set(layer.wq->grad, ggml_out_prod(ctx0, t04, t05->grad))); assert_shape_2d(layer.wq->grad, n_embd, n_embd); - layer.wk->grad = expand(gb, add_or_set(layer.wk->grad, ggml_out_prod(ctx0, t04, t08->grad))); assert_shape_2d(layer.wk->grad, n_embd, n_embd); - layer.wv->grad = expand(gb, add_or_set(layer.wv->grad, ggml_out_prod(ctx0, t04, ggml_transpose(ctx0, t11->grad)))); assert_shape_2d(layer.wv->grad, n_embd, n_embd); - layer.wo->grad = expand(gb, add_or_set(layer.wo->grad, ggml_out_prod(ctx0, t19, t20->grad))); assert_shape_2d(layer.wo->grad, n_embd, n_embd); - layer.ffn_norm->grad = expand(gb, add_or_set(layer.ffn_norm->grad, ggml_repeat_back(ctx0, t23->grad, layer.ffn_norm))); assert_shape_1d(layer.ffn_norm->grad, n_embd); - layer.w1->grad = expand(gb, 
add_or_set(layer.w1->grad, ggml_out_prod(ctx0, t24, t26->grad))); assert_shape_2d(layer.w1->grad, n_embd, n_ff); - layer.w2->grad = expand(gb, add_or_set(layer.w2->grad, ggml_out_prod(ctx0, t28, t29->grad))); assert_shape_2d(layer.w2->grad, n_ff, n_embd); - layer.w3->grad = expand(gb, add_or_set(layer.w3->grad, ggml_out_prod(ctx0, t24, t25->grad))); assert_shape_2d(layer.w3->grad, n_embd, n_ff); - // use_buf(0); + struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd/n_head, n_head, N, n_batch); + struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); + struct ggml_tensor * t20 = ggml_mul_mat (ctx, layer.wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); + struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); + struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, f_norm_rms_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); + struct ggml_tensor * t23 = ggml_repeat (ctx, layer.ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); + struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, layer.w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, layer.w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); + struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, layer.w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); + cur = t30; + checkpoints.push_back(cur); + } + struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, f_norm_rms_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); + struct ggml_tensor * t32 = ggml_repeat (ctx, model->norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); + struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); + struct ggml_tensor * t34 = ggml_mul_mat (ctx, model->output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); + struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); + struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); + + checkpoints.push_back(t31); + checkpoints.push_back(t32); + checkpoints.push_back(t33); + checkpoints.push_back(t34); + checkpoints.push_back(t35); + checkpoints.push_back(t36); + + ggml_build_forward_expand(gf, t36); + + if (enable_checkpointing) { + ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); + } else { + *gb = *gf; + ggml_build_backward_expand(ctx, gf, gb, true); + } + + if (alloc) { + // make sure some tensors are not reallocated by inserting new temporary nodes 
depending on them + int n_leafs_before = gb->n_leafs; + int n_nodes_before = gb->n_nodes; + struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f); + // output tensors + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one)); + // input gradient + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one)); + GGML_ASSERT(t36->grad->data == NULL && !ggml_is_view(t36->grad)); + ggml_allocr_alloc(alloc, t36->grad); + // gradient tensors (will be set to zero by ggml_graph_reset) + // pinning these produces large unnecessary memory overhead, which will be resolved by PR 2632 + for (int i = 0; i < gf->n_nodes; ++i) { + if (!gf->grads[i]) continue; + if (gf->grads[i]->data == NULL && !ggml_is_view(gf->grads[i])) { + ggml_allocr_alloc(alloc, gf->grads[i]); + } + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, gf->grads[i], one)); + } + // allocating checkpoints in one block to reduce memory fragmentation + // note: they will be freed in reverse order + for (int i = 0; i < (int) checkpoints.size(); ++i) { + if (checkpoints[i]->data == NULL && !ggml_is_view(checkpoints[i])) { + ggml_allocr_alloc(alloc, checkpoints[i]); + } + } + + //int n_leafs_after = gb->n_leafs; + //int n_nodes_after = gb->n_nodes; + + ggml_allocr_alloc_graph(alloc, gb); + + // remove the additional nodes and leafs + for (int i = n_leafs_before; i < gb->n_leafs; ++i) { + gb->leafs[i] = NULL; + } + for (int i = n_nodes_before; i < gb->n_nodes; ++i) { + gb->nodes[i] = NULL; + } + gb->n_leafs = n_leafs_before; + gb->n_nodes = n_nodes_before; } - clr_buf(0); - use_buf(0); - t01->grad = expand(gb, ggml_add_inplace(ctx0, grad_layer_inp->grad, ggml_rms_norm_back(ctx0, t01, back_layer_inp->grad))); assert_shape_2d(t01->grad, n_embd, N*n_batch); - use_buf(-1); - model->tok_embeddings->grad = expand(gb, ggml_get_rows_back(ctx0, t01->grad, t00, model->tok_embeddings)); assert_shape_2d(model->tok_embeddings->grad, n_embd, n_vocab); - // clr_buf(1); - // clr_buf(0); *logits = t35; - - if (track_max_mem) { - printf("%s: max size compute buf0: %zu\n", __func__, buf_maxs[0]); - printf("%s: max size compute buf1: %zu\n", __func__, buf_maxs[1]); - } - - // now that all grads are created, set the graph leafs and grads - graph_set_leafs_grads(gf); - graph_set_leafs_grads(gb); - return t36; } @@ -1962,42 +874,6 @@ void print_matrix(struct ggml_tensor * probs) { } } - -void print_token(struct llama_context * ctx, llama_token token) { - printf("%s", llama_token_to_str(ctx, token).c_str()); -} - -void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { - for (int i=0; ine[0]; ++i) { - int token = ggml_get_i32_1d(tokens, i); - print_token(ctx, token); - } -} - -void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) { - for (int i1=0; i1ne[1]; ++i1) { - //int num_newline = 0; - for (int i0=0; i0ne[0]; ++i0) { - int token = get_i32_2d(tokens, i0, i1); - print_token(ctx, token); - // bool isnl = (token == llama_token_nl()); - // if (isnl) { - // ++num_newline; - // } - // if (isnl) { - // if (num_newline < 2) { - // print_token(ctx, token); - // } else { - // printf("\\n"); - // } - // } else { - // print_token(ctx, token); - // } - } - printf("\n--\n"); - } -} - void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * 
target_probs) { int n_tokens = tokens_input->ne[0]; int n_vocab = target_logits->ne[0]; @@ -2033,51 +909,27 @@ void get_example_targets_batch(struct llama_context * lctx, const int * train_sa ggml_set_f32(target_logits, -1.0f/n_vocab); ggml_set_f32(target_probs, 0.0f); + // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); for (int k=0; kne[0]; - int n_vocab = target_logits->ne[0]; - for (int i=0; i chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - throw std::runtime_error(format("write error: %s", strerror(errno))); - } - } - - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - int tokenize_file(struct llama_context * lctx, const char * filename, std::vector& out) { - struct llama_file f(filename, "rb"); + FILE * fp = std::fopen(filename, "rb"); + if (fp == NULL) { + return 0; + } + +#ifdef _WIN32 + GGML_ASSERT(_fseeki64(fp, (__int64) 0, SEEK_END) == 0); +#else + GGML_ASSERT(std::fseek(fp, (long) 0, SEEK_END) == 0); +#endif + + size_t size = 0; +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); + size = ret; +#else + long ret = std::ftell(fp); + size = ret; +#endif + +#ifdef _WIN32 + GGML_ASSERT(_fseeki64(fp, (__int64) 0, SEEK_SET) == 0); +#else + GGML_ASSERT(std::fseek(fp, (long) 0, SEEK_SET) == 0); +#endif std::vector buf; - buf.resize(f.size+1); + buf.resize(size+1); + out.resize(size+1); - f.read_raw(buf.data(), f.size); - buf[f.size] = '\0'; + if (std::fread(buf.data(), size, 1, fp) != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + + buf[size] = '\0'; int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); if (n_tokens < 0) { out.resize(-n_tokens); - llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); + n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false); } + GGML_ASSERT(n_tokens >= 0); + out.resize(n_tokens); bool verify = false; if (verify) { const char * in = buf.data(); const char * end = buf.data() + buf.size(); for (int i = 0; i < (int) out.size(); ++i) { - std::string s = llama_token_to_str(lctx, out[i]); + std::string s = llama_token_to_piece(lctx, out[i]); int len = s.length(); if (in >= end) { printf("%s: unexpected end of original text.\n", __func__); @@ -2238,438 +1040,466 @@ void shuffle_ints(int * begin, int * end) { }); } -struct my_llama_sampler_params { - float temp = 0.0f; // <= 0.0 disabled - int top_k = 20; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - int repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float repeat_penalty = 1.0f; // 1.0 = disabled - float alpha_presence = 0.0f; // 0.0 = disabled - float alpha_frequency = 0.0f; // 0.0 = disabled - int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token -}; - -struct my_llama_sampler { - struct llama_context * ctx = NULL; - my_llama_sampler_params params; - - int 
n_vocab = 0; - int n_ctx = 0; - - float mirostat_mu; - - std::vector candidates; - llama_token_data_array candidates_p; - -}; - -void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx) { - sampler->ctx = ctx; - sampler->n_vocab = llama_n_vocab(sampler->ctx); - sampler->n_ctx = llama_n_ctx(sampler->ctx); - sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau; +#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ +{ \ + const std::string skey(key); \ + const int kid = gguf_find_key(ctx, skey.c_str()); \ + if (kid >= 0) { \ + enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ + if (ktype != (type)) { \ + throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \ + } \ + (dst) = func(ctx, kid); \ + } else if (req) { \ + throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \ + } \ } -llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) { - GGML_ASSERT(sampler->ctx != NULL); - struct llama_context * ctx = sampler->ctx; +bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) { + GGML_ASSERT(a != NULL); + GGML_ASSERT(b != NULL); + GGML_ASSERT(a->type == b->type); + GGML_ASSERT(ggml_are_same_shape(a, b)); + GGML_ASSERT(ggml_is_contiguous(a) && ggml_is_contiguous(b)); - sampler->candidates.resize(sampler->n_vocab); - for (llama_token token_id = 0; token_id < sampler->n_vocab; ++token_id) { - sampler->candidates[token_id].id = token_id; - sampler->candidates[token_id].logit = logits[token_id]; - sampler->candidates[token_id].p = 0.0; + return true; +} + +void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) { + if (dst == NULL) { + return; } + struct ggml_tensor * t = ggml_get_tensor(ctx, name); + GGML_ASSERT(are_same_layout(dst, t)); + memcpy(dst->data, t->data, ggml_nbytes(t)); - llama_token_data_array * candidates_p = & sampler->candidates_p; - - candidates_p->data = sampler->candidates.data(); - candidates_p->size = sampler->candidates.size(); - candidates_p->sorted = false; - - const auto params = sampler->params; - - // Apply penalties - const float nl_logit = logits[llama_token_nl(ctx)]; - - const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx); - - llama_sample_repetition_penalty( - ctx, - candidates_p, - last_tokens + n_last_tokens - n_last, - n_last, - params.repeat_penalty); - llama_sample_frequency_and_presence_penalties( - ctx, - candidates_p, - last_tokens + n_last_tokens - n_last, - n_last, - params.alpha_frequency, - params.alpha_presence); - - if (!params.penalize_nl) { - logits[llama_token_nl(ctx)] = nl_logit; + if (strlen(ggml_get_name(dst)) == 0) { + ggml_set_name(dst, name); } +} - llama_token token = 0; - if (params.temp <= 0) { - // Greedy sampling - token = llama_sample_token_greedy(ctx, candidates_p); +void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) { + // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read + + uint32_t file_version; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_FILE_VERSION); + GGML_ASSERT(file_version == 0); + + GGUF_GET_KEY(fctx, opt->params.past, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT); + GGUF_GET_KEY(fctx, opt->iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, 
LLM_KV_OPTIMIZER_ITERATION_COUNT); + GGUF_GET_KEY(fctx, opt->just_initialized, gguf_get_val_bool, GGUF_TYPE_BOOL, true, LLM_KV_OPTIMIZER_JUST_INITIALIZED); + + uint64_t nx; + GGUF_GET_KEY(fctx, nx, gguf_get_val_u64, GGUF_TYPE_UINT64, true, LLM_KV_OPTIMIZER_PARAMETER_COUNT); + opt->nx = (size_t) nx; + + // don't call ggml_opt_init until optimizer type and optimizer specific parameters are know + + std::string opt_type; + GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); + if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { + opt->params.type = GGML_OPT_ADAM; + + GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); + GGUF_GET_KEY(fctx, opt->adam.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT); + + GGML_ASSERT(opt->ctx != NULL); + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + read_tensor_by_name(opt->adam.m, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + read_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + read_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { + opt->params.type = GGML_OPT_LBFGS; + + GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); + GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); + GGUF_GET_KEY(fctx, opt->lbfgs.step, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP); + GGUF_GET_KEY(fctx, opt->lbfgs.j, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J); + GGUF_GET_KEY(fctx, opt->lbfgs.k, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K); + GGUF_GET_KEY(fctx, opt->lbfgs.end, gguf_get_val_i32, GGUF_TYPE_INT32, true, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END); + GGUF_GET_KEY(fctx, opt->lbfgs.n_no_improvement, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT); + + GGML_ASSERT(opt->ctx != NULL); + ggml_opt_init(opt->ctx, opt, opt->params, opt->nx); + + read_tensor_by_name(opt->lbfgs.x, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS); + read_tensor_by_name(opt->lbfgs.xp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS); + read_tensor_by_name(opt->lbfgs.g, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS); + read_tensor_by_name(opt->lbfgs.gp, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS); + read_tensor_by_name(opt->lbfgs.d, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION); + read_tensor_by_name(opt->lbfgs.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES); + read_tensor_by_name(opt->lbfgs.lmal, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA); + read_tensor_by_name(opt->lbfgs.lmys, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS); + read_tensor_by_name(opt->lbfgs.lms, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S); + read_tensor_by_name(opt->lbfgs.lmy, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y); } else { - if (params.mirostat == 1) { - int mirostat_m = 100; - llama_sample_temperature(ctx, candidates_p, params.temp); - token = llama_sample_token_mirostat(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, mirostat_m, 
&sampler->mirostat_mu); - } else if (params.mirostat == 2) { - llama_sample_temperature(ctx, candidates_p, params.temp); - token = llama_sample_token_mirostat_v2(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, &sampler->mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k (ctx, candidates_p, params.top_k, 1); - llama_sample_tail_free (ctx, candidates_p, params.tfs_z, 1); - llama_sample_typical (ctx, candidates_p, params.typical_p, 1); - - llama_sample_top_p (ctx, candidates_p, params.top_p, 1); - llama_sample_temperature (ctx, candidates_p, params.temp); - token = llama_sample_token(ctx, candidates_p); - } - } - return token; -} - -void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, float value) { - GGML_ASSERT(logits->ne[0] == (int64_t) mask.size()); - for (int i2 = 0; i2 < logits->ne[2]; ++i2) { - for (int i1 = 0; i1 < logits->ne[1]; ++i1) { - for (int i0 = 0; i0 < logits->ne[0]; ++i0) { - if (!mask[i0]) continue; - float * ptr = (float *) ((char *) logits->data + i2*logits->nb[2] + i1*logits->nb[1] + i0*logits->nb[0]); - *ptr = value; - } - } + throw std::runtime_error("unknown optimizer type\n"); } } -void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - if (tensor == NULL) { - file->write_u32(0); - file->write_u32(0); - file->write_u32(GGML_TYPE_F32); - file->seek((0-file->tell()) & 31, SEEK_CUR); - return; - } - const char * name = ggml_get_name(tensor); - uint32_t name_len = strlen(name); - uint32_t nd = tensor->n_dims; - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - (uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - file->write_u32(nd); - file->write_u32(name_len); - file->write_u32(tensor->type); - file->write_raw(ne, sizeof(ne[0]) * nd); - file->write_raw(name, name_len); - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->write_raw(tensor->data, ggml_nbytes(tensor)); -} - -void read_tensor(struct llama_file * file, struct ggml_tensor * tensor) { - int32_t nd = file->read_u32(); - GGML_ASSERT(nd == tensor->n_dims); - - uint32_t name_len = file->read_u32(); - enum ggml_type type = (enum ggml_type) file->read_u32(); - GGML_ASSERT(type == tensor->type); - - uint32_t ne[4]; - file->read_raw(ne, sizeof(ne[0]) * nd); - for (int i=0; ine[i]); - } - - std::string name = file->read_string(name_len); - GGML_ASSERT(strncmp(ggml_get_name(tensor), name.c_str(), sizeof(tensor->name)-1) == 0); - - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->read_raw(tensor->data, ggml_nbytes(tensor)); -} - -void write_opt_context(struct llama_file * file, struct ggml_opt_context * opt) { - const uint32_t version = 0; - GGML_ASSERT(opt->nx >= 0); - GGML_ASSERT(opt->iter >= 0); - file->write_u32(version); - file->write_raw(&opt->params, sizeof(opt->params)); - file->write_raw(&opt->nx, sizeof(opt->nx)); - file->write_raw(&opt->iter, sizeof(opt->iter)); - file->write_u32((uint32_t) opt->just_initialized); - switch (opt->params.type) { - case GGML_OPT_ADAM: - { - GGML_ASSERT(opt->adam.x != NULL); - write_tensor(file, opt->adam.x); - write_tensor(file, opt->adam.g1); - write_tensor(file, opt->adam.g2); - write_tensor(file, opt->adam.m); - write_tensor(file, opt->adam.v); - write_tensor(file, opt->adam.mh); - write_tensor(file, opt->adam.vh); - write_tensor(file, opt->adam.pf); - file->write_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->write_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->write_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); - } break; 
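// The raw byte-dump serialization being removed here (write_opt_context/read_opt_context and
// the write_tensor/read_tensor helpers above) is replaced by keyed GGUF metadata plus named
// tensors, read back through the GGUF_GET_KEY macro defined earlier. A minimal usage sketch of
// that pattern; the key and tensor names below are placeholders, the real ones are the
// LLM_KV_OPTIMIZER_* / LLM_TENSOR_OPTIMIZER_* constants used by this patch:
uint32_t iter = 0;
GGUF_GET_KEY(fctx, iter, gguf_get_val_u32, GGUF_TYPE_UINT32, true, "optimizer.iteration_count");
gguf_set_val_u32(fctx, "optimizer.iteration_count", iter + 1);     // write a scalar back
ggml_set_name(opt->adam.m, "optimizer.adam.first_moments");        // tensors travel by name
gguf_add_tensor(fctx, opt->adam.m);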
- case GGML_OPT_LBFGS: - { - GGML_ASSERT(opt->adam.x != NULL); - write_tensor(file, opt->lbfgs.x); - write_tensor(file, opt->lbfgs.xp); - write_tensor(file, opt->lbfgs.g); - write_tensor(file, opt->lbfgs.gp); - write_tensor(file, opt->lbfgs.d); - write_tensor(file, opt->lbfgs.pf); - write_tensor(file, opt->lbfgs.lmal); - write_tensor(file, opt->lbfgs.lmys); - write_tensor(file, opt->lbfgs.lms); - write_tensor(file, opt->lbfgs.lmy); - file->write_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - file->write_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step)); - file->write_raw(&opt->lbfgs.j, sizeof(opt->lbfgs.j)); - file->write_raw(&opt->lbfgs.k, sizeof(opt->lbfgs.k)); - file->write_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end)); - file->write_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement)); - } break; - } -} - -void read_opt_context(struct llama_file * file, struct ggml_context * ctx, struct ggml_opt_context * opt) { - uint32_t version = file->read_u32(); - GGML_ASSERT(version == 0); - - file->read_raw(&opt->params, sizeof(opt->params)); - file->read_raw(&opt->nx, sizeof(opt->nx)); - ggml_opt_init(ctx, opt, opt->params, opt->nx); - - file->read_raw(&opt->iter, sizeof(opt->iter)); - opt->just_initialized = (bool) file->read_u32(); +void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) { + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past); + gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ITERATION_COUNT, opt->iter); + gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); switch (opt->params.type) { case GGML_OPT_ADAM: { - read_tensor(file, opt->adam.x); - read_tensor(file, opt->adam.g1); - read_tensor(file, opt->adam.g2); - read_tensor(file, opt->adam.m); - read_tensor(file, opt->adam.v); - read_tensor(file, opt->adam.mh); - read_tensor(file, opt->adam.vh); - if (opt->adam.pf) { read_tensor(file, opt->adam.pf); } - file->read_raw(&opt->adam.fx_best, sizeof(opt->adam.fx_best)); - file->read_raw(&opt->adam.fx_prev, sizeof(opt->adam.fx_prev)); - file->read_raw(&opt->adam.n_no_improvement, sizeof(opt->adam.n_no_improvement)); + gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); + gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, opt->adam.fx_prev); + gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, opt->adam.n_no_improvement); + + ggml_set_name(opt->adam.m, LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS); + ggml_set_name(opt->adam.v, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); + if (opt->adam.pf) { + ggml_set_name(opt->adam.pf, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); + } + + gguf_add_tensor(fctx, opt->adam.m); + gguf_add_tensor(fctx, opt->adam.v); + if (opt->adam.pf) { + gguf_add_tensor(fctx, opt->adam.pf); + } } break; case GGML_OPT_LBFGS: { - GGML_ASSERT(opt->adam.x != NULL); - read_tensor(file, opt->lbfgs.x); - read_tensor(file, opt->lbfgs.xp); - read_tensor(file, opt->lbfgs.g); - read_tensor(file, opt->lbfgs.gp); - read_tensor(file, opt->lbfgs.d); - if (opt->lbfgs.pf) { read_tensor(file, opt->lbfgs.pf); } - read_tensor(file, opt->lbfgs.lmal); - read_tensor(file, opt->lbfgs.lmys); - read_tensor(file, opt->lbfgs.lms); - read_tensor(file, opt->lbfgs.lmy); - file->read_raw(&opt->lbfgs.fx_best, sizeof(opt->lbfgs.fx_best)); - 
file->read_raw(&opt->lbfgs.step, sizeof(opt->lbfgs.step));
-            file->read_raw(&opt->lbfgs.j,   sizeof(opt->lbfgs.j));
-            file->read_raw(&opt->lbfgs.k,   sizeof(opt->lbfgs.k));
-            file->read_raw(&opt->lbfgs.end, sizeof(opt->lbfgs.end));
-            file->read_raw(&opt->lbfgs.n_no_improvement, sizeof(opt->lbfgs.n_no_improvement));
+            gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
+            gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);
+            gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, opt->lbfgs.fx_best);
+            gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, opt->lbfgs.step);
+            gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, opt->lbfgs.j);
+            gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, opt->lbfgs.k);
+            gguf_set_val_i32(fctx, LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, opt->lbfgs.end);
+            gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, opt->lbfgs.n_no_improvement);
+
+            ggml_set_name(opt->lbfgs.x,  LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS);
+            ggml_set_name(opt->lbfgs.xp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS);
+            ggml_set_name(opt->lbfgs.g,  LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS);
+            ggml_set_name(opt->lbfgs.gp, LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS);
+            ggml_set_name(opt->lbfgs.d,  LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION);
+            if (opt->lbfgs.pf) {
+                ggml_set_name(opt->lbfgs.pf, LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES);
+            }
+            ggml_set_name(opt->lbfgs.lmal, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA);
+            ggml_set_name(opt->lbfgs.lmys, LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS);
+            ggml_set_name(opt->lbfgs.lms,  LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S);
+            ggml_set_name(opt->lbfgs.lmy,  LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y);
+
+            gguf_add_tensor(fctx, opt->lbfgs.x);
+            gguf_add_tensor(fctx, opt->lbfgs.xp);
+            gguf_add_tensor(fctx, opt->lbfgs.g);
+            gguf_add_tensor(fctx, opt->lbfgs.gp);
+            gguf_add_tensor(fctx, opt->lbfgs.d);
+            if (opt->lbfgs.pf) {
+                gguf_add_tensor(fctx, opt->lbfgs.pf);
+            }
+            gguf_add_tensor(fctx, opt->lbfgs.lmal);
+            gguf_add_tensor(fctx, opt->lbfgs.lmys);
+            gguf_add_tensor(fctx, opt->lbfgs.lms);
+            gguf_add_tensor(fctx, opt->lbfgs.lmy);
         } break;
     }
 }
 
-void save_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename) {
-    struct llama_file file(filename, "wb");
-    if (file.fp == NULL) {
-        return;
+void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) {
+    // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data cannot be read
+    std::string arch;
+
+    std::vector<char> keybuf;
+    keybuf.resize(512);
+    auto kv = [&arch, &keybuf](const char * key) -> const char * {
+        snprintf(keybuf.data(), keybuf.size(), key, arch.c_str());
+        return keybuf.data();
+    };
+
+    std::vector<char> tn_buf;
+    tn_buf.resize(GGML_MAX_NAME);
+    auto tn = [&tn_buf](const char * key) -> const char * {
+        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key);
+        return tn_buf.data();
+    };
+    auto tni = [&tn_buf](const char * key, int bid) -> const char * {
+        snprintf(tn_buf.data(), tn_buf.size(), key, bid);
+        std::string s = tn_buf.data();
+        snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str());
+        return tn_buf.data();
+    };
+
+    GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE);
+    GGML_ASSERT(arch == "llama");
+
+    uint32_t ftype_u;
+    GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true,
LLM_KV_GENERAL_FILE_TYPE); + GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32); + + // n_ctx was not saved in earlier checkpoint file versions, so we make it optional here + GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); + + GGUF_GET_KEY(fctx, model->hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); + GGUF_GET_KEY(fctx, model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); + GGUF_GET_KEY(fctx, model->hparams.n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); + GGUF_GET_KEY(fctx, model->hparams.n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); + + model->hparams.n_rot = model->hparams.n_embd / model->hparams.n_head; + GGUF_GET_KEY(fctx, model->hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT)); + + float rope_freq_scale = 1.0f; + GGUF_GET_KEY(fctx, model->hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); + GGUF_GET_KEY(fctx, model->hparams.rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); + GGUF_GET_KEY(fctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); + if (rope_freq_scale != 1.0f) { + model->hparams.rope_freq_scale = 1.0f / rope_freq_scale; } - const uint32_t magic = 'ggcp'; - const uint32_t version = 0; + init_model(model); - file.write_u32(magic); - file.write_u32(version); - file.write_u32(model->train_its); - file.write_u32(model->train_samples); - file.write_u32(model->train_tokens); - file.write_u32(model->hparams.n_vocab); - file.write_u32(model->hparams.n_embd); - file.write_u32(model->hparams.n_mult); - file.write_u32(model->hparams.n_head); - file.write_u32(model->hparams.n_layer); - file.write_u32(model->hparams.n_rot); - - write_tensor(&file, model->tok_embeddings); - write_tensor(&file, model->norm); - write_tensor(&file, model->output); + read_tensor_by_name(model->tok_embeddings, f_ggml_ctx, tn(LLM_TENSOR_TOKEN_EMBD)); + read_tensor_by_name(model->norm, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT_NORM)); + read_tensor_by_name(model->output, f_ggml_ctx, tn(LLM_TENSOR_OUTPUT)); for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { auto & layer = model->layers[i]; - write_tensor(&file, layer.attention_norm); - write_tensor(&file, layer.wq); - write_tensor(&file, layer.wk); - write_tensor(&file, layer.wv); - write_tensor(&file, layer.wo); - write_tensor(&file, layer.ffn_norm); - write_tensor(&file, layer.w1); - write_tensor(&file, layer.w2); - write_tensor(&file, layer.w3); + read_tensor_by_name(layer.attention_norm, f_ggml_ctx, tni(LLM_TENSOR_ATTN_NORM, i)); + read_tensor_by_name(layer.wq, f_ggml_ctx, tni(LLM_TENSOR_ATTN_Q, i)); + read_tensor_by_name(layer.wk, f_ggml_ctx, tni(LLM_TENSOR_ATTN_K, i)); + read_tensor_by_name(layer.wv, f_ggml_ctx, tni(LLM_TENSOR_ATTN_V, i)); + read_tensor_by_name(layer.wo, f_ggml_ctx, tni(LLM_TENSOR_ATTN_OUT, i)); + read_tensor_by_name(layer.ffn_norm, f_ggml_ctx, tni(LLM_TENSOR_FFN_NORM, i)); + read_tensor_by_name(layer.w1, f_ggml_ctx, tni(LLM_TENSOR_FFN_GATE, i)); + read_tensor_by_name(layer.w2, f_ggml_ctx, tni(LLM_TENSOR_FFN_DOWN, i)); + read_tensor_by_name(layer.w3, f_ggml_ctx, tni(LLM_TENSOR_FFN_UP, i)); } - - write_opt_context(&file, opt); } -bool load_checkpoint(struct my_llama_model * model, struct ggml_opt_context * opt, const char * filename, bool init) 
{ - struct llama_file file(filename, "rb"); +void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) { + const char * arch = "llama"; + enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; - uint32_t magic; - uint32_t version; + std::vector keybuf; + keybuf.resize(512); + auto kv = [arch, &keybuf](const char * key) -> const char * { + snprintf(keybuf.data(), keybuf.size(), key, arch); + return keybuf.data(); + }; - uint32_t train_its = 0; - uint32_t train_samples = 0; - uint32_t train_tokens = 0; + // set arch + gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); + gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); - if (file.fp) { - printf("%s: Loading model from '%s'.\n", __func__, filename); - magic = file.read_u32(); - GGML_ASSERT(magic == 'ggcp'); - version = file.read_u32(); - GGML_ASSERT(version == 0); - train_its = file.read_u32(); - train_samples = file.read_u32(); - train_tokens = file.read_u32(); - model->hparams.n_vocab = file.read_u32(); - model->hparams.n_embd = file.read_u32(); - model->hparams.n_mult = file.read_u32(); - model->hparams.n_head = file.read_u32(); - model->hparams.n_layer = file.read_u32(); - model->hparams.n_rot = file.read_u32(); - print_params(&model->hparams); - } + // set hparams + gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx ); + gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd ); + gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff ); + gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head ); + gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer ); + gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_rot ); - if (init) { - init_model(model); - } + gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps ); + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), model->hparams.rope_freq_base ); // TODO load in llama.cpp + gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), 1.0f / model->hparams.rope_freq_scale ); - if (file.fp) { - model->train_its = train_its; - model->train_samples = train_samples; - model->train_tokens = train_tokens; - } + // set vocab by copying from vocab_model gguf file + { + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ NULL, + }; + struct gguf_context * vctx = gguf_init_from_file(fn_vocab_model, params); - printf("%s: Training iterations: %u.\n", __func__, model->train_its); - printf("%s: Training samples: %u.\n", __func__, model->train_samples); - printf("%s: Training tokens: %u.\n", __func__, model->train_tokens); + const int token_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_LIST)); + if (token_idx == -1) { + throw std::runtime_error("cannot find tokenizer vocab in model file\n"); + } + const uint32_t n_vocab = gguf_get_arr_n(vctx, token_idx); - if (file.fp) { - read_tensor(&file, model->tok_embeddings); - read_tensor(&file, model->norm); - read_tensor(&file, model->output); - - for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - read_tensor(&file, layer.attention_norm); - read_tensor(&file, layer.wq); - read_tensor(&file, layer.wk); - read_tensor(&file, layer.wv); - read_tensor(&file, layer.wo); - read_tensor(&file, layer.ffn_norm); - read_tensor(&file, layer.w1); - read_tensor(&file, layer.w2); - read_tensor(&file, layer.w3); + const int score_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_SCORES)); + if (score_idx == 
-1) {
+            throw std::runtime_error("cannot find tokenizer scores in model file\n");
         }
-        read_opt_context(&file, model->ctx, opt);
+        const float * scores = (const float * ) gguf_get_arr_data(vctx, score_idx);
+
+        const int toktype_idx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE));
+        if (toktype_idx == -1) {
+            throw std::runtime_error("cannot find token type list in GGUF file\n");
+        }
+
+        const int * toktypes = (const int * ) gguf_get_arr_data(vctx, toktype_idx);
+
+        std::string tokenizer_name;
+        GGUF_GET_KEY(vctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
+
+        gguf_set_val_str(fctx, kv(LLM_KV_TOKENIZER_MODEL), tokenizer_name.c_str());
+        gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_SCORES), GGUF_TYPE_FLOAT32, scores, n_vocab);
+        gguf_set_arr_data(fctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE), GGUF_TYPE_INT32, toktypes, n_vocab);
+
+        int32_t special_bos_id = 1;
+        int32_t special_eos_id = 2;
+        int32_t special_unk_id = 0;
+        int32_t special_sep_id = -1;
+        int32_t special_pad_id = -1;
+        if (tokenizer_name == "llama") {
+            // default special tokens
+            special_bos_id = 1;
+            special_eos_id = 2;
+            special_unk_id = 0;
+            special_sep_id = -1;
+            special_pad_id = -1;
+        } else if (tokenizer_name == "gpt2") {
+            // read and copy bpe merges
+            const int merges_keyidx = gguf_find_key(vctx, kv(LLM_KV_TOKENIZER_MERGES));
+            if (merges_keyidx == -1) {
+                throw std::runtime_error("cannot find tokenizer merges in model file\n");
+            }
+
+            const int n_merges = gguf_get_arr_n(vctx, merges_keyidx);
+
+            std::vector<const char*> merges;
+            merges.resize(n_merges);
+            for (int i = 0; i < n_merges; i++) {
+                merges[i] = gguf_get_arr_str(vctx, merges_keyidx, i);
+            }
+            gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_MERGES), merges.data(), n_merges);
+
+            // default special tokens
+            special_bos_id = 11;
+            special_eos_id = 11;
+            special_unk_id = -1;
+            special_sep_id = -1;
+            special_pad_id = -1;
+        } else {
+            fprintf(stderr, "%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
+            fprintf(stderr, "%s: using default tokenizer: 'llama'", __func__);
+        }
+
+        std::vector<const char*> tokens;
+        tokens.resize(n_vocab);
+        for (uint32_t i = 0; i < n_vocab; i++) {
+            tokens[i] = gguf_get_arr_str(vctx, token_idx, i);
+        }
+        gguf_set_arr_str(fctx, kv(LLM_KV_TOKENIZER_LIST), tokens.data(), n_vocab);
+
+        GGUF_GET_KEY(vctx, special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
+        GGUF_GET_KEY(vctx, special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));
+        GGUF_GET_KEY(vctx, special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
+        GGUF_GET_KEY(vctx, special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
+        GGUF_GET_KEY(vctx, special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
+
+        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_BOS_ID), special_bos_id);
+        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_EOS_ID), special_eos_id);
+        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_UNK_ID), special_unk_id);
+        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_SEP_ID), special_sep_id);
+        gguf_set_val_u32(fctx, kv(LLM_KV_TOKENIZER_PAD_ID), special_pad_id);
+
+        gguf_free(vctx);
     }
-    return (file.fp != NULL);
+    // add tensors
+    gguf_add_tensor(fctx, model->tok_embeddings);
+    gguf_add_tensor(fctx, model->norm);
+    gguf_add_tensor(fctx, model->output);
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        gguf_add_tensor(fctx, layer.attention_norm);
+
gguf_add_tensor(fctx, layer.wq); + gguf_add_tensor(fctx, layer.wk); + gguf_add_tensor(fctx, layer.wv); + gguf_add_tensor(fctx, layer.wo); + gguf_add_tensor(fctx, layer.ffn_norm); + gguf_add_tensor(fctx, layer.w1); + gguf_add_tensor(fctx, layer.w2); + gguf_add_tensor(fctx, layer.w3); + } } -void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, const char * filename) { - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { - return; +void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) { + struct gguf_context * fctx = gguf_init_empty(); + + save_llama_model_gguf(fctx, fn_vocab_model, model); + + // write file + const bool only_meta = false; + gguf_write_to_file(fctx, filename, only_meta); + gguf_free(fctx); +} + +void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct ggml_opt_context * opt) { + load_llama_model_gguf(fctx, f_ggml_ctx, model); + + uint32_t file_version; + GGUF_GET_KEY(fctx, file_version, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_FILE_VERSION); + GGML_ASSERT(file_version == 0); + + GGUF_GET_KEY(fctx, model->train_its, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_ITERATION_COUNT); + GGUF_GET_KEY(fctx, model->train_samples, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_SAMPLE_COUNT); + GGUF_GET_KEY(fctx, model->train_tokens, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_TOKEN_COUNT); + + load_opt_context_gguf(fctx, f_ggml_ctx, opt); +} + +void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { + save_llama_model_gguf(fctx, fn_vocab_model, model); + + gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 0); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_ITERATION_COUNT, model->train_its); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_SAMPLE_COUNT, model->train_samples); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_TOKEN_COUNT, model->train_tokens); + + save_opt_context_gguf(fctx, opt); +} + +bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct ggml_opt_context * opt) { + struct ggml_context * f_ggml_ctx; + struct gguf_init_params params; + params.no_alloc = false; + params.ctx = &f_ggml_ctx; + struct gguf_context * fctx = gguf_init_from_file(filename, params); + if (fctx == NULL) { + return false; } -#pragma message("TODO: implement file saving using gguf") - (void) vocab; - (void) model; -// // write_magic -// file.write_u32(LLAMA_FILE_MAGIC); // magic -// file.write_u32(LLAMA_FILE_VERSION); // version -// // write_hparams -// file.write_u32(model->hparams.n_vocab); -// file.write_u32(model->hparams.n_embd); -// file.write_u32(model->hparams.n_mult); -// file.write_u32(model->hparams.n_head); -// file.write_u32(model->hparams.n_layer); -// file.write_u32(model->hparams.n_rot); -// file.write_u32(LLAMA_FTYPE_ALL_F32); -// // write_vocab -// uint32_t n_vocab = model->hparams.n_vocab; -// for (uint32_t i = 0; i < n_vocab; i++) { -// const auto & token_data = vocab->id_to_token.at(i); -// file.write_u32((uint32_t) token_data.tok.size()); -// file.write_raw(token_data.tok.data(), token_data.tok.size()); -// file.write_raw(&token_data.score, sizeof(token_data.score)); -// } -// // write tensors -// write_tensor(&file, model->tok_embeddings); -// write_tensor(&file, model->norm); -// write_tensor(&file, model->output); -// for (uint32_t i = 0; i < 
model->hparams.n_layer; ++i) { -// auto & layer = model->layers[i]; -// -// write_tensor(&file, layer.attention_norm); -// write_tensor(&file, layer.wq); -// write_tensor(&file, layer.wk); -// write_tensor(&file, layer.wv); -// write_tensor(&file, layer.wo); -// write_tensor(&file, layer.ffn_norm); -// write_tensor(&file, layer.w1); -// write_tensor(&file, layer.w2); -// write_tensor(&file, layer.w3); -// } + load_checkpoint_gguf(fctx, f_ggml_ctx, model, opt); + + return true; } -float cosine_decay(const int decay_steps, const float alpha, int step) { +void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) { + struct gguf_context * fctx = gguf_init_empty(); + + save_checkpoint_gguf(fctx, fn_vocab_model, model, opt); + + // write file + const bool only_meta = false; + gguf_write_to_file(fctx, filename, only_meta); + gguf_free(fctx); +} + +float cosine_decay(const int decay_steps, const float minimum, int step) { if (step > decay_steps) { step = decay_steps; } const float cosine_decay = 0.50f*(1.0f + cosf(3.14159265359f*step/decay_steps)); - const float decay = (1 - alpha)*cosine_decay + alpha; + const float decay = (1 - minimum)*cosine_decay + minimum; return decay; } -float cosine_decay_restart(int decay_steps, const float alpha, int step, float restart_step_mult) { - while (step > decay_steps) { - step -= decay_steps; - decay_steps = (int) restart_step_mult * decay_steps; +float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) { + if (enable_restart) { + while (step > decay_steps) { + step -= decay_steps; + decay_steps = (int) restart_step_mult * decay_steps; + } } - return cosine_decay(decay_steps, alpha, step); + return cosine_decay(decay_steps, minimum, step); } struct train_params { @@ -2683,39 +1513,51 @@ struct train_params { int n_ctx; int n_embd; - int n_mult; int n_head; int n_layer; - int n_rotmax; + int n_ff; int n_threads; int n_batch; int n_examples; - int n_predict; + + float f_norm_rms_eps; + float rope_freq_base; + float rope_freq_scale; int print_info_interval; - int print_details_interval; bool samples_start_after_nl; bool use_adam; bool use_flash; - bool use_scratch; + bool use_checkpointing; + bool use_alloc; // only adam int warmup; int cos_decay_steps; float cos_decay_restart; - float cos_decay_alpha; + float cos_decay_min; + bool enable_restart; + + int opt_past; + float opt_delta; + int opt_max_no_improvement; int lbfgs_n_iter; int adam_n_iter; float adam_alpha; + float adam_min_alpha; float adam_decay; + int adam_decay_min_ndim; + float adam_beta1; + float adam_beta2; + float adam_gclip; + float adam_eps_f; int mem_model_gb; int mem_compute_gb; int mem_compute0_gb; - int mem_compute1_gb; }; struct train_params get_default_train_params() { @@ -2730,40 +1572,51 @@ struct train_params get_default_train_params() { params.n_ctx = 128; params.n_embd = 256; - params.n_mult = 256; params.n_head = 8; params.n_layer = 16; - params.n_rotmax = 64; + params.n_ff = 768; params.n_threads = 6; params.n_batch = 8; - params.n_examples = 8; - params.n_predict = 1024; + params.n_examples = 1; + + params.f_norm_rms_eps = 1e-5; + params.rope_freq_base = 10000.0f; + params.rope_freq_scale = 1.0f; params.print_info_interval = 1; - params.print_details_interval = 2; params.samples_start_after_nl = false; params.use_adam = true; params.use_flash = true; - params.use_scratch = true; + params.use_checkpointing = true; + params.use_alloc = 
true; + + params.opt_past = 0; + params.opt_delta = 1e-5f; + params.opt_max_no_improvement = 0; // only adam params.warmup = 100; params.cos_decay_steps = 1000; params.cos_decay_restart = 1.1f; - params.cos_decay_alpha = 0.0f; + params.cos_decay_min = 0.1f; + params.enable_restart = false; - params.lbfgs_n_iter = 16; - params.adam_n_iter = 16; - params.adam_alpha = 1e-3f; - params.adam_decay = 1e-3f; + params.lbfgs_n_iter = 256; + params.adam_n_iter = 256; + params.adam_alpha = 1e-3f; + params.adam_min_alpha = 0; + params.adam_decay = 1e-1f; + params.adam_decay_min_ndim = 2; + params.adam_beta1 = 0.9f; + params.adam_beta2 = 0.999f; + params.adam_gclip = 1.0f; + params.adam_eps_f = 0.0f; - params.mem_model_gb = 2; + params.mem_model_gb = 2; params.mem_compute_gb = 24; params.mem_compute0_gb = 8; - params.mem_compute1_gb = 2; - return params; } @@ -2780,35 +1633,47 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for -1)\n"); fprintf(stderr, " -c N, --ctx N Context size used during training (default %d)\n", params->n_ctx); fprintf(stderr, " --embd N Embedding size used for new models (default %d)\n", params->n_embd); - fprintf(stderr, " --mult N Mult size used for new models, influences feedforward size. (default %d)\n", params->n_mult); + fprintf(stderr, " --ff N Feedforward size used for new models. (default %d)\n", params->n_ff); fprintf(stderr, " --head N Number of heads for new models (default %d)\n", params->n_head); fprintf(stderr, " --layer N Number of layers for new models (default %d)\n", params->n_layer); - fprintf(stderr, " --rotmax N Maximal number Rope dimensions for new models (default %d)\n", params->n_rotmax); + fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); + fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); + fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); - fprintf(stderr, " --predict N Number of tokens to generate after training (default %d)\n", params->n_predict); fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); - fprintf(stderr, " --print-details-interval N Print details during training each N examples (default %d)\n", params->print_details_interval); fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? 
"on" : "off"); fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); - fprintf(stderr, " --no-flash Don't use flash attention.\n"); + fprintf(stderr, " --no-flash Don't use flash attention \n"); fprintf(stderr, " --use-flash Use flash attention (default)\n"); - fprintf(stderr, " --no-scratch Don't use scratch buffers\n"); - fprintf(stderr, " --use-scratch Use scratch buffers (default)\n"); - fprintf(stderr, " --warmup N Number of warmup steps (default %d)\n", params->warmup); - fprintf(stderr, " --cos-decay-steps N Number of cosine decay steps (default %d)\n", params->cos_decay_steps); - fprintf(stderr, " --cos-decay-restart N Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); - fprintf(stderr, " --cos-decay-alpha N Cosine decay alpha (default %f)\n", params->cos_decay_alpha); - fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); + fprintf(stderr, " --no-checkpointing Don't use gradient checkpointing\n"); + fprintf(stderr, " --use-checkpointing Use gradient checkpointing (default)\n"); + fprintf(stderr, " --no-alloc Don't use allocator\n"); + fprintf(stderr, " --use-alloc Use allocator (default)\n"); + fprintf(stderr, " --warmup N Only for Adam optimizer. Number of warmup steps (default %d)\n", params->warmup); + fprintf(stderr, " --cos-decay-steps N Only for Adam optimizer. Number of cosine decay steps (default %d)\n", params->cos_decay_steps); + fprintf(stderr, " --cos-decay-restart N Only for Adam optimizer. Increase of cosine decay steps after restart (default %f)\n", params->cos_decay_restart); + fprintf(stderr, " --cos-decay-min N Only for Adam optimizer. Cosine decay minimum (default %f)\n", params->cos_decay_min); + fprintf(stderr, " --enable-restart N Only for Adam optimizer. Enable restarts of cos-decay %s\n", params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --disable-restart N Only for Adam optimizer. Disable restarts of cos-decay %s\n", !params->enable_restart ? "(default)" : ""); + fprintf(stderr, " --opt-past N Number of optimization iterations to track for delta convergence test. Disabled when zero. (default %d)\n", params->opt_past); + fprintf(stderr, " --opt-delta N Maximum delta for delta convergence test. Disabled when <= zero. (default %f)\n", params->opt_delta); + fprintf(stderr, " --opt-max-no-improvement N Maximum number of optimization iterations with no improvement. Disabled when <= zero. (default %d)\n", params->opt_max_no_improvement); + fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f); fprintf(stderr, " --adam-iter N Maximum number of Adam optimization iterations for each batch (default %d)\n", params->adam_n_iter); fprintf(stderr, " --adam-alpha N Adam learning rate alpha (default %f)\n", params->adam_alpha); + fprintf(stderr, " --adam-min-alpha N Adam minimum learning rate alpha - including warmup phase (default %f)\n", params->adam_min_alpha); fprintf(stderr, " --adam-decay N AdamW weight decay. Values greater zero enable AdamW instead of regular Adam. (default %f)\n", params->adam_decay); + fprintf(stderr, " --adam-decay-min-ndim N Minimum number of tensor dimensions to apply AdamW weight decay. Weight decay is not applied to tensors with less n_dims. 
(default %d)\n", params->adam_decay_min_ndim); + fprintf(stderr, " --adam-beta1 N AdamW beta1 in interval [0,1). How much to smooth the first moment of gradients. (default %f)\n", params->adam_beta1); + fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2); + fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip); + fprintf(stderr, " --lbfgs-iter N Maximum number of LBFGS optimization iterations for each batch (default %d)\n", params->lbfgs_n_iter); fprintf(stderr, " --mem-model N Memory to allocate for model and cache in gigabytes. (default %d)\n", params->mem_model_gb); fprintf(stderr, " --mem-compute N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute_gb); - fprintf(stderr, " --mem-compute0 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute0_gb); - fprintf(stderr, " --mem-compute1 N Memory to allocate for compute in gigabytes. (default %d)\n", params->mem_compute1_gb); + fprintf(stderr, " --mem-compute0 N Memory to allocate for automatic memory allocator in gigabytes. (default %d)\n", params->mem_compute0_gb); fprintf(stderr, "\n"); } @@ -2872,12 +1737,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_embd = std::stoi(argv[i]); - } else if (arg == "--mult") { + } else if (arg == "--ff") { if (++i >= argc) { invalid_param = true; break; } - params->n_mult = std::stoi(argv[i]); + params->n_ff = std::stoi(argv[i]); } else if (arg == "--head") { if (++i >= argc) { invalid_param = true; @@ -2890,12 +1755,24 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_layer = std::stoi(argv[i]); - } else if (arg == "--rotmax") { + } else if (arg == "--norm-rms-eps") { if (++i >= argc) { invalid_param = true; break; } - params->n_rotmax = std::stoi(argv[i]); + params->f_norm_rms_eps = std::stof(argv[i]); + } else if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->rope_freq_base = std::stof(argv[i]); + } else if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->rope_freq_scale = std::stof(argv[i]); } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; @@ -2914,24 +1791,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_examples = std::stoi(argv[i]); - } else if (arg == "--predict") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_predict = std::stoi(argv[i]); } else if (arg == "--print-info-interval") { if (++i >= argc) { invalid_param = true; break; } params->print_info_interval = std::stoi(argv[i]); - } else if (arg == "--print-details-interval") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->print_details_interval = std::stoi(argv[i]); } else if (arg == "--samples-after-nl") { params->samples_start_after_nl = true; } else if (arg == "--use-lbfgs") { @@ -2942,10 +1807,14 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { params->use_flash = false; } else if (arg == "--use-flash") { params->use_flash = true; - } else if (arg == "--no-scratch") { - params->use_scratch = false; - } else if (arg == "--use-scratch") { - params->use_scratch = true; + } else if (arg == "--no-checkpointing") { + params->use_checkpointing = 
false; + } else if (arg == "--use-checkpointing") { + params->use_checkpointing = true; + } else if (arg == "--no-alloc") { + params->use_alloc = false; + } else if (arg == "--use-alloc") { + params->use_alloc = true; } else if (arg == "--warmup") { if (++i >= argc) { invalid_param = true; @@ -2964,18 +1833,40 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->cos_decay_restart = std::stof(argv[i]); - } else if (arg == "--cos-decay-alpha") { + } else if (arg == "--cos-decay-min") { if (++i >= argc) { invalid_param = true; break; } - params->cos_decay_alpha = std::stof(argv[i]); - } else if (arg == "--lbfgs-iter") { + params->cos_decay_min = std::stof(argv[i]); + } else if (arg == "--enable-restart") { + params->enable_restart = true; + } else if (arg == "--disable-restart") { + params->enable_restart = false; + } else if (arg == "--opt-past") { if (++i >= argc) { invalid_param = true; break; } - params->lbfgs_n_iter = std::stoi(argv[i]); + params->opt_past = std::stoi(argv[i]); + } else if (arg == "--opt-delta") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->opt_delta = std::stof(argv[i]); + } else if (arg == "--opt-max-no-improvement") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->opt_max_no_improvement = std::stoi(argv[i]); + } else if (arg == "--adam-epsf") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_eps_f = std::stof(argv[i]); } else if (arg == "--adam-iter") { if (++i >= argc) { invalid_param = true; @@ -2988,12 +1879,48 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->adam_alpha = std::stof(argv[i]); + } else if (arg == "--adam-min-alpha") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_min_alpha = std::stof(argv[i]); } else if (arg == "--adam-decay") { if (++i >= argc) { invalid_param = true; break; } params->adam_decay = std::stof(argv[i]); + } else if (arg == "--adam-decay-min-ndim") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_decay_min_ndim = std::stoi(argv[i]); + } else if (arg == "--adam-beta1") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_beta1 = std::stof(argv[i]); + } else if (arg == "--adam-beta2") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_beta2 = std::stof(argv[i]); + } else if (arg == "--adam-gclip") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->adam_gclip = std::stof(argv[i]); + } else if (arg == "--lbfgs-iter") { + if (++i >= argc) { + invalid_param = true; + break; + } + params->lbfgs_n_iter = std::stoi(argv[i]); } else if (arg == "--mem-model") { if (++i >= argc) { invalid_param = true; @@ -3012,12 +1939,6 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->mem_compute0_gb = std::stoi(argv[i]); - } else if (arg == "--mem-compute1") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->mem_compute1_gb = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { train_print_usage(argc, argv, &default_params); exit(0); @@ -3036,6 +1957,63 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { return true; } +struct opt_callback_data { + struct train_params * params; + struct ggml_opt_context * opt; + struct llama_context * lctx; + llama_token * tokens_data; + size_t tokens_size; + int * samples_data; + size_t samples_size; + int shuffle_countdown; + struct 
ggml_tensor * tokens_input; + struct ggml_tensor * target_logits; + struct ggml_tensor * target_probs; +}; + +void opt_callback(void * vdata, float * sched) { + struct opt_callback_data * data = (struct opt_callback_data *) vdata; + struct train_params * params = data->params; + struct ggml_opt_context * opt = data->opt; + int n_batch = params->n_batch; + + *sched = (opt->iter < params->warmup) + ? (float) opt->iter / (float) params->warmup + : cosine_decay_restart( + params->cos_decay_steps, + params->cos_decay_min, + opt->iter - params->warmup, + params->cos_decay_restart, + params->enable_restart); + float min_sched = params->adam_min_alpha / params->adam_alpha; + *sched = min_sched + *sched * (1.0f - min_sched); + + int impr_plot = std::isnan(opt->loss_after) ? 0 : -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f); + printf("%s: iter=%*d, sched=%f loss0=%f loss=%f | improvement: %*d>\n", __func__, 6, opt->iter, *sched, opt->loss_before, opt->loss_after, impr_plot, (int)0); + + if (data->shuffle_countdown < n_batch) { + printf("%s: reshuffle samples\n", __func__); + shuffle_ints(data->samples_data, data->samples_data + data->samples_size); + for (int i = 0; i < (int) data->samples_size; ++i) { + GGML_ASSERT(data->samples_data[i]+params->n_ctx-1 < (int) data->tokens_size); + } + data->shuffle_countdown = data->samples_size; + } + + get_example_targets_batch( + data->lctx, + data->samples_data, + data->samples_size, + data->tokens_data, + data->tokens_size, + opt->iter, + data->tokens_input, + data->target_logits, + data->target_probs); + + data->shuffle_countdown -= n_batch; +} + int main(int argc, char ** argv) { struct train_params params = get_default_train_params(); @@ -3055,18 +2033,6 @@ int main(int argc, char ** argv) { struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); - struct llama_vocab vocab; - { - const int n_vocab = llama_n_vocab(lctx); - vocab.id_to_token.resize(n_vocab); - for (int i=0; i train_tokens; if (tokenize_file(lctx, params.fn_train_data, train_tokens) < 0) { @@ -3078,10 +2044,14 @@ int main(int argc, char ** argv) { model.hparams.n_vocab = llama_n_vocab(lctx); model.hparams.n_ctx = params.n_ctx; model.hparams.n_embd = params.n_embd; - model.hparams.n_mult = params.n_mult; model.hparams.n_head = params.n_head; model.hparams.n_layer = params.n_layer; - model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); + model.hparams.n_ff = params.n_ff; + // llama.cpp requires n_rot to be exactly n_embd / n_head + model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head; + model.hparams.f_norm_rms_eps = params.f_norm_rms_eps; + model.hparams.rope_freq_base = params.rope_freq_base; + model.hparams.rope_freq_scale = params.rope_freq_scale; print_params(&model.hparams); @@ -3103,19 +2073,12 @@ int main(int argc, char ** argv) { } printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); - struct my_llama_kv_cache kv_self; - - struct ggml_init_params lcparams; lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); lcparams.mem_buffer = NULL; lcparams.no_alloc = false; model.ctx = ggml_init(lcparams); - kv_self.ctx = model.ctx; - - my_llama_sampler sampler; - int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; @@ -3126,24 +2089,38 @@ int main(int argc, char ** argv) { struct ggml_opt_params opt_params_adam = 
ggml_opt_default_params(GGML_OPT_ADAM); struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); - opt_params_adam.print_forward_graph = false; + opt_params_adam.print_forward_graph = false; opt_params_adam.print_backward_graph = false; - opt_params_adam.n_threads = params.n_threads; - opt_params_adam.adam.n_iter = params.adam_n_iter; - opt_params_adam.adam.sched = 1.0f; - opt_params_adam.adam.alpha = params.adam_alpha; - opt_params_adam.adam.decay = params.adam_decay; + opt_params_adam.n_threads = params.n_threads; + opt_params_adam.past = params.opt_past; + opt_params_adam.delta = params.opt_delta; + opt_params_adam.max_no_improvement = params.opt_max_no_improvement; + opt_params_adam.adam.n_iter = params.adam_n_iter; + opt_params_adam.adam.sched = 1.0f; + opt_params_adam.adam.alpha = params.adam_alpha; + opt_params_adam.adam.decay = params.adam_decay; + opt_params_adam.adam.decay_min_ndim = params.adam_decay_min_ndim; + opt_params_adam.adam.beta1 = params.adam_beta1; + opt_params_adam.adam.beta2 = params.adam_beta2; + opt_params_adam.adam.gclip = params.adam_gclip; + opt_params_adam.adam.eps_f = params.adam_eps_f; - opt_params_lbfgs.print_forward_graph = false; + opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; - opt_params_lbfgs.n_threads = params.n_threads; - opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; + opt_params_lbfgs.n_threads = params.n_threads; + opt_params_adam.past = params.opt_past; + opt_params_adam.delta = params.opt_delta; + opt_params_adam.max_no_improvement = params.opt_max_no_improvement; + opt_params_lbfgs.lbfgs.n_iter = params.lbfgs_n_iter; opt->ctx = model.ctx; opt->params = params.use_adam ? opt_params_adam : opt_params_lbfgs; printf("%s: init model\n", __func__); - bool existed = load_checkpoint(&model, opt, params.fn_checkpoint_in, true); + bool existed = load_checkpoint_file(params.fn_checkpoint_in, &model, opt); + if (!existed) { + init_model(&model); + } set_param_model(&model); opt->params = params.use_adam ? 
opt_params_adam : opt_params_lbfgs; @@ -3156,11 +2133,7 @@ int main(int argc, char ** argv) { randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); } - init_kv_cache(&kv_self, &model, 1); - // init_kv_cache(&kv_self, &model, n_batch); - init_sampler(&sampler, lctx); - - printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx)); + printf("used_mem model: %zu bytes\n", ggml_used_mem(model.ctx)); // ggml_print_tensor_objects(model.ctx); // TODO: use std::vector intead of "new" @@ -3168,9 +2141,13 @@ int main(int argc, char ** argv) { uint8_t * compute_addr = new uint8_t[compute_size]; size_t size_buf_0 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute0_gb); - size_t size_buf_1 = 1024ll*1024ll*1024ll*((size_t) params.mem_compute1_gb); uint8_t * compute_buf_0 = new uint8_t[size_buf_0]; - uint8_t * compute_buf_1 = new uint8_t[size_buf_1]; + + ggml_allocr * alloc = NULL; + if (params.use_alloc) { + static const size_t tensor_alignment = 32; + alloc = ggml_allocr_new(compute_buf_0, size_buf_0, tensor_alignment); + } GGML_ASSERT(n_tokens < (int) train_tokens.size()); std::vector train_samples; @@ -3185,10 +2162,23 @@ int main(int argc, char ** argv) { GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); } - std::vector work_buffer; - printf("%s: begin training\n", __func__); + struct opt_callback_data opt_cb_data; + opt_cb_data.params = ¶ms; + opt_cb_data.opt = opt; + opt_cb_data.lctx = lctx; + opt_cb_data.tokens_data = train_tokens.data(); + opt_cb_data.tokens_size = train_tokens.size(); + opt_cb_data.samples_data = train_samples.data(); + opt_cb_data.samples_size = train_samples.size(); + opt_cb_data.shuffle_countdown = train_samples.size(); + opt_cb_data.tokens_input = NULL; + opt_cb_data.target_logits = NULL; + opt_cb_data.target_probs = NULL; + + int64_t t0 = ggml_time_ms(); + for (int ex = 0; ex < params.n_examples; ++ex) { if (ex*n_batch >= (int) train_samples.size()) { shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size()); @@ -3198,198 +2188,110 @@ int main(int argc, char ** argv) { } struct ggml_init_params cparams = { - /*.mem_size =*/ compute_size, - /*.mem_buffer =*/ compute_addr, - /*.no_alloc =*/ false, + compute_size, // mem_size + compute_addr, // mem_buffer + false, // no_alloc }; struct ggml_context * ctx0 = ggml_init(cparams); - struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); + ggml_set_no_alloc(ctx0, false); + + // don't use alloc for input tensors, so we can safely fill them with data + //struct ggml_tensor * after_opt_best_samples = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); //struct ggml_tensor * after_opt_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch); struct ggml_tensor * target_logits = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); + ggml_set_no_alloc(ctx0, (alloc != NULL)); + + if (alloc) { + ggml_allocr_reset(alloc); + } + + opt_cb_data.tokens_input = tokens_input; + opt_cb_data.target_logits = target_logits; + opt_cb_data.target_probs = target_probs; + int n_past = 0; - struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 
1 : 0)); - struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32) + (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); - - memset(gfbuf->data, 0, ggml_nbytes(gfbuf)); - memset(gbbuf->data, 0, ggml_nbytes(gbbuf)); - - struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; - struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; - - - get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + struct ggml_cgraph * gb = ggml_new_graph(ctx0); + struct ggml_cgraph * gb_tmp = params.use_checkpointing + ? ggml_new_graph(ctx0) + : NULL; GGML_ASSERT(n_past == 0); struct ggml_tensor * loss = NULL; struct ggml_tensor * logits = NULL; - if (params.use_scratch) { - loss = forward_batch_wo_cache_flash_attn_train( - &model, ctx0, - gf, gb, - &logits, tokens_input, target_probs, - compute_buf_0, compute_buf_1, - size_buf_0, size_buf_1, - n_tokens, n_batch); - } else if (params.use_flash) { - logits = forward_batch_wo_cache_flash_attn(&model, ctx0, gf, tokens_input, n_tokens, n_batch); - loss = cross_entropy_loss(ctx0, logits, target_probs); - ggml_build_forward_expand(gf, loss); - *gb = ggml_build_backward(ctx0, gf, true); - } else { - logits = forward_batch_wo_cache(&model, ctx0, gf, tokens_input, n_tokens, n_batch); - loss = cross_entropy_loss(ctx0, logits, target_probs); - ggml_build_forward_expand(gf, loss); - *gb = ggml_build_backward(ctx0, gf, true); - } - - ggml_graph_compute_helper(work_buffer, gf, params.n_threads); + loss = llama_build_train_graphs( + &model, alloc, ctx0, + gf, gb, gb_tmp, + &logits, tokens_input, target_probs, + n_tokens, n_batch, + params.use_flash, + params.use_checkpointing + ); size_t used_mem_before_opt = ggml_used_mem(ctx0); - float error_before_opt = ggml_get_f32_1d(loss, 0); - opt->params.adam.sched = (opt->iter < params.warmup) ? (float) opt->iter / (float) params.warmup : cosine_decay_restart( params.cos_decay_steps, - params.cos_decay_alpha, + params.cos_decay_min, opt->iter - params.warmup, - params.cos_decay_restart); + params.cos_decay_restart, + params.enable_restart); + + float min_sched = params.adam_min_alpha / params.adam_alpha; + opt->params.adam.sched = min_sched + opt->params.adam.sched * (1.0f - min_sched); printf("%s: opt->params.adam.sched %.5f\n", __func__, opt->params.adam.sched); - ggml_opt_resume_g(ctx0, opt, loss, gf, gb); + ggml_opt_resume_g(ctx0, opt, loss, gf, gb, &opt_callback, (void *) &opt_cb_data); size_t used_mem_after_opt = ggml_used_mem(ctx0); + int n_iter = params.use_adam ? 
params.adam_n_iter : params.lbfgs_n_iter; model.train_its = opt->iter; - model.train_samples += n_batch; - model.train_tokens += n_batch * n_tokens; - - ggml_graph_compute_helper(work_buffer, gf, params.n_threads); - - float error_after_opt = ggml_get_f32_1d(loss, 0); + model.train_samples += n_batch * n_iter; + model.train_tokens += n_batch * n_tokens * n_iter; if (params.print_info_interval > 0 && ex % params.print_info_interval == 0) { printf("Example %d, opt iter %d\n", ex, opt->iter); - printf("error_before_opt: %.6f\n", error_before_opt); - printf("error_after_opt: %.6f\n", error_after_opt); + printf("error_before_opt: %.6f\n", opt->loss_before); + printf("error_after_opt: %.6f\n", opt->loss_after); printf("used_mem_before_opt: %zu bytes\n", used_mem_before_opt); printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); } - if (params.print_details_interval > 0 && ex % params.print_details_interval == 0) { - // set_logits_masked(logits, token_notavail, -1e9); - for (int i=0; idata + i*logits->nb[2] + k*logits->nb[1]), - (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]), - k); - * ((int32_t *) ((char *) after_opt_best_samples->data + i*after_opt_best_samples->nb[1] + k*after_opt_best_samples->nb[0])) = token; - } - } - - // printf("probabilities after optimization:\n"); - // print_matrix(after_opt_probs); - printf("Example:\n---\n"); - print_tokens_batch(lctx, tokens_input); - printf("\n---\n"); - - // printf("best samples after optimization:\n---\n"); - printf("samples after optimization:\n---\n"); - print_tokens_batch(lctx, after_opt_best_samples); - printf("\n---\n"); - } - ggml_free(ctx0); } + int64_t t1 = ggml_time_ms(); + int64_t d = t1-t0; + double dd = (double) d * 1e-3; + printf("%s: total training time=%f seconds\n", __func__, dd); + if (params.n_examples > 0) { - save_checkpoint(&model, opt, params.fn_checkpoint_out); + save_checkpoint_file(params.fn_checkpoint_out, params.fn_vocab_model, &model, opt); } if (strlen(params.fn_model_out) > 0) { - save_as_llama_model(&vocab, &model, params.fn_model_out); + save_llama_model_file(params.fn_model_out, params.fn_vocab_model, &model); } - { - int n_gen = params.n_predict; - int sample_ctx = n_tokens - n_tokens/8; - - sampler.params.temp = 0.2f; - sampler.params.repeat_penalty = 1.1f; - sampler.params.mirostat = 2; - init_sampler(&sampler, lctx); - - printf("Generating %d tokens.\n", n_gen); - - struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens); - struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); - struct ggml_tensor * target_probs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); - - get_example_targets(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs); - for (int i=sample_ctx; idata + (sample_ctx-1)*logits->nb[1]), - (llama_token *) tokens_input->data, - sample_ctx-1); - //int token = ggml_get_i32_1d(best_samples, sample_ctx-1); - - // print_row(probs, sample_at); - print_token(lctx, token); - - lshift_examples(tokens_input, target_logits, target_probs, 1); - ggml_set_i32_1d(tokens_input, 0, 0); - ggml_set_i32_1d(tokens_input, sample_ctx-1, token); - - ggml_free(ctx0); - } + if (alloc) { + ggml_allocr_free(alloc); } delete[] compute_addr; delete[] compute_buf_0; - delete[] compute_buf_1; - + ggml_free(model.ctx); llama_free(lctx); llama_free_model(lmodel); - ggml_free(model.ctx); - 
return 0; } diff --git a/flake.lock b/flake.lock index 33164e096..a7777d05d 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "systems": "systems" }, "locked": { - "lastModified": 1685518550, - "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=", + "lastModified": 1692799911, + "narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=", "owner": "numtide", "repo": "flake-utils", - "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef", + "rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1685931219, - "narHash": "sha256-8EWeOZ6LKQfgAjB/USffUSELPRjw88A+xTcXnOUvO5M=", + "lastModified": 1692913444, + "narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "7409480d5c8584a1a83c422530419efe4afb0d19", + "rev": "18324978d632ffc55ef1d928e81630c620f4f447", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 616b90252..02095411e 100644 --- a/flake.nix +++ b/flake.nix @@ -6,6 +6,9 @@ outputs = { self, nixpkgs, flake-utils }: flake-utils.lib.eachDefaultSystem (system: let + name = "llama.cpp"; + src = ./.; + meta.mainProgram = "llama"; inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin; buildInputs = with pkgs; [ openmpi ]; osSpecific = with pkgs; buildInputs ++ @@ -21,11 +24,17 @@ CoreGraphics CoreVideo ] + else if isDarwin then + with pkgs.darwin.apple_sdk.frameworks; [ + Accelerate + CoreGraphics + CoreVideo + ] else with pkgs; [ openblas ] ); pkgs = import nixpkgs { inherit system; }; - nativeBuildInputs = with pkgs; [ cmake pkgconfig ]; + nativeBuildInputs = with pkgs; [ cmake ninja pkgconfig ]; llama-python = pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]); postPatch = '' @@ -38,35 +47,35 @@ mv $out/bin/server $out/bin/llama-server ''; cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]; - in { + in + { packages.default = pkgs.stdenv.mkDerivation { - name = "llama.cpp"; - src = ./.; - postPatch = postPatch; - nativeBuildInputs = nativeBuildInputs; - buildInputs = osSpecific; + inherit name src meta postPatch nativeBuildInputs buildInputs postInstall; cmakeFlags = cmakeFlags ++ (if isAarch64 && isDarwin then [ - "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" - "-DLLAMA_METAL=ON" - ] else [ - "-DLLAMA_BLAS=ON" - "-DLLAMA_BLAS_VENDOR=OpenBLAS" + "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" + "-DLLAMA_METAL=ON" + ] else [ + "-DLLAMA_BLAS=ON" + "-DLLAMA_BLAS_VENDOR=OpenBLAS" ]); - postInstall = postInstall; - meta.mainProgram = "llama"; }; packages.opencl = pkgs.stdenv.mkDerivation { - name = "llama.cpp"; - src = ./.; - postPatch = postPatch; - nativeBuildInputs = nativeBuildInputs; + inherit name src meta postPatch nativeBuildInputs postInstall; buildInputs = with pkgs; buildInputs ++ [ clblast ]; cmakeFlags = cmakeFlags ++ [ "-DLLAMA_CLBLAST=ON" ]; - postInstall = postInstall; - meta.mainProgram = "llama"; + }; + packages.rocm = pkgs.stdenv.mkDerivation { + inherit name src meta postPatch nativeBuildInputs postInstall; + buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ]; + cmakeFlags = cmakeFlags ++ [ + "-DLLAMA_HIPBLAS=1" + "-DCMAKE_C_COMPILER=hipcc" + "-DCMAKE_CXX_COMPILER=hipcc" + "-DCMAKE_POSITION_INDEPENDENT_CODE=ON" + ]; }; apps.llama-server = { type = "app"; @@ -80,8 +89,13 @@ type = "app"; program = "${self.packages.${system}.default}/bin/llama"; }; + apps.quantize = { + type = "app"; + program = 
"${self.packages.${system}.default}/bin/quantize"; + }; apps.default = self.apps.${system}.llama; devShells.default = pkgs.mkShell { + buildInputs = [ llama-python ]; packages = nativeBuildInputs ++ osSpecific; }; }); diff --git a/ggml-alloc.c b/ggml-alloc.c index 1ef011654..f07a4a217 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -107,6 +107,10 @@ static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct g } void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { +#ifdef GGML_ALLOCATOR_DEBUG + GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources + GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated +#endif size_t size = ggml_allocator_get_alloc_size(alloc, tensor); size = aligned_offset(NULL, size, alloc->alignment); @@ -268,7 +272,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) /*.parse_seq = */ {0}, /*.parse_seq_len = */ 0, #ifdef GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ = {0}, + /*.allocated_tensors = */ {0}, #endif }; @@ -297,7 +301,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { /*.parse_seq = */ {0}, /*.parse_seq_len = */ 0, #ifdef GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ = {0}, + /*.allocated_tensors = */ {0}, #endif }; @@ -317,8 +321,7 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) { //////////// compute graph allocator static bool ggml_is_view(struct ggml_tensor * t) { - return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE || - t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY; + return t->view_src != NULL; } static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { @@ -336,28 +339,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml return true; } -static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) { - switch (t->op) { - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_VIEW: - return t->src[0]; - case GGML_OP_CPY: - return t->src[1]; - default: - return NULL; - } -} - -static struct ggml_tensor * get_view_source(struct ggml_tensor * t) { - struct ggml_tensor * parent = t; - do { - parent = get_view_parent(parent); - } while (ggml_is_view(parent)); - return parent; -} - static bool ggml_op_can_inplace(enum ggml_op op) { switch (op) { case GGML_OP_SCALE: @@ -365,7 +346,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) { case GGML_OP_DIAG_MASK_INF: case GGML_OP_ADD: case GGML_OP_ADD1: - case GGML_OP_ACC: case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: @@ -375,7 +355,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) { case GGML_OP_UNARY: case GGML_OP_ROPE: case GGML_OP_RMS_NORM: - case GGML_OP_SET: case GGML_OP_SOFT_MAX: case GGML_OP_CONT: return true; @@ -389,24 +368,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) struct hash_node * ht = alloc->hash_table; if (node->data == NULL) { if (ggml_is_view(node)) { - size_t offset; - switch(node->op) { - case GGML_OP_VIEW: - memcpy(&offset, node->op_params, sizeof(size_t)); - node->data = (char *) node->src[0]->data + offset; - break; - case GGML_OP_PERMUTE: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - node->data = node->src[0]->data; - break; - case GGML_OP_CPY: - node->data = node->src[1]->data; - break; - default: - GGML_ASSERT(!"unknown view op"); - break; - } + assert(node->view_src->data != 
NULL); + node->data = (char *)node->view_src->data + node->view_offs; } else { // see if we can reuse a parent's buffer (inplace) if (ggml_op_can_inplace(node->op)) { @@ -426,7 +389,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) struct hash_node * p_hn = hash_get(ht, parent); if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) { if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = get_view_source(parent); + struct ggml_tensor * view_src = parent->view_src; struct hash_node * view_src_hn = hash_get(ht, view_src); if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite @@ -468,7 +431,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n( struct ggml_tensor * node = gf->nodes[i]; if (ggml_is_view(node)) { - struct ggml_tensor * view_src = get_view_source(node); + struct ggml_tensor * view_src = node->view_src; hash_get(ht, view_src)->n_views += 1; } @@ -553,10 +516,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n( if (p_hn->n_children == 0 && p_hn->n_views == 0) { if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = get_view_source(parent); + struct ggml_tensor * view_src = parent->view_src; struct hash_node * view_src_hn = hash_get(ht, view_src); view_src_hn->n_views -= 1; - AT_PRINTF("view_src %s\n", view_src->name); + AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { ggml_allocator_free_tensor(alloc, view_src); } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 83d53c13c..5fd625630 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -306,11 +306,11 @@ typedef struct { #define QI4_K (QK_K / (4*QR4_K)) #ifdef GGML_QKK_64 typedef struct { - half d[2]; // super-block scales/mins + half dm[2]; // super-block scales/mins uint8_t scales[2]; // 4-bit block scales/mins uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; -static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding"); +static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding"); #else typedef struct { half2 dm; // super-block scale for quantized scales/mins @@ -737,8 +737,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float const int tid = threadIdx.x; const uint8_t * q = x[i].qs; float * y = yy + i*QK_K; - const float d = (float)x[i].d[0]; - const float m = (float)x[i].d[1]; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); #endif @@ -1155,8 +1155,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const uint16_t * a = (const uint16_t *)x[i].scales; aux16[0] = a[0] & 0x0f0f; aux16[1] = (a[0] >> 4) & 0x0f0f; - const float d = (float)x[i].d[0]; - const float m = (float)x[i].d[1]; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; float sum = 0.f; for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) @@ -2845,8 +2845,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( aux16[0] = a[0] & 0x0f0f; aux16[1] = 
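In ggml-alloc.c the old get_view_parent/get_view_source walk over the op graph is gone: a tensor is a view exactly when its new view_src field is set, and its data pointer is the base tensor's data plus view_offs, as the allocate_node hunk above shows. A minimal sketch of that resolution, assuming the view_src/view_offs tensor fields introduced later in this patch:

#include "ggml.h"

// resolve a view's data pointer from its recorded base tensor and byte offset
static void resolve_view_data(struct ggml_tensor * t) {
    if (t->view_src != NULL && t->view_src->data != NULL) {
        t->data = (char *) t->view_src->data + t->view_offs;
    }
}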
(a[0] >> 4) & 0x0f0f; - const float dall = bq4_K->d[0]; - const float dmin = bq4_K->d[1]; + const float dall = bq4_K->dm[0]; + const float dmin = bq4_K->dm[1]; const float d8_1 = __low2float(bq8_1[0].ds); const float d8_2 = __low2float(bq8_1[1].ds); @@ -2929,7 +2929,11 @@ template static __device__ __forceinlin const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; +#if QK_K == 256 x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; +#else + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; +#endif } #pragma unroll @@ -3119,7 +3123,9 @@ template static __device__ __forceinlin const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; +#if QK_K == 256 x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; +#endif } #pragma unroll @@ -4709,6 +4715,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { +#if QK_K == 256 + int id; CUDA_CHECK(cudaGetDevice(&id)); const int compute_capability = g_compute_capabilities[id]; @@ -4740,6 +4748,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( mul_mat_q3_K<<>> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } +#endif } static void ggml_mul_mat_q4_K_q8_1_cuda( @@ -4899,8 +4908,8 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { - GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why - const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1); + GGML_ASSERT(ncols % 2 == 0); + const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); rope_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); @@ -4908,7 +4917,8 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { - const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1); + GGML_ASSERT(ncols % 2 == 0); + const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const dim3 block_nums(nrows, num_blocks_x, 1); rope_neox_f32<<>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); @@ -6328,9 +6338,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented const int mode = ((int32_t *) dst->op_params)[2]; const bool is_glm = mode & 4; + ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm } diff --git a/ggml-metal.h b/ggml-metal.h index 00202b787..fca28d37e 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -24,6 +24,7 @@ // max memory buffers that can be mapped to the device #define GGML_METAL_MAX_BUFFERS 16 +#define GGML_METAL_MAX_COMMAND_BUFFERS 32 struct 
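The rope launch changes above make each thread cover two columns: the block is (1, CUDA_ROPE_BLOCK_SIZE, 1) threads and the grid in x is rounded up over 2*CUDA_ROPE_BLOCK_SIZE columns, which is why the assert now requires an even ncols rather than an even nrows. A plain C restatement of that grid computation, as a sketch only:

// number of blocks along x when every thread processes two columns
static int rope_num_blocks_x(int ncols, int block_size /* CUDA_ROPE_BLOCK_SIZE */) {
    return (ncols + 2*block_size - 1) / (2*block_size);
}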
ggml_tensor; struct ggml_cgraph; diff --git a/ggml-metal.m b/ggml-metal.m index 06eb3872e..e929c4b07 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -11,6 +11,7 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +// TODO: temporary - reuse llama.cpp logging #ifdef GGML_METAL_NDEBUG #define metal_printf(...) #else @@ -33,12 +34,15 @@ struct ggml_metal_buffer { struct ggml_metal_context { int n_cb; - float * logits; - id device; id queue; id library; + id command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS]; + id command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS]; + + dispatch_queue_t d_queue; + int n_buffers; struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; @@ -110,16 +114,17 @@ static NSString * const msl_library_source = @"see metal.metal"; @end struct ggml_metal_context * ggml_metal_init(int n_cb) { - fprintf(stderr, "%s: allocating\n", __func__); + metal_printf("%s: allocating\n", __func__); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); - ctx->n_cb = n_cb; + ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; ctx->concur_list_len = 0; + ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT); #if 0 // compile from source string and show compile log @@ -128,7 +133,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error]; if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -142,11 +147,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"]; NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"]; - fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]); + metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]); NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } @@ -158,7 +163,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error]; #endif if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } } @@ -170,11 +175,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { #define GGML_METAL_ADD_KERNEL(name) \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ - fprintf(stderr, "%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ + metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ if (error) { \ - 
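From here on, all stderr logging in ggml-metal.m is funneled through the existing metal_printf macro so it can be compiled out; the added TODO notes this is temporary until llama.cpp's own logging is reused. The macro being relied on is the one already at the top of the file, roughly (the #else branch is cut off in the hunk above, so this is the expected shape rather than a verbatim quote):

#include <stdio.h>

#ifdef GGML_METAL_NDEBUG
#define metal_printf(...)                              // release builds: logging compiled out
#else
#define metal_printf(...) fprintf(stderr, __VA_ARGS__) // debug builds: forward to stderr
#endif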
fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } @@ -226,22 +231,80 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { #undef GGML_METAL_ADD_KERNEL } - fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); - fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); + metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); if (ctx->device.maxTransferRate != 0) { - fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); } else { - fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__); + metal_printf("%s: maxTransferRate = built-in GPU\n", __func__); } return ctx; } void ggml_metal_free(struct ggml_metal_context * ctx) { - fprintf(stderr, "%s: deallocating\n", __func__); + metal_printf("%s: deallocating\n", __func__); +#define GGML_METAL_DEL_KERNEL(name) \ + [ctx->function_##name release]; \ + [ctx->pipeline_##name release]; + + GGML_METAL_DEL_KERNEL(add); + GGML_METAL_DEL_KERNEL(add_row); + GGML_METAL_DEL_KERNEL(mul); + GGML_METAL_DEL_KERNEL(mul_row); + GGML_METAL_DEL_KERNEL(scale); + GGML_METAL_DEL_KERNEL(silu); + GGML_METAL_DEL_KERNEL(relu); + GGML_METAL_DEL_KERNEL(gelu); + GGML_METAL_DEL_KERNEL(soft_max); + GGML_METAL_DEL_KERNEL(diag_mask_inf); + GGML_METAL_DEL_KERNEL(get_rows_f16); + GGML_METAL_DEL_KERNEL(get_rows_q4_0); + GGML_METAL_DEL_KERNEL(get_rows_q4_1); + GGML_METAL_DEL_KERNEL(get_rows_q8_0); + GGML_METAL_DEL_KERNEL(get_rows_q2_K); + GGML_METAL_DEL_KERNEL(get_rows_q3_K); + GGML_METAL_DEL_KERNEL(get_rows_q4_K); + GGML_METAL_DEL_KERNEL(get_rows_q5_K); + GGML_METAL_DEL_KERNEL(get_rows_q6_K); + GGML_METAL_DEL_KERNEL(rms_norm); + GGML_METAL_DEL_KERNEL(norm); + GGML_METAL_DEL_KERNEL(mul_mat_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); + GGML_METAL_DEL_KERNEL(rope); + GGML_METAL_DEL_KERNEL(alibi_f32); + GGML_METAL_DEL_KERNEL(cpy_f32_f16); + GGML_METAL_DEL_KERNEL(cpy_f32_f32); + GGML_METAL_DEL_KERNEL(cpy_f16_f16); + +#undef GGML_METAL_DEL_KERNEL + for (int i = 0; i < ctx->n_buffers; ++i) { [ctx->buffers[i].metal release]; } + + [ctx->library release]; + [ctx->queue release]; + [ctx->device release]; + + dispatch_release(ctx->d_queue); + free(ctx); } @@ -249,7 +312,7 @@ void * ggml_metal_host_malloc(size_t n) { void * data = NULL; const int result = posix_memalign((void 
**) &data, getpagesize(), n); if (result != 0) { - fprintf(stderr, "%s: error: posix_memalign failed\n", __func__); + metal_printf("%s: error: posix_memalign failed\n", __func__); return NULL; } @@ -261,7 +324,7 @@ void ggml_metal_host_free(void * data) { } void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { - ctx->n_cb = n_cb; + ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS); } int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { @@ -277,7 +340,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) { // Metal buffer based on the host memory pointer // static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { - //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + //metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); const int64_t tsize = ggml_nbytes(t); @@ -288,13 +351,13 @@ static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { *offs = (size_t) ioffs; - //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); + //metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); return ctx->buffers[i].metal; } } - fprintf(stderr, "%s: error: buffer is nil\n", __func__); + metal_printf("%s: error: buffer is nil\n", __func__); return nil; } @@ -306,7 +369,7 @@ bool ggml_metal_add_buffer( size_t size, size_t max_size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { - fprintf(stderr, "%s: too many buffers\n", __func__); + metal_printf("%s: too many buffers\n", __func__); return false; } @@ -316,7 +379,7 @@ bool ggml_metal_add_buffer( const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data; if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { - fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); + metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name); return false; } } @@ -337,11 +400,11 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); return false; } - fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); + metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); ++ctx->n_buffers; } else { @@ -361,27 +424,27 @@ bool ggml_metal_add_buffer( ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; if (ctx->buffers[ctx->n_buffers].metal == nil) { - fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, 
name, size_step_aligned / 1024.0 / 1024.0); return false; } - fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); if (i + size_step < size) { - fprintf(stderr, "\n"); + metal_printf("\n"); } ++ctx->n_buffers; } } - fprintf(stderr, ", (%8.2f / %8.2f)", + metal_printf(", (%8.2f / %8.2f)", ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { - fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n"); + metal_printf(", warning: current allocated size is greater than the recommended max working set size\n"); } else { - fprintf(stderr, "\n"); + metal_printf("\n"); } } @@ -391,8 +454,6 @@ bool ggml_metal_add_buffer( void ggml_metal_set_tensor( struct ggml_metal_context * ctx, struct ggml_tensor * t) { - metal_printf("%s: set input for tensor '%s'\n", __func__, t->name); - size_t offs; id id_dst = ggml_metal_get_buffer(ctx, t, &offs); @@ -402,8 +463,6 @@ void ggml_metal_set_tensor( void ggml_metal_get_tensor( struct ggml_metal_context * ctx, struct ggml_tensor * t) { - metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name); - size_t offs; id id_src = ggml_metal_get_buffer(ctx, t, &offs); @@ -498,14 +557,14 @@ void ggml_metal_graph_find_concurrency( } if (ctx->concur_list_len > GGML_MAX_CONCUR) { - fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__); + metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__); } } void ggml_metal_graph_compute( struct ggml_metal_context * ctx, struct ggml_cgraph * gf) { - metal_printf("%s: evaluating graph\n", __func__); + @autoreleasepool { // if there is ctx->concur_list, dispatch concurrently // else fallback to serial dispatch @@ -521,29 +580,25 @@ void ggml_metal_graph_compute( const int n_cb = ctx->n_cb; - NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; - for (int i = 0; i < n_cb; ++i) { - command_buffers[i] = [ctx->queue commandBuffer]; + ctx->command_buffers[i] = [ctx->queue commandBuffer]; // enqueue the command buffers in order to specify their execution order - [command_buffers[i] enqueue]; - } + [ctx->command_buffers[i] enqueue]; - // TODO: is this the best way to start threads? - dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT); + ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc]; + } for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; - dispatch_async(queue, ^{ + dispatch_async(ctx->d_queue, ^{ size_t offs_src0 = 0; size_t offs_src1 = 0; size_t offs_dst = 0; - id command_buffer = command_buffers[cb_idx]; - - id encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; + id command_buffer = ctx->command_buffers[cb_idx]; + id encoder = ctx->command_encoders[cb_idx]; const int node_start = (cb_idx + 0) * n_nodes_per_cb; const int node_end = MIN((cb_idx == n_cb - 1) ? 
n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes); @@ -556,7 +611,7 @@ void ggml_metal_graph_compute( continue; } - metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + //metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src1 = gf->nodes[i]->src[1]; @@ -704,7 +759,7 @@ void ggml_metal_graph_compute( } break; default: { - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } break; @@ -863,7 +918,7 @@ void ggml_metal_graph_compute( } break; default: { - fprintf(stderr, "Asserting on type %d\n",(int)src0t); + metal_printf("Asserting on type %d\n",(int)src0t); GGML_ASSERT(false && "not implemented"); } }; @@ -1101,7 +1156,7 @@ void ggml_metal_graph_compute( } break; default: { - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_ASSERT(false); } } @@ -1117,17 +1172,19 @@ void ggml_metal_graph_compute( } // wait for all threads to finish - dispatch_barrier_sync(queue, ^{}); - - [command_buffers[n_cb - 1] waitUntilCompleted]; + dispatch_barrier_sync(ctx->d_queue, ^{}); // check status of command buffers // needed to detect if the device ran out-of-memory for example (#1881) for (int i = 0; i < n_cb; i++) { - MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status]; + [ctx->command_buffers[i] waitUntilCompleted]; + + MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status]; if (status != MTLCommandBufferStatusCompleted) { - fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status); + metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status); GGML_ASSERT(false); } } + + } } diff --git a/ggml.c b/ggml.c index 8cb5c404f..46ce4a581 100644 --- a/ggml.c +++ b/ggml.c @@ -123,6 +123,8 @@ typedef void * thread_ret_t; #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 #define GGML_SILU_FP16 +// #define GGML_CROSS_ENTROPY_EXP_FP16 +// #define GGML_FLASH_ATTN_EXP_FP16 #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 @@ -157,12 +159,6 @@ typedef void * thread_ret_t; //#define GGML_SOFT_MAX_ACCELERATE #endif -#if UINTPTR_MAX == 0xFFFFFFFF - #define GGML_MEM_ALIGN 4 -#else - #define GGML_MEM_ALIGN 16 -#endif - // // logging // @@ -192,8 +188,8 @@ typedef void * thread_ret_t; // #if defined(_MSC_VER) || defined(__MINGW32__) -#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) -#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) +#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) +#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) #else inline static void * ggml_aligned_malloc(size_t size) { void * aligned_memory = NULL; @@ -218,8 +214,8 @@ inline static void * ggml_aligned_malloc(size_t size) { } return aligned_memory; } -#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) -#define GGML_ALIGNED_FREE(ptr) free(ptr) +#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) +#define GGML_ALIGNED_FREE(ptr) free(ptr) #endif #define UNUSED GGML_UNUSED @@ -2436,7 +2432,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * const int nb = n / qk; 
assert(n % qk == 0); - assert(nb % 2 == 0); const block_q4_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -2445,6 +2440,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q4_0 * restrict x0 = &x[i + 0]; const block_q4_0 * restrict x1 = &x[i + 1]; @@ -2623,6 +2619,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * } // Main loop + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 2; i < nb; i+=2) { _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); @@ -2706,7 +2703,6 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); const block_q4_1 * restrict x = vx; const block_q8_1 * restrict y = vy; @@ -2718,6 +2714,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * float summs = 0; + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q4_1 * restrict x0 = &x[i + 0]; const block_q4_1 * restrict x1 = &x[i + 1]; @@ -2832,7 +2829,6 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); assert(qk == QK5_0); const block_q5_0 * restrict x = vx; @@ -2848,6 +2844,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * uint64_t tmp0[4]; uint64_t tmp1[4]; + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q5_0 * restrict x0 = &x[i]; const block_q5_0 * restrict x1 = &x[i + 1]; @@ -3072,7 +3069,6 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); assert(qk == QK5_1); const block_q5_1 * restrict x = vx; @@ -3091,6 +3087,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * uint64_t tmp0[4]; uint64_t tmp1[4]; + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q5_1 * restrict x0 = &x[i]; const block_q5_1 * restrict x1 = &x[i + 1]; @@ -3328,7 +3325,6 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * const int nb = n / qk; assert(n % qk == 0); - assert(nb % 2 == 0); const block_q8_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -3337,6 +3333,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); + GGML_ASSERT(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { const block_q8_0 * restrict x0 = &x[i + 0]; const block_q8_0 * restrict x1 = &x[i + 1]; @@ -4107,16 +4104,11 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) { } size_t ggml_nbytes(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - // this should handle cases where the tensor is not contiguous in memory - // probaby just: - // - // return tensor->ne[3]*tensor->nb[3] - // - // is enough, but just in case, adding the second part - - return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type)); + size_t nbytes 
= tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type); + for (int i = 1; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + return nbytes; } size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { @@ -4570,36 +4562,51 @@ static struct ggml_tensor * ggml_new_tensor_impl( enum ggml_type type, int n_dims, const int64_t * ne, - void * data) { + struct ggml_tensor * view_src, + size_t view_offs) { assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); - size_t data_size = 0; + // find the base tensor and absolute offset + if (view_src != NULL && view_src->view_src != NULL) { + view_offs += view_src->view_offs; + view_src = view_src->view_src; + } - if (data == NULL && !ctx->no_alloc) { - data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type)); - for (int i = 1; i < n_dims; i++) { - data_size *= ne[i]; + size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type)); + for (int i = 1; i < n_dims; i++) { + data_size *= ne[i]; + } + + GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src)); + + void * data = view_src != NULL ? view_src->data : NULL; + if (data != NULL) { + data = (char *) data + view_offs; + } + + size_t obj_alloc_size = 0; + + if (view_src == NULL && ctx->no_alloc == false) { + if (ctx->scratch.data != NULL) { + // allocate tensor data in the scratch buffer + if (ctx->scratch.offs + data_size > ctx->scratch.size) { + GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", + __func__, ctx->scratch.offs + data_size, ctx->scratch.size); + assert(false); + return NULL; + } + + data = (char * const) ctx->scratch.data + ctx->scratch.offs; + + ctx->scratch.offs += data_size; + } else { + // allocate tensor data in the context's memory pool + obj_alloc_size = data_size; } } - if (ctx->scratch.data != NULL && data == NULL) { - // allocate tensor data in the scratch buffer - if (ctx->scratch.offs + data_size > ctx->scratch.size) { - GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n", - __func__, ctx->scratch.offs + data_size, ctx->scratch.size); - assert(false); - return NULL; - } - - data = (char * const) ctx->scratch.data + ctx->scratch.offs; - - ctx->scratch.offs += data_size; - - data_size = 0; - } - - struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size); + struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size); // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here @@ -4619,7 +4626,9 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, - /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data, + /*.view_src =*/ view_src, + /*.view_offs =*/ view_offs, + /*.data =*/ obj_alloc_size > 0 ? 
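ggml_nbytes no longer special-cases contiguity with a MAX over two guesses; it walks the strides, adding (ne[i]-1)*nb[i] for each higher dimension, which is also correct for padded and non-contiguous tensors. A small self-contained check of the contiguous case, assuming a throwaway ggml context:

#include "ggml.h"
#include <assert.h>

static void nbytes_example(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // contiguous 4x3 f32 tensor: nb[0] = 4 bytes, nb[1] = 16 bytes,
    // so nbytes = 4*4 + (3-1)*16 = 48 bytes, i.e. ne[0]*ne[1]*sizeof(float)
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    assert(ggml_nbytes(t) == 4*3*sizeof(float));

    ggml_free(ctx);
}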
(void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, /*.padding =*/ { 0 }, @@ -4643,28 +4652,12 @@ static struct ggml_tensor * ggml_new_tensor_impl( return result; } -static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { - GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings - assert(params_size <= GGML_MAX_OP_PARAMS); - memcpy(tensor->op_params, params, params_size); -} - -static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) { - assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); - return ((const int32_t *)(tensor->op_params))[i]; -} - -static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) { - assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); - ((int32_t *)(tensor->op_params))[i] = value; -} - struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, enum ggml_type type, int n_dims, const int64_t * ne) { - return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL); + return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0); } struct ggml_tensor * ggml_new_tensor_1d( @@ -4729,7 +4722,23 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { } struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { - return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL); + return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne); +} + +static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { + GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings + assert(params_size <= GGML_MAX_OP_PARAMS); + memcpy(tensor->op_params, params, params_size); +} + +static int32_t ggml_get_op_params_i32(const struct ggml_tensor * tensor, uint32_t i) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); + return ((const int32_t *)(tensor->op_params))[i]; +} + +static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int32_t value) { + assert(i < GGML_MAX_OP_PARAMS / sizeof(int32_t)); + ((int32_t *)(tensor->op_params))[i] = value; } struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { @@ -5015,14 +5024,13 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, - const struct ggml_tensor * src) { - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); + struct ggml_tensor * src) { + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0); ggml_format_name(result, "%s (view)", src->name); - result->nb[0] = src->nb[0]; - result->nb[1] = src->nb[1]; - result->nb[2] = src->nb[2]; - result->nb[3] = src->nb[3]; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + result->nb[i] = src->nb[i]; + } return result; } @@ -5595,7 +5603,7 @@ struct ggml_tensor * ggml_repeat_back( // ggml_concat -struct ggml_tensor* ggml_concat( +struct ggml_tensor * ggml_concat( struct ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b) { @@ -5862,7 +5870,8 @@ struct ggml_tensor * ggml_rms_norm_inplace( struct ggml_tensor * ggml_rms_norm_back( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b) { + struct ggml_tensor * b, + float eps) { bool is_node = false; if (a->grad) { @@ -5872,6 +5881,8 @@ struct ggml_tensor * ggml_rms_norm_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + ggml_set_op_params(result, &eps, 
sizeof(eps)); + result->op = GGML_OP_RMS_NORM_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; @@ -6201,7 +6212,7 @@ struct ggml_tensor * ggml_reshape( //GGML_ASSERT(false); } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; @@ -6225,7 +6236,7 @@ struct ggml_tensor * ggml_reshape_1d( } const int64_t ne[1] = { ne0 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; @@ -6250,7 +6261,7 @@ struct ggml_tensor * ggml_reshape_2d( } const int64_t ne[2] = { ne0, ne1 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; @@ -6276,7 +6287,7 @@ struct ggml_tensor * ggml_reshape_3d( } const int64_t ne[3] = { ne0, ne1, ne2 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; @@ -6286,7 +6297,6 @@ struct ggml_tensor * ggml_reshape_3d( return result; } - struct ggml_tensor * ggml_reshape_4d( struct ggml_context * ctx, struct ggml_tensor * a, @@ -6304,7 +6314,7 @@ struct ggml_tensor * ggml_reshape_4d( } const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; @@ -6314,34 +6324,12 @@ struct ggml_tensor * ggml_reshape_4d( return result; } -// ggml_view_1d - -static struct ggml_tensor * ggml_view_tensor_offset( +static struct ggml_tensor * ggml_view_impl( struct ggml_context * ctx, struct ggml_tensor * a, int n_dims, const int64_t * ne, size_t offset) { - // don't calculate an offset from an unallocated tensor - void * data = NULL; - if (a->data != NULL) { - data = (char *) a->data + offset; - } - - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data); - - ggml_format_name(result, "%s (view)", a->name); - - ggml_set_op_params(result, &offset, sizeof(offset)); - - return result; -} - -struct ggml_tensor * ggml_view_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - size_t offset) { bool is_node = false; @@ -6349,7 +6337,10 @@ struct ggml_tensor * ggml_view_1d( is_node = true; } - struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset); + ggml_format_name(result, "%s (view)", a->name); + + ggml_set_op_params(result, &offset, sizeof(offset)); result->op = GGML_OP_VIEW; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -6358,6 +6349,19 @@ struct ggml_tensor * ggml_view_1d( return result; } +// ggml_view_1d + +struct ggml_tensor * ggml_view_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + size_t offset) { + + struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset); + + return result; +} + // ggml_view_2d struct ggml_tensor * ggml_view_2d( @@ -6368,24 +6372,14 @@ struct ggml_tensor * ggml_view_2d( size_t nb1, size_t offset) { - bool is_node = false; + const int64_t ne[2] = { ne0, ne1 }; - if (a->grad) { - is_node = true; - } - - const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 }; - - struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset); + struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset); result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; result->nb[3] = result->nb[2]; - result->op = GGML_OP_VIEW; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - return result; } @@ -6401,24 +6395,14 @@ struct ggml_tensor * ggml_view_3d( size_t nb2, size_t offset) { - bool is_node = false; + const int64_t ne[3] = { ne0, ne1, ne2 }; - if (a->grad) { - is_node = true; - } - - const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; - - struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset); + struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset); result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = result->nb[2]*ne2; - result->op = GGML_OP_VIEW; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - return result; } @@ -6436,24 +6420,14 @@ struct ggml_tensor * ggml_view_4d( size_t nb3, size_t offset) { - bool is_node = false; + const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; - if (a->grad) { - is_node = true; - } - - const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 }; - - struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset); + struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset); result->nb[1] = nb1; result->nb[2] = nb2; result->nb[3] = nb3; - result->op = GGML_OP_VIEW; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - return result; } @@ -6640,7 +6614,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[] = { n_past, inplace ? 1 : 0 }; + int32_t params[] = { n_past }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_DIAG_MASK_INF; @@ -6657,7 +6631,6 @@ struct ggml_tensor * ggml_diag_mask_inf( return ggml_diag_mask_inf_impl(ctx, a, n_past, false); } - struct ggml_tensor * ggml_diag_mask_inf_inplace( struct ggml_context * ctx, struct ggml_tensor * a, @@ -6680,7 +6653,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[] = { n_past, inplace ? 1 : 0 }; + int32_t params[] = { n_past }; ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_DIAG_MASK_ZERO; @@ -7097,11 +7070,13 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0( }; struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + ggml_set_op_params_i32(result, 0, stride); + result->op = GGML_OP_CONV_TRANSPOSE_2D; result->grad = is_node ? 
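With the hunks above, ggml_view_1d/2d/3d/4d and the reshape variants are all built on a single ggml_view_impl that records the base tensor, so the view relationship is explicit on the tensor itself instead of being implied by the op type. A minimal sketch of what that means for a caller, assuming a throwaway context:

#include "ggml.h"
#include <assert.h>

static void view_example(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * v = ggml_view_1d(ctx, a, 4, 2*sizeof(float)); // elements 2..5 of a

    assert(v->view_src == a);                                              // base tensor is recorded
    assert(v->data == (void *)((char *) a->data + 2*sizeof(float)));       // data = base data + view_offs

    ggml_free(ctx);
}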
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = ggml_new_i32(ctx, stride); return result; } @@ -9446,6 +9421,8 @@ static void ggml_compute_forward_div_f32( #ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_div_f32); + vDSP_vdiv( (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1, @@ -10752,7 +10729,8 @@ static void ggml_compute_forward_rms_norm_back_f32( GGML_TENSOR_BINARY_OP_LOCALS; - const float eps = 1e-6f; // TODO: make this a parameter + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); // TODO: optimize for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -11930,8 +11908,8 @@ static void ggml_compute_forward_diag_mask_f32( const int ith = params->ith; const int nth = params->nth; - const int n_past = ((int32_t *) dst->op_params)[0]; - const bool inplace = (bool)((int32_t *) dst->op_params)[1]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const bool inplace = src0->data == dst->data; GGML_ASSERT(n_past >= 0); @@ -12142,6 +12120,7 @@ static void ggml_compute_forward_soft_max_back_f32( // dx = J * dy // dxk = sum_i(Jki * dyi) // dxk = sum_i(-yk*yi * dyi) - (-yk*yk)*dyk + (yk - yk*yk)*dyk + // dxk = sum_i(-yk*yi * dyi) + yk*yk*dyk + yk*dyk - yk*yk*dyk // dxk = sum_i(-yk*yi * dyi) + yk*dyk // dxk = -yk * sum_i(yi * dyi) + yk*dyk // dxk = -yk * dot(y, dy) + yk*dyk @@ -13497,7 +13476,6 @@ static void ggml_compute_forward_conv_transpose_2d( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -13557,7 +13535,7 @@ static void ggml_compute_forward_conv_transpose_2d( return; } - const int32_t stride = ((const int32_t*)(opt0->data))[0]; + const int32_t stride = ggml_get_op_params_i32(dst, 0); // total patches in dst const int np = ne2; @@ -13570,7 +13548,7 @@ static void ggml_compute_forward_conv_transpose_2d( const int ip1 = MIN(ip0 + dp, np); ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - ggml_fp16_t * const wdata_src = (ggml_fp16_t *) params->wdata + nk; + ggml_fp16_t * const wdata_src = wdata + nk; for (int i2 = ip0; i2 < ip1; i2++) { // Cout float * dst_data = (float *)((char *) dst->data + i2*nb2); @@ -13582,9 +13560,8 @@ static void ggml_compute_forward_conv_transpose_2d( for (int i00 = 0; i00 < ne00; i00++) { float v = 0; ggml_vec_dot_f16(ne03, &v, - (ggml_fp16_t *) wdata_src + i1n, - (ggml_fp16_t *) wdata_kernel + i01*ne00*ne03 + i00*ne03); - + wdata_src + i1n, + wdata_kernel + i01*ne00*ne03 + i00*ne03); dst_data[(i11*stride + i01)*ne0 + i10*stride + i00] += v; } } @@ -13934,7 +13911,7 @@ static void ggml_compute_forward_flash_attn_f32( vvexpf(S, S, &Mup); ggml_vec_sum_f32(Mup, &sum, S); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { @@ -13944,9 +13921,13 @@ static void ggml_compute_forward_flash_attn_f32( if (SS[j] == -INFINITY) { SS[j] = 0.0f; } else { +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SS[j] - max); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif sump[j] += (ggml_float)val; SS[j] = val; } @@ -14524,7 +14505,7 @@ static void 
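The extra comment line added to ggml_compute_forward_soft_max_back_f32 completes the derivation: expanding the softmax Jacobian and cancelling terms gives dx_k = y_k*(dy_k - dot(y, dy)). Written out as a plain loop, the identity the kernel implements is:

#include <stddef.h>

// dx_k = y_k * (dy_k - dot(y, dy)), with y = softmax(x)
static void soft_max_back_row(size_t n, float * dx, const float * y, const float * dy) {
    float dot_y_dy = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        dot_y_dy += y[i]*dy[i];
    }
    for (size_t k = 0; k < n; ++k) {
        dx[k] = y[k]*(dy[k] - dot_y_dy);
    }
}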
ggml_compute_forward_flash_attn_back_f32( vvexpf(SM, SM, &Mup); ggml_vec_sum_f32(Mup, &sum, SM); #else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; + uint16_t scvt[GGML_SOFT_MAX_UNROLL]; UNUSED(scvt); ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { @@ -14535,9 +14516,13 @@ static void ggml_compute_forward_flash_attn_back_f32( if (SR[j] == -INFINITY) { SW[j] = 0.0f; } else { +#ifndef GGML_FLASH_ATTN_EXP_FP16 + const float val = expf(SR[j] - max); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(SR[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); +#endif sump[j] += (ggml_float)val; SW[j] = val; } @@ -15275,6 +15260,8 @@ static void ggml_compute_forward_cross_entropy_loss_f32( const int nc = src0->ne[0]; const int nr = ggml_nrows(src0); + GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); + if (params->type == GGML_TASK_INIT) { if (ith == 0) { memset(sums, 0, sizeof(float) * (nth + nth * nc)); @@ -15286,7 +15273,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( if (ith == 0) { float * dp = (float *) dst->data; ggml_vec_sum_f32(nth, dp, sums); - dp[0] *= -1.0f; + dp[0] *= -1.0f / (float) nr; } return; } @@ -15303,7 +15290,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( for (int i1 = ir0; i1 < ir1; i1++) { float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); - float * st = (float *) params->wdata + nth + ith*nc; + float * st = ((float *) params->wdata) + nth + ith*nc; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -15318,15 +15305,19 @@ static void ggml_compute_forward_cross_entropy_loss_f32( float max = -INFINITY; ggml_vec_max_f32(nc, &max, s0); - uint16_t scvt; + uint16_t scvt; UNUSED(scvt); for (int i = 0; i < nc; i++) { if (s0[i] == -INFINITY) { st[i] = 0.0f; } else { - // const float val = (s0[i] == -INFINITY) ? 
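The new GGML_ASSERT on params->wsize spells out the scratch layout the forward cross-entropy pass already assumed: nth partial sums first, then one row of nc floats per thread (st = wdata + nth + ith*nc). As a sketch, the required work-buffer size is therefore:

#include <stddef.h>

// bytes of work buffer needed by ggml_compute_forward_cross_entropy_loss_f32:
// nth partial sums followed by nth scratch rows of nc floats
static size_t cross_entropy_loss_wsize(int nth, int nc) {
    return sizeof(float) * ((size_t) nth + (size_t) nth * (size_t) nc);
}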
0.0 : exp(s0[i] - max); +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif sum += (ggml_float)val; st[i] = val; } @@ -15342,7 +15333,9 @@ static void ggml_compute_forward_cross_entropy_loss_f32( ggml_vec_log_f32(nc, st, st); ggml_vec_mul_f32(nc, st, st, s1); - ggml_vec_sum_f32(nc, sums + ith, st); + float st_sum = 0; + ggml_vec_sum_f32(nc, &st_sum, st); + sums[ith] += st_sum; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -15392,7 +15385,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( return; } - const float eps = 1e-9f; + const double eps = 1e-9; // TODO: handle transposed/permuted matrices const int64_t nc = src0->ne[0]; @@ -15411,7 +15404,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]); float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]); float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]); - float * sm = (float *) params->wdata + ith*nc; #ifndef NDEBUG for (int i = 0; i < nc; ++i) { @@ -15420,54 +15412,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( assert(!isnan(s1[i])); } #endif - // step by step explanation: - { - //float * sums = (float *) params->wdata; - - // forward pass with annotated gradients from backward pass - // (built by going in reverse operation order, adding to gradients of current operation args) - // st0 = exp(s0-max(s0)) grad[st0] = grad[st1]*(1.0 - eps)/sum - // from softmax_back: grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) - // ggml_vec_scale_f32(nc, st, sum); // st1 = st0*/sum = softmax(s0) grad[st1] = grad[st2]*(1.0 - eps) - // ggml_vec_scale_f32(nc, st, (1.0f - eps)); // st2 = st1*(1.0 - eps) grad[st2] = grad[st3] - // ggml_vec_add1_f32(nc, st, st, eps); // st3 = st2 + eps grad[st3] = grad[st4]/st3 - // ggml_vec_log_f32(nc, st, st); // st4 = log(st3) grad[st4] = grad[st5] * s1 - // ggml_vec_mul_f32(nc, st, st, s1); // st5 = st4 * s1 grad[st5] = grad[sums[ith]] - // ggml_vec_sum_f32(nc, sums + ith, st); // sums[ith] = st5 grad[sums[ith]] = grad[cross_entropy_loss] = -grad[cel] - - // substitute into grad[st1], because we can reuse softmax_back from this point on - // grad[st1] = -grad[cel]*s1*(1.0 - eps)/(eps + softmax(s0)*(1.0 - eps)) - // postorder: - // grad[st1] := softmax(s0) - // grad[st1] := grad[st1]*(1.0 - eps) - // grad[st1] := grad[st1] + eps - // grad[st1] := s1 / grad[st1] - // grad[st1] := grad[st1]*(1.0-eps)*-grad[cel] - - // src0 gradients by going through softmax_back - // grad[s0] = st1_k * (grad[st1]_k - dot(st1, grad[st1])) - // from softmax_back: - // dxk = yk * (dyk - dot(y, dy)) - // dot_y_dy := dot(y, dy) - // dx := dy - // dx := dx - dot_y_dy - // dx := dx * y - // postorder: - // dot_st1_dst1 := dot(st1, grad[st1]) - // grad[s0] := grad[st1] - // grad[s0] := grad[s0] - dot_st1_dst1 - // grad[s0] := grad[s0] * st1 - - // prepend postorder from grad[st1] directly using grad[s0] as memory location, as we will grad[s0] := grad[st1] - // sm := softmax(s0) - // grad[s0] := sm*(1.0 - eps) - // grad[s0] := grad[s0] + eps - // grad[s0] := s1 / grad[s0] - // grad[s0] := grad[s0]*(1.0-eps)*-grad[cel] - // dot_st1_dst1 := dot(sm, grad[s0]) - // grad[s0] := grad[s0] - dot_st1_dst1 - // grad[s0] := grad[s0] * sm - } // soft_max ggml_float sum = 0.0; @@ -15475,39 +15419,37 @@ static void 
ggml_compute_forward_cross_entropy_loss_back_f32( float max = -INFINITY; ggml_vec_max_f32(nc, &max, s0); - uint16_t scvt; + uint16_t scvt; UNUSED(scvt); for (int i = 0; i < nc; i++) { if (s0[i] == -INFINITY) { - sm[i] = 0.0f; + ds0[i] = 0.0f; } else { - // const float val = (s0[i] == -INFINITY) ? 0.0 : exp(s0[i] - max); +#ifndef GGML_CROSS_ENTROPY_EXP_FP16 + const float s = s0[i] - max; + const float val = expf(s); +#else ggml_fp16_t s = GGML_FP32_TO_FP16(s0[i] - max); memcpy(&scvt, &s, sizeof(scvt)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); +#endif sum += (ggml_float)val; - sm[i] = val; + ds0[i] = val; } } assert(sum > 0.0); - sum = 1.0/sum; + sum = (1.0 - eps)/sum; } - float dot_st1_dst1 = 0; - ggml_vec_scale_f32(nc, sm, sum); - ggml_vec_cpy_f32 (nc, ds0, sm); - ggml_vec_scale_f32(nc, ds0, (1.0f - eps)); - ggml_vec_add1_f32 (nc, ds0, ds0, eps); - ggml_vec_div_f32 (nc, ds0, s1, ds0); - ggml_vec_scale_f32(nc, ds0, -(1.0f - eps)*d[0]); - ggml_vec_dot_f32 (nc, &dot_st1_dst1, sm, ds0); - ggml_vec_acc1_f32 (nc, ds0, -dot_st1_dst1); - ggml_vec_mul_f32 (nc, ds0, ds0, sm); + // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr + ggml_vec_scale_f32(nc, ds0, sum); + ggml_vec_add1_f32(nc, ds0, ds0, eps); + ggml_vec_sub_f32(nc, ds0, ds0, s1); + ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr); + #ifndef NDEBUG for (int i = 0; i < nc; ++i) { - assert(!isnan(sm[i])); - assert(!isinf(sm[i])); assert(!isnan(ds0[i])); assert(!isinf(ds0[i])); } @@ -15731,7 +15673,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_CONV_TRANSPOSE_2D: { - ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_POOL_1D: { @@ -16062,9 +16004,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { + float eps; + memcpy(&eps, tensor->op_params, sizeof(float)); + src0->grad = ggml_add_impl(ctx, src0->grad, - ggml_rms_norm_back(ctx, src0, tensor->grad), + ggml_rms_norm_back(ctx, src0, tensor->grad, eps), inplace); } } break; @@ -16832,9 +16777,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { return result; } -struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { - struct ggml_cgraph result = *gf; - +void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep) { GGML_ASSERT(gf->n_nodes > 0); // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph @@ -16858,15 +16801,19 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg } } - for (int i = gf->n_nodes - 1; i >= 0; i--) { + for (int i = 0; i < gf->n_nodes; i++) { struct ggml_tensor * node = gf->nodes[i]; if (node->is_param) { GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); - ggml_build_forward_expand(&result, node->grad); + ggml_build_forward_expand(gb, node->grad); } } +} +struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { + struct ggml_cgraph result = *gf; + ggml_build_backward_expand(ctx, gf, &result, keep); return result; } @@ -17542,10 +17489,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { 
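The backward cross-entropy pass above drops the sm scratch row and the long step-by-step derivation in favour of the direct formula noted in the new comment: grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss) / nr, with the same eps smoothing of the softmax as the forward pass (the forward loss is now also averaged over the nr rows). A minimal per-row sketch of that computation, not a verbatim extract:

#include <math.h>
#include <stddef.h>

static void cross_entropy_back_row(size_t nc, float * ds0, const float * s0,
                                   const float * s1, float d, float nr, float eps) {
    // softmax(s0) with max-subtraction for numerical stability
    float max = -INFINITY;
    for (size_t i = 0; i < nc; ++i) if (s0[i] > max) max = s0[i];

    double sum = 0.0;
    for (size_t i = 0; i < nc; ++i) { ds0[i] = expf(s0[i] - max); sum += ds0[i]; }

    const double scale = (1.0 - eps)/sum;
    for (size_t i = 0; i < nc; ++i) {
        ds0[i] = (float)(ds0[i]*scale + eps) - s1[i];  // smoothed softmax minus target
        ds0[i] *= d/nr;                                // upstream gradient, averaged over rows
    }
}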
n_tasks = n_threads; - - size_t cur = ggml_type_size(node->type)*node->src[0]->ne[0]*n_tasks; - - work_size = MAX(work_size, cur); } break; case GGML_OP_NONE: { @@ -18423,14 +18366,16 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { GGML_ASSERT(ggml_is_scalar(f)); // these will store the parameters we want to optimize struct ggml_tensor * ps[GGML_MAX_PARAMS]; int np = 0; - int nx = 0; + int64_t nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { if (gf->nodes[i]->is_param) { GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); @@ -18449,31 +18394,32 @@ static enum ggml_opt_result ggml_opt_adam( } // constants - const float sched = params.adam.sched; - const float decay = params.adam.decay * sched; - const float alpha = params.adam.alpha * sched; + float sched = params.adam.sched; + const float alpha = params.adam.alpha; + const float decay = params.adam.decay * alpha; const float beta1 = params.adam.beta1; const float beta2 = params.adam.beta2; const float eps = params.adam.eps; + const float gclip = params.adam.gclip; + const int decay_min_ndim = params.adam.decay_min_ndim; - float * x = opt->adam.x->data; // view of the parameters - float * g1 = opt->adam.g1->data; // gradient - float * g2 = opt->adam.g2->data; // gradient squared float * m = opt->adam.m->data; // first moment float * v = opt->adam.v->data; // second moment - float * mh = opt->adam.mh->data; // first moment hat - float * vh = opt->adam.vh->data; // second moment hat float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values - // update view - ggml_opt_get_params(np, ps, x); + if (callback) { + callback(callback_data, &sched); + } // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + ggml_graph_compute(gb, &cplan); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -18481,6 +18427,9 @@ static enum ggml_opt_result ggml_opt_adam( pf[opt->iter % params.past] = opt->adam.fx_prev; } + opt->loss_before = opt->adam.fx_prev; + opt->loss_after = opt->adam.fx_prev; + // initialize if (opt->just_initialized) { opt->adam.n_no_improvement = 0; @@ -18513,50 +18462,55 @@ static enum ggml_opt_result ggml_opt_adam( UNUSED(t_start_cpu); { - // update the gradient - ggml_opt_get_grad(np, ps, g1); + float gnorm = 1.0f; + if (gclip > 0.0f) { + // gradient clipping + ggml_float sum = 0.0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]); + for (int64_t j = 0; j < ne; ++j) { + float g = ggml_get_f32_1d(ps[p]->grad, j); + sum += (ggml_float)(g*g); + } + } + ggml_float norm = sqrt(sum); + if (norm > (ggml_float) gclip) { + gnorm = (float) ((ggml_float) gclip / norm); + } + } + const float beta1h = alpha*sched/(1.0f - powf(beta1, opt->iter)); + const float beta2h = 1.0f/(1.0f - powf(beta2, opt->iter)); + int64_t i = 0; + for (int p = 0; p < np; ++p) { + const int64_t ne = ggml_nelements(ps[p]); + const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? 
decay : 0.0f) * sched; + for (int64_t j = 0; j < ne; ++j) { + float x = ggml_get_f32_1d(ps[p], j); + float g = ggml_get_f32_1d(ps[p]->grad, j)*gnorm; + m[i] = m[i]*beta1 + g*(1.0f - beta1); + v[i] = v[i]*beta2 + g*g*(1.0f - beta2); + float mh = m[i]*beta1h; + float vh = v[i]*beta2h; + vh = sqrtf(vh) + eps; + x = x*(1.0f - p_decay) - mh/vh; + ggml_set_f32_1d(ps[p], j, x); + ++i; + } + } + } - // m_t = beta1*m_t-1 + (1 - beta1)*g_t - ggml_vec_scale_f32(nx, m, beta1); - ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1); - - // g2 = g1^2 - ggml_vec_sqr_f32 (nx, g2, g1); - - // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2 - ggml_vec_scale_f32(nx, v, beta2); - ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2); - - // m^hat = m_t / (1 - beta1^t) - // v^hat = v_t / (1 - beta2^t) - // x_t = x_t-1 - sched*(alpha*m^hat/(sqrt(v^hat) + eps) + decay*x_t-1) - // x_t = x_t-1 - sched*alpha*m^hat/(sqrt(v^hat) + eps) - sched*decay*x_t-1 - // x_t = x_t-1*(1-sched*decay) - sched*alpha*m^hat/(sqrt(v^hat) + eps) - // x_t = x_t-1*(1-sched*decay) + sched*decay*(-alpha/decay)*m^hat/(sqrt(v^hat) + eps) - // x_t = mix(x_t-1, (-alpha/decay)*m^hat/(sqrt(v^hat) + eps), sched*decay) - ggml_vec_cpy_f32 (nx, mh, m); - ggml_vec_cpy_f32 (nx, vh, v); - - ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, opt->iter))); - ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, opt->iter))); - - ggml_vec_sqrt_f32 (nx, vh, vh); - ggml_vec_acc1_f32 (nx, vh, eps); - - ggml_vec_div_f32 (nx, mh, mh, vh); - ggml_vec_scale_f32(nx, x, 1.0f - decay); - ggml_vec_sub_f32 (nx, x, x, mh); - - // update the parameters - ggml_opt_set_params(np, ps, x); + if (callback) { + callback(callback_data, &sched); } ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); + ggml_graph_compute(gb, &cplan); const float fx = ggml_get_f32_1d(f, 0); + opt->loss_after = fx; + // check convergence if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { @@ -18625,7 +18579,6 @@ struct ggml_lbfgs_iteration_data { }; static enum ggml_opt_result linesearch_backtracking( - struct ggml_context * ctx, const struct ggml_opt_params * params, int nx, float * x, @@ -18637,8 +18590,11 @@ static enum ggml_opt_result linesearch_backtracking( struct ggml_tensor * f, struct ggml_cgraph * gf, struct ggml_cgraph * gb, + struct ggml_cplan * cplan, const int np, - struct ggml_tensor * ps[]) { + struct ggml_tensor * ps[], + ggml_opt_callback callback, + void * callback_data) { int count = 0; float width = 0.0f; @@ -18667,6 +18623,12 @@ static enum ggml_opt_result linesearch_backtracking( dgtest = params->lbfgs.ftol*dginit; while (true) { + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, &sched); + } + ggml_vec_cpy_f32(nx, x, xp); ggml_vec_mad_f32(nx, x, d, *step); @@ -18677,7 +18639,7 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params->n_threads); + ggml_graph_compute(gb, cplan); ggml_opt_get_grad(np, ps, g); @@ -18737,7 +18699,9 @@ static enum ggml_opt_result ggml_opt_lbfgs( struct ggml_opt_params params, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= 
params.lbfgs.wolfe) { @@ -18769,6 +18733,10 @@ static enum ggml_opt_result ggml_opt_lbfgs( opt->iter = iter; } + struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; + float * x = opt->lbfgs.x->data; // current parameters float * xp = opt->lbfgs.xp->data; // previous parameters float * g = opt->lbfgs.g->data; // current gradient @@ -18790,6 +18758,12 @@ static enum ggml_opt_result ggml_opt_lbfgs( float * lm_s = opt->lbfgs.lms->data; float * lm_y = opt->lbfgs.lmy->data; + if (callback) { + // LBFG-S does not support learning rate -> ignore learning schedule + float sched = 0; + callback(callback_data, &sched); + } + // evaluate the function value and its gradient { ggml_opt_set_params(np, ps, x); @@ -18797,11 +18771,14 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); + ggml_graph_compute(gb, &cplan); ggml_opt_get_grad(np, ps, g); fx = ggml_get_f32_1d(f, 0); + + opt->loss_before = fx; + opt->loss_after = fx; } // search direction = -gradient @@ -18856,7 +18833,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); + ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, &cplan, np, ps, callback, callback_data); if (ls < 0) { // linesearch failed - go back to the previous point and return @@ -18866,6 +18843,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( return ls; } + opt->loss_after = fx; + ggml_vec_norm_f32(nx, &xnorm, x); ggml_vec_norm_f32(nx, &gnorm, g); @@ -18923,7 +18902,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( // ys = y^t \cdot s -> 1 / \rho. // yy = y^t \cdot y. // - ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0] *nx]); + ggml_vec_dot_f32(nx, &ys, &lm_y[end[0]*nx], &lm_s[end[0]*nx]); ggml_vec_dot_f32(nx, &yy, &lm_y[end[0]*nx], &lm_y[end[0]*nx]); lm_ys[end[0]] = ys; @@ -18986,13 +18965,15 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { .adam = { .n_iter = 10000, .sched = 1.000f, - .decay = 0.001f, + .decay = 0.0f, + .decay_min_ndim = 2, .alpha = 0.001f, .beta1 = 0.9f, .beta2 = 0.999f, .eps = 1e-8f, .eps_f = 1e-5f, .eps_g = 1e-3f, + .gclip = 0.0f, }, }; } break; @@ -19042,23 +19023,13 @@ GGML_API void ggml_opt_init( switch (opt->params.type) { case GGML_OPT_ADAM: { - opt->adam.x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->adam.m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->adam.v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); - opt->adam.vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx); opt->adam.pf = params.past > 0 ? 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past) : NULL; - ggml_set_zero(opt->adam.x); - ggml_set_zero(opt->adam.g1); - ggml_set_zero(opt->adam.g2); ggml_set_zero(opt->adam.m); ggml_set_zero(opt->adam.v); - ggml_set_zero(opt->adam.mh); - ggml_set_zero(opt->adam.vh); if (opt->adam.pf) { ggml_set_zero(opt->adam.pf); } @@ -19142,7 +19113,7 @@ enum ggml_opt_result ggml_opt_resume( *gf = ggml_build_forward (f); *gb = ggml_build_backward(ctx, gf, true); - return ggml_opt_resume_g(ctx, opt, f, gf, gb); + return ggml_opt_resume_g(ctx, opt, f, gf, gb, NULL, NULL); } enum ggml_opt_result ggml_opt_resume_g( @@ -19150,7 +19121,9 @@ enum ggml_opt_result ggml_opt_resume_g( struct ggml_opt_context * opt, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data) { // build forward + backward compute graphs enum ggml_opt_result result = GGML_OPT_OK; @@ -19158,11 +19131,11 @@ enum ggml_opt_result ggml_opt_resume_g( switch (opt->params.type) { case GGML_OPT_ADAM: { - result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb); + result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; case GGML_OPT_LBFGS: { - result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb); + result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; } @@ -19394,7 +19367,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i //////////////////////////////////////////////////////////////////////////////// struct gguf_str { - uint32_t n; + uint64_t n; // GGUFv2 char * data; }; @@ -19408,9 +19381,12 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { [GGUF_TYPE_FLOAT32] = sizeof(float), [GGUF_TYPE_BOOL] = sizeof(bool), [GGUF_TYPE_STRING] = sizeof(struct gguf_str), + [GGUF_TYPE_UINT64] = sizeof(uint64_t), + [GGUF_TYPE_INT64] = sizeof(int64_t), + [GGUF_TYPE_FLOAT64] = sizeof(double), [GGUF_TYPE_ARRAY] = 0, // undefined }; -static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10"); +static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { [GGUF_TYPE_UINT8] = "u8", @@ -19423,8 +19399,11 @@ static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { [GGUF_TYPE_BOOL] = "bool", [GGUF_TYPE_STRING] = "str", [GGUF_TYPE_ARRAY] = "arr", + [GGUF_TYPE_UINT64] = "u64", + [GGUF_TYPE_INT64] = "i64", + [GGUF_TYPE_FLOAT64] = "f64", }; -static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10"); +static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); union gguf_value { uint8_t uint8; @@ -19434,6 +19413,9 @@ union gguf_value { uint32_t uint32; int32_t int32; float float32; + uint64_t uint64; + int64_t int64; + double float64; bool bool_; struct gguf_str str; @@ -19441,7 +19423,7 @@ union gguf_value { struct { enum gguf_type type; - uint32_t n; + uint64_t n; // GGUFv2 void * data; } arr; }; @@ -19449,8 +19431,6 @@ union gguf_value { struct gguf_kv { struct gguf_str key; - uint32_t n_bytes; // TODO: is this actually needed? 
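+    // which member of `value` is active is determined by `type`, and per-type element sizes
+    // come from GGUF_TYPE_SIZE, so a separate per-kv byte count is not needed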
- enum gguf_type type; union gguf_value value; }; @@ -19458,15 +19438,15 @@ struct gguf_kv { struct gguf_header { uint32_t magic; uint32_t version; - uint32_t n_tensors; - uint32_t n_kv; + uint64_t n_tensors; // GGUFv2 + uint64_t n_kv; // GGUFv2 }; struct gguf_tensor_info { struct gguf_str name; uint32_t n_dims; - uint32_t ne[GGML_MAX_DIMS]; + uint64_t ne[GGML_MAX_DIMS]; enum ggml_type type; @@ -19497,19 +19477,32 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) return n == size; } -static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { +// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 +static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) { p->n = 0; p->data = NULL; bool ok = true; - // TODO: how to avoid mallocs for strings? ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); ok = ok && gguf_fread_el(file, p->data, p->n, offset); return ok; } +static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) { + p->n = 0; + p->data = NULL; + + bool ok = true; + + uint32_t n = 0; + ok = ok && gguf_fread_el(file, &n, sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n; + ok = ok && gguf_fread_el(file, p->data, p->n, offset); + + return ok; +} + struct gguf_context * gguf_init_empty(void) { struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); @@ -19565,8 +19558,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ctx->data = NULL; ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); - ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); - ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n_tensors = 0; + uint32_t n_kv = 0; + + ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset); + ok = ok && gguf_fread_el(file, &n_kv, sizeof(n_kv), &offset); + + ctx->header.n_tensors = n_tensors; + ctx->header.n_kv = n_kv; + } else { + ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); + ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + } if (!ok) { fprintf(stderr, "%s: failed to read header\n", __func__); @@ -19576,18 +19582,23 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } } + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur; + if (ctx->header.version == 1) { + gguf_fread_str = gguf_fread_str_v1; + } + // read the kv pairs { - ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv)); + ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { struct gguf_kv * kv = &ctx->kv[i]; //fprintf(stderr, "%s: reading kv %d\n", __func__, i); - ok = ok && gguf_fread_str(file, &kv->key, &offset); - //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset); - ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset); + ok = ok && gguf_fread_str(file, &kv->key, &offset); + ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset); //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data); @@ -19599,12 +19610,23 @@ 
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break; case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break; case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break; + case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break; + case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break; + case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break; case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break; case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break; case GGUF_TYPE_ARRAY: { ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); - ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); + + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t n = 0; + ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset); + kv->value.arr.n = n; + } else { + ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); + } switch (kv->value.arr.type) { case GGUF_TYPE_UINT8: @@ -19614,6 +19636,9 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p case GGUF_TYPE_UINT32: case GGUF_TYPE_INT32: case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: case GGUF_TYPE_BOOL: { kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); @@ -19648,7 +19673,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the tensor infos { - ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; @@ -19660,7 +19685,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ok = ok && gguf_fread_str(file, &info->name, &offset); ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); for (uint32_t j = 0; j < info->n_dims; ++j) { - ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); + if (ctx->header.version == 1) { + // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023 + uint32_t t = 0; + ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset); + info->ne[j] = t; + } else { + ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); + } } ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); @@ -19842,7 +19874,7 @@ void gguf_free(struct gguf_context * ctx) { } } - GGML_ALIGNED_FREE(ctx->kv); + free(ctx->kv); } if (ctx->infos) { @@ -19854,7 +19886,7 @@ void gguf_free(struct gguf_context * ctx) { } } - GGML_ALIGNED_FREE(ctx->infos); + free(ctx->infos); } GGML_ALIGNED_FREE(ctx); @@ -19954,6 +19986,18 @@ float gguf_get_val_f32(struct gguf_context * ctx, int i) { return ctx->kv[i].value.float32; } +uint64_t 
gguf_get_val_u64(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint64; +} + +int64_t gguf_get_val_i64(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int64; +} + +double gguf_get_val_f64(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.float64; +} + bool gguf_get_val_bool(struct gguf_context * ctx, int i) { return ctx->kv[i].value.bool_; } @@ -20000,7 +20044,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { const int n_kv = gguf_get_n_kv(ctx); ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); - ctx->kv[n_kv].key.n = strlen(key) + 1; + ctx->kv[n_kv].key.n = strlen(key); ctx->kv[n_kv].key.data = strdup(key); ctx->header.n_kv++; @@ -20056,6 +20100,27 @@ void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) { ctx->kv[idx].value.float32 = val; } +void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT64; + ctx->kv[idx].value.uint64 = val; +} + +void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT64; + ctx->kv[idx].value.int64 = val; +} + +void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_FLOAT64; + ctx->kv[idx].value.float64 = val; +} + void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) { const int idx = gguf_get_or_add_key(ctx, key); @@ -20067,7 +20132,7 @@ void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * const int idx = gguf_get_or_add_key(ctx, key); ctx->kv[idx].type = GGUF_TYPE_STRING; - ctx->kv[idx].value.str.n = strlen(val) + 1; + ctx->kv[idx].value.str.n = strlen(val); ctx->kv[idx].value.str.data = strdup(val); } @@ -20090,7 +20155,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str)); for (int i = 0; i < n; i++) { struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i]; - str->n = strlen(data[i]) + 1; + str->n = strlen(data[i]); str->data = strdup(data[i]); } } @@ -20106,6 +20171,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break; case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break; case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break; + case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break; + case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break; + case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break; case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break; case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break; case GGUF_TYPE_ARRAY: @@ -20134,7 +20202,7 @@ void gguf_add_tensor( const int idx = ctx->header.n_tensors; ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); - ctx->infos[idx].name.n = strlen(tensor->name) + 1; + ctx->infos[idx].name.n = strlen(tensor->name); ctx->infos[idx].name.data = 
strdup(tensor->name); for (int i = 0; i < GGML_MAX_DIMS; ++i) { @@ -20267,6 +20335,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break; case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break; case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break; + case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break; + case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break; + case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break; case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break; case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break; case GGUF_TYPE_ARRAY: @@ -20282,6 +20353,9 @@ static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, case GGUF_TYPE_UINT32: case GGUF_TYPE_INT32: case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: case GGUF_TYPE_BOOL: { gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); @@ -20516,6 +20590,14 @@ int ggml_cpu_has_sse3(void) { #endif } +int ggml_cpu_has_ssse3(void) { +#if defined(__SSSE3__) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_vsx(void) { #if defined(__POWER9_VECTOR__) return 1; diff --git a/ggml.h b/ggml.h index 421c0df60..c936823d6 100644 --- a/ggml.h +++ b/ggml.h @@ -130,13 +130,16 @@ // The data of the tensor is accessed via the "data" pointer. For example: // // { -// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); +// const int nx = 2; +// const int ny = 3; // -// // a[2, 1] = 1.0f; -// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; +// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny); // -// // a[0, 2] = 2.0f; -// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; +// for (int y = 0; y < ny; y++) { +// for (int x = 0; x < nx; x++) { +// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y; +// } +// } // // ... 
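+//     // the nb[] values are strides in bytes, so the same arithmetic reads an element back:
+//     // const float v = *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]);
+//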
// } @@ -211,12 +214,17 @@ #define GGML_MAX_OP_PARAMS 32 #define GGML_DEFAULT_N_THREADS 4 +#if UINTPTR_MAX == 0xFFFFFFFF + #define GGML_MEM_ALIGN 4 +#else + #define GGML_MEM_ALIGN 16 +#endif #define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_ABORTED 1 #define GGUF_MAGIC 0x46554747 // "GGUF" -#define GGUF_VERSION 1 +#define GGUF_VERSION 2 #define GGUF_DEFAULT_ALIGNMENT 32 @@ -471,6 +479,9 @@ extern "C" { int64_t perf_cycles; int64_t perf_time_us; + struct ggml_tensor * view_src; + size_t view_offs; + void * data; char name[GGML_MAX_NAME]; @@ -653,7 +664,7 @@ extern "C" { GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); - GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); + GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); @@ -944,11 +955,11 @@ extern "C" { // a - x // b - dy - // TODO: update with configurable eps GGML_API struct ggml_tensor * ggml_rms_norm_back( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + struct ggml_tensor * b, + float eps); // A: n columns, m rows // B: n columns, p rows (i.e. we transpose it internally) @@ -1604,7 +1615,8 @@ extern "C" { struct ggml_tensor * tensor); - GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); @@ -1669,6 +1681,8 @@ extern "C" { GGML_LINESEARCH_INVALID_PARAMETERS, }; + typedef void (*ggml_opt_callback)(void * data, float * sched); + // optimization parameters // // see ggml.c (ggml_opt_default_params) for default values @@ -1704,12 +1718,14 @@ extern "C" { float sched; // schedule multiplier (fixed, decay or warmup) float decay; // weight decay for AdamW, use 0.0f to disable + int decay_min_ndim; // minimum number of tensor dimension to apply weight decay float alpha; // learning rate float beta1; float beta2; float eps; // epsilon for numerical stability float eps_f; // epsilon for convergence test float eps_g; // epsilon for convergence test + float gclip; // gradient clipping } adam; // LBFGS parameters @@ -1737,14 +1753,12 @@ extern "C" { bool just_initialized; + float loss_before; + float loss_after; + struct { - struct ggml_tensor * x; // view of the parameters - struct ggml_tensor * g1; // gradient - struct ggml_tensor * g2; // gradient squared struct ggml_tensor * m; // first moment struct ggml_tensor * v; // second moment - struct ggml_tensor * mh; // first moment hat - struct ggml_tensor * vh; // second moment hat struct ggml_tensor * pf; // past function values float fx_best; float fx_prev; @@ -1781,10 +1795,10 @@ extern "C" { // initialize optimizer context GGML_API void ggml_opt_init( - struct ggml_context * ctx, + struct ggml_context * ctx, struct ggml_opt_context * opt, - struct ggml_opt_params params, - int64_t nx); + struct ggml_opt_params params, + int64_t nx); // continue optimizing the function 
defined by the tensor f GGML_API enum ggml_opt_result ggml_opt_resume( @@ -1798,7 +1812,9 @@ extern "C" { struct ggml_opt_context * opt, struct ggml_tensor * f, struct ggml_cgraph * gf, - struct ggml_cgraph * gb); + struct ggml_cgraph * gb, + ggml_opt_callback callback, + void * callback_data); // // quantization @@ -1827,6 +1843,9 @@ extern "C" { GGUF_TYPE_BOOL = 7, GGUF_TYPE_STRING = 8, GGUF_TYPE_ARRAY = 9, + GGUF_TYPE_UINT64 = 10, + GGUF_TYPE_INT64 = 11, + GGUF_TYPE_FLOAT64 = 12, GGUF_TYPE_COUNT, // marks the end of the enum }; @@ -1867,6 +1886,9 @@ extern "C" { GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i); GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i); GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i); + GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i); + GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i); + GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i); GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i); GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i); GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i); @@ -1886,6 +1908,9 @@ extern "C" { GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); + GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val); + GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val); + GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val); GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val); GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val); GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n); @@ -1944,6 +1969,7 @@ extern "C" { GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_sse3 (void); + GGML_API int ggml_cpu_has_ssse3 (void); GGML_API int ggml_cpu_has_vsx (void); // diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index f4db7001b..838a2c0f8 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -13,7 +13,7 @@ from typing import Any, IO, List, Optional # GGUF_MAGIC = 0x46554747 -GGUF_VERSION = 1 +GGUF_VERSION = 2 GGUF_DEFAULT_ALIGNMENT = 32 # general @@ -365,6 +365,9 @@ class GGUFValueType(IntEnum): BOOL = 7 STRING = 8 ARRAY = 9 + UINT64 = 10 + INT64 = 11 + FLOAT64 = 12 @staticmethod def get_type(val): @@ -378,6 +381,7 @@ class GGUFValueType(IntEnum): return GGUFValueType.BOOL elif isinstance(val, int): return GGUFValueType.INT32 + # TODO: need help with 64-bit types in Python else: print("Unknown type: "+str(type(val))) sys.exit() @@ -400,8 +404,8 @@ class GGUFWriter: def write_header_to_file(self): self.fout.write(struct.pack(" -#include -#include #endif #include "llama.h" @@ -62,6 +59,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -114,12 +114,17 @@ static size_t utf8_len(char src) { } void replace_all(std::string & s, const std::string & search, const std::string & replace) { - for (size_t pos = 0; ; pos += replace.length()) { - pos = s.find(search, pos); - if (pos == std::string::npos) break; 
- s.erase(pos, search.length()); - s.insert(pos, replace); + std::string result; + for (size_t pos = 0; ; pos += search.length()) { + auto new_pos = s.find(search, pos); + if (new_pos == std::string::npos) { + result += s.substr(pos, s.size() - pos); + break; + } + result += s.substr(pos, new_pos - pos) + replace; + pos = new_pos; } + s = std::move(result); } static void zeros(std::ofstream & file, size_t n) { @@ -796,12 +801,12 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default (void) tensor; } -static std::string llama_token_to_text(const struct llama_context * ctx, llama_token token) { +static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); - const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size()); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_str(ctx, token, result.data(), result.size()); + int check = llama_token_to_piece(ctx, token, result.data(), result.size()); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -961,10 +966,10 @@ struct llama_vocab { id special_eot_id = 32010; int find_bpe_rank(std::string token_left, std::string token_right) const { - replace_all(token_left, " ", "Δ "); - replace_all(token_left, "\n", "Ċ"); - replace_all(token_right, " ", "Δ "); - replace_all(token_right, "\n", "Ċ"); + replace_all(token_left, " ", "\u0120"); + replace_all(token_left, "\n", "\u010A"); + replace_all(token_right, " ", "\u0120"); + replace_all(token_right, "\n", "\u010A"); auto it = bpe_ranks.find(std::make_pair(token_left, token_right)); if (it == bpe_ranks.end()) { @@ -1150,11 +1155,13 @@ static bool llama_kv_cache_init( enum llama_fver { GGUF_FILE_VERSION_V1 = 1, + GGUF_FILE_VERSION_V2 = 2, }; static const char * llama_file_version_name(llama_fver version) { switch (version) { - case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)"; + case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)"; + case GGUF_FILE_VERSION_V2: return "GGUF V2 (latest)"; } return "unknown"; @@ -1641,7 +1648,8 @@ static void llm_load_hparams( } // TODO: This should probably be in llama.h -static std::vector llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape); +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos); +static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch); static void llm_load_vocab( llama_model_loader & ml, @@ -1743,7 +1751,11 @@ static void llm_load_vocab( } // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' - vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0]; + if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { + vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); + } else { + vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0]; + } // special tokens GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID)); @@ -2641,18 +2653,20 @@ static struct ggml_cgraph * llm_build_falcon( const size_t wsize = ggml_type_size(cur->type); - struct ggml_tensor * tmpq = ggml_view_3d( + // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for + // non-contiguous views is added for the rope operator + struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, 
n_embd_head, n_head, N, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0); + 0)); offload_func_kq(tmpq); - struct ggml_tensor * tmpk = ggml_view_3d( + struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, N, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head); + wsize * n_embd_head * n_head)); offload_func_kq(tmpk); struct ggml_tensor * tmpv = ggml_view_3d( @@ -2837,7 +2851,6 @@ static bool llama_eval_internal( GGML_ASSERT(n_tokens > 0); GGML_ASSERT(n_past >= 0); - GGML_ASSERT(n_threads > 0); // TODO: keep the values of n_batch and n_ctx // GGML_ASSERT(n_tokens <= n_batch); // GGML_ASSERT(n_past + n_tokens <= n_ctx); @@ -2848,6 +2861,8 @@ static bool llama_eval_internal( ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads); #endif + GGML_ASSERT(n_threads > 0); + const int N = n_tokens; const auto & model = lctx.model; @@ -3032,16 +3047,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) { return vocab.token_to_id.at(buf); } -static std::string llama_escape_whitespace(const std::string& text) { - std::string result = "\xe2\x96\x81"; - for (size_t offs = 0; offs < text.length(); ++offs) { - if (text[offs] == ' ') { - result += "\xe2\x96\x81"; - } else { - result += text[offs]; - } - } - return result; +static void llama_escape_whitespace(std::string & text) { + replace_all(text, " ", "\xe2\x96\x81"); } static void llama_unescape_whitespace(std::string & word) { @@ -3210,7 +3217,7 @@ private: struct llm_bigram_bpe { struct comparator { - bool operator()(llm_bigram_bpe & l, llm_bigram_bpe & r) { + bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const { return l.rank > r.rank || (l.rank == r.rank && l.left > r.left); } }; @@ -3225,7 +3232,7 @@ struct llm_bigram_bpe { }; struct llm_tokenizer_bpe { - llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; } + llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {} void tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; @@ -3358,26 +3365,23 @@ private: } // probably not 100% correct - // TODO: this is quite slow - how to make it more efficient? 
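+    // walking the matches with std::sregex_iterator avoids re-copying the remaining text on
+    // every match, which is what made the old regex_search()/suffix() loop slow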
- static std::vector bpe_gpt2_preprocess(std::string text) { + static std::vector bpe_gpt2_preprocess(const std::string & text) { std::vector words; // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; const std::regex re(pattern); - std::smatch m; - while (std::regex_search(text, m, re)) { - for (auto x : m) { - words.push_back(x); - } - text = m.suffix(); + auto words_begin = std::sregex_iterator(text.begin(), text.end(), re); + auto words_end = std::sregex_iterator(); + auto n_words = std::distance(words_begin, words_end); + words.reserve(n_words); + for (auto it = words_begin; it != words_end; ++it) { + words.push_back(it->str()); } - return words; - } - bool flag_g2ws = false; + } const llama_vocab & vocab; @@ -3387,9 +3391,18 @@ private: llm_bigram_bpe::queue work_queue; }; -static std::vector llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) { +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) { std::vector output; + // OG tokenizer behavior: + // + // tokenizer.encode('', add_bos=True) returns [1] + // tokenizer.encode('', add_bos=False) returns [] + + if (bos && vocab.special_bos_id != -1) { + output.push_back(vocab.special_bos_id); + } + if (raw_text.empty()) { return output; } @@ -3397,29 +3410,16 @@ static std::vector llama_tokenize_internal(const llama_vocab & switch (vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { + // without adding this leading whitespace, we do not get the same results as the original tokenizer + raw_text = " " + raw_text; + llm_tokenizer_spm tokenizer(vocab); - - if (bos) { - output.push_back(vocab.special_bos_id); - } - - std::string text; - if (escape) { - text = llama_escape_whitespace(raw_text); - } else { - text = raw_text; - } - - tokenizer.tokenize(text, output); + llama_escape_whitespace(raw_text); + tokenizer.tokenize(raw_text, output); } break; case LLAMA_VOCAB_TYPE_BPE: { - llm_tokenizer_bpe tokenizer(vocab, escape); - - if (bos && vocab.special_bos_id != -1) { - output.push_back(vocab.special_bos_id); - } - + llm_tokenizer_bpe tokenizer(vocab); tokenizer.tokenize(raw_text, output); } break; }; @@ -3914,7 +3914,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * // Calculate absolute value of second derivatives for (size_t i = 0; i < second_derivatives.size(); ++i) { - second_derivatives[i] = abs(second_derivatives[i]); + second_derivatives[i] = std::abs(second_derivatives[i]); } // Normalize the second derivatives @@ -4105,16 +4105,16 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c std::vector candidates_grammar; for (size_t i = 0; i < candidates->size; ++i) { - const llama_token id = candidates->data[i].id; - const std::string text = llama_token_to_text(ctx, id); + const llama_token id = candidates->data[i].id; + const std::string piece = llama_token_to_str(ctx, id); if (id == eos) { if (!allow_eos) { candidates->data[i].logit = -INFINITY; } - } else if (text.empty() || text[0] == 0) { + } else if (piece.empty() || piece[0] == 0) { candidates->data[i].logit = -INFINITY; } else { - candidates_decoded.push_back(decode_utf8(text.c_str(), grammar->partial_utf8)); + candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8)); candidates_grammar.push_back({ i, 
candidates_decoded.back().first.data(), candidates_decoded.back().second }); } } @@ -4318,10 +4318,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar GGML_ASSERT(false); } - const std::string text = llama_token_to_text(ctx, token); + const std::string piece = llama_token_to_str(ctx, token); // Note terminating 0 in decoded string - const auto decoded = decode_utf8(text.c_str(), grammar->partial_utf8); + const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8); const auto & code_points = decoded.first; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); @@ -4680,6 +4680,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::unique_ptr ml(new llama_model_loader(fname_inp, /*use_mmap*/ false)); + llama_model model; + llm_load_arch(*ml, model); + llm_load_hparams(*ml, model, 0, 0, 0); + const size_t align = GGUF_DEFAULT_ALIGNMENT; struct gguf_context * ctx_out = gguf_init_empty(); @@ -4705,6 +4709,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s ++n_feed_forward_w2; } } + if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) { + LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n", + __func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer); + } int i_attention_wv = 0; int i_feed_forward_w2 = 0; @@ -4781,8 +4789,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { int nx = tensor->ne[0]; - int ny = tensor->ne[1]; - if (nx % QK_K == 0 && ny % QK_K == 0) { + if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) { + new_type = GGML_TYPE_Q8_0; + } + else if (new_type != GGML_TYPE_Q8_0) { new_type = GGML_TYPE_Q6_K; } } else if (name.find("attn_v.weight") != std::string::npos) { @@ -4796,21 +4806,49 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; + if (model.type == MODEL_70B) { + // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is + // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with + // nearly negligible increase in model size by quantizing this tensor with more bits: + if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; + } ++i_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K + : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K + : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + new_type = model.arch == LLM_ARCH_FALCON ? 
GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { + if (model.arch == LLM_ARCH_FALCON) { + new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : + use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else { + if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + } + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) { + new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K; ++i_feed_forward_w2; } else if (name.find("attn_output.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + if (model.arch != LLM_ARCH_FALCON) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + } else { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + } + } + else if (name.find("attn_qkv.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; @@ -4825,8 +4863,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; - if (nx % QK_K != 0 || ny % QK_K != 0) { - LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); + if (nx % QK_K != 0) { + LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K); convert_incompatible_tensor = true; } } @@ -6117,8 +6155,7 @@ int llama_tokenize_with_model( llama_token * tokens, int n_max_tokens, bool add_bos) { - auto escape = llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM; - auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape); + auto res = llama_tokenize_internal(model->vocab, text, add_bos); if (n_max_tokens < (int) res.size()) { LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); @@ -6132,12 +6169,12 @@ int llama_tokenize_with_model( return res.size(); } -int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) { - return llama_token_to_str_with_model(&ctx->model, token, buf, length); +int llama_token_to_piece(const struct llama_context * ctx, llama_token token, char * buf, int length) { + return llama_token_to_piece_with_model(&ctx->model, token, buf, length); } -// does not write 
null-terminator to str -int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { +// does not write null-terminator to buf +int llama_token_to_piece_with_model(const struct llama_model * model, llama_token token, char * buf, int length) { if (0 <= token && token < llama_model_n_vocab(model)) { if (llama_is_normal_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; @@ -6225,11 +6262,40 @@ const char * llama_print_system_info(void) { s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; + s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; return s.c_str(); } +void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) { + fprintf(stream, "\n"); + fprintf(stream, "###########\n"); + fprintf(stream, "# Timings #\n"); + fprintf(stream, "###########\n"); + fprintf(stream, "\n"); + + fprintf(stream, "mst_eval: %.2f # ms / token during generation\n", + 1.0e-3 * ctx->t_eval_us / ctx->n_eval); + fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n", + 1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval); + fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n", + 1.0e-3 * ctx->t_sample_us / ctx->n_sample); + fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval); + fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval); + fprintf(stream, "n_sample: %d # number of sampled tokens\n", ctx->n_sample); + fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us); + fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us); + fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us); + fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", ctx->t_sample_us); + fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n", + 1.0e6 * ctx->n_eval / ctx->t_eval_us); + fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n", + 1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us); + fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n", + 1.0e6 * ctx->n_sample / ctx->t_sample_us); +} + // For internal test use const std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { return ctx->model.tensors_by_name; @@ -6240,10 +6306,6 @@ void llama_log_set(llama_log_callback log_callback, void * user_data) { g_state.log_callback_user_data = user_data; } -#if defined(_MSC_VER) && !defined(vsnprintf) -#define vsnprintf _vsnprintf -#endif - static void llama_log_internal_v(llama_log_level level, const char * format, va_list args) { va_list args_copy; va_copy(args_copy, args); diff --git a/llama.h b/llama.h index f15f63e0e..117daeba8 100644 --- a/llama.h +++ b/llama.h @@ -10,6 +10,7 @@ #endif // GGML_USE_CUBLAS #include #include +#include #include #ifdef LLAMA_SHARED @@ -387,15 +388,17 @@ extern "C" { int n_max_tokens, bool add_bos); - // Token Id -> String. Uses the vocabulary in the provided context - // Does not write null terminator to the buffer - LLAMA_API int llama_token_to_str( + // Token Id -> Piece. + // Uses the vocabulary in the provided context. 
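+    // A negative return value is the negative of the required buffer size, so callers can
+    // retry with a larger buffer, for example:
+    //
+    //   std::vector<char> buf(8, 0);
+    //   int n = llama_token_to_piece(ctx, token, buf.data(), buf.size());
+    //   if (n < 0) {
+    //       buf.resize(-n);
+    //       n = llama_token_to_piece(ctx, token, buf.data(), buf.size());
+    //   }
+    //   std::string piece(buf.data(), n);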
+ // Does not write null terminator to the buffer. + // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. + LLAMA_API int llama_token_to_piece( const struct llama_context * ctx, llama_token token, char * buf, int length); - LLAMA_API int llama_token_to_str_with_model( + LLAMA_API int llama_token_to_piece_with_model( const struct llama_model * model, llama_token token, char * buf, @@ -500,7 +503,7 @@ extern "C" { // Type of pointer to the beam_search_callback function. // void* callback_data is any custom data passed to llama_beam_search, that is subsequently // passed back to beam_search_callback. This avoids having to use global variables in the callback. - typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, llama_beams_state); + typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state); /// @details Deterministically returns entire sentence constructed by a beam search. /// @param ctx Pointer to the llama_context. @@ -524,6 +527,8 @@ extern "C" { // If this is not called, or NULL is supplied, everything is output on stderr. LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data); + LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); + #ifdef __cplusplus } #endif diff --git a/run_with_preset.py b/run_with_preset.py new file mode 100755 index 000000000..8f90f52a9 --- /dev/null +++ b/run_with_preset.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 + +import argparse +import os +import subprocess +import sys + +import yaml + +CLI_ARGS_MAIN_PERPLEXITY = [ + "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape", + "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag", + "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct", + "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base", + "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock", + "model", "mtest", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q", + "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt", + "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n", + "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed", + "simple-io", "tensor-split", "threads", "temp", "tfs", "top-k", "top-p", "typical", + "verbose-prompt" +] + +CLI_ARGS_LLAMA_BENCH = [ + "batch-size", "memory-f32", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers", + "n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose" +] + +CLI_ARGS_SERVER = [ + "alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base", + "low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q", + "numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split", + "threads", "verbose" +] + +description = """Run llama.cpp binaries with presets from YAML file(s). +To specify which binary should be run, specify the "binary" property (main, perplexity, llama-bench, and server are supported). +To get a preset file template, run a llama.cpp binary with the "--logdir" CLI argument. + +Formatting considerations: +- The YAML property names are the same as the CLI argument names of the corresponding binary. 
+- Properties must use the long name of their corresponding llama.cpp CLI arguments. +- Like the llama.cpp binaries the property names do not differentiate between hyphens and underscores. +- Flags must be defined as ": true" to be effective. +- To define the logit_bias property, the expected format is ": " in the "logit_bias" namespace. +- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings. +- To define a tensor split, pass a list of floats. +""" +usage = "run_with_preset.py [-h] [yaml_files ...] [-- ...]" +epilog = (" -- specify additional CLI ars to be passed to the binary (override all preset files). " + "Unknown args will be ignored.") + +parser = argparse.ArgumentParser( + description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter) +parser.add_argument("-bin", "--binary", help="The binary to run.") +parser.add_argument("yaml_files", nargs="*", + help="Arbitrary number of YAML files from which to read preset values. " + "If two files specify the same values the later one will be used.") + +known_args, unknown_args = parser.parse_known_args() + +if not known_args.yaml_files and not unknown_args: + parser.print_help() + sys.exit(0) + +props = dict() + +for yaml_file in known_args.yaml_files: + with open(yaml_file, "r") as f: + props.update(yaml.load(f, yaml.SafeLoader)) + +props = {prop.replace("_", "-"): val for prop, val in props.items()} + +binary = props.pop("binary", "main") +if known_args.binary: + binary = known_args.binary + +if os.path.exists(f"./{binary}"): + binary = f"./{binary}" + +if binary.lower().endswith("main") or binary.lower().endswith("perplexity"): + cli_args = CLI_ARGS_MAIN_PERPLEXITY +elif binary.lower().endswith("llama-bench"): + cli_args = CLI_ARGS_LLAMA_BENCH +elif binary.lower().endswith("server"): + cli_args = CLI_ARGS_SERVER +else: + print(f"Unknown binary: {binary}") + sys.exit(1) + +command_list = [binary] + +for cli_arg in cli_args: + value = props.pop(cli_arg, None) + + if not value or value == -1: + continue + + if cli_arg == "logit-bias": + for token, bias in value.items(): + command_list.append("--logit-bias") + command_list.append(f"{token}{bias:+}") + continue + + if cli_arg == "reverse-prompt" and not isinstance(value, str): + for rp in value: + command_list.append("--reverse-prompt") + command_list.append(str(rp)) + continue + + command_list.append(f"--{cli_arg}") + + if cli_arg == "tensor-split": + command_list.append(",".join([str(v) for v in value])) + continue + + value = str(value) + + if value != "True": + command_list.append(str(value)) + +num_unused = len(props) +if num_unused > 10: + print(f"The preset file contained a total of {num_unused} unused properties.") +elif num_unused > 0: + print("The preset file contained the following unused properties:") + for prop, value in props.items(): + print(f" {prop}: {value}") + +command_list += unknown_args + +sp = subprocess.Popen(command_list) + +while sp.returncode is None: + try: + sp.wait() + except KeyboardInterrupt: + pass + +sys.exit(sp.returncode) diff --git a/scripts/convert-gg.sh b/scripts/convert-gg.sh new file mode 100755 index 000000000..01fda16fd --- /dev/null +++ b/scripts/convert-gg.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +set -e + +# LLaMA v1 +python3 convert.py ../llama1/7B --outfile models/llama-7b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama1/30B --outfile 
models/llama-30b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16 + +# LLaMA v2 +python3 convert.py ../llama2/llama-2-7b --outfile models/llama-7b-v2/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16 + +# Code Llama +python3 convert.py ../codellama/CodeLlama-7b/ --outfile models/codellama-7b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16 +python3 convert.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16 + +# Falcon +python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b 1 +mv -v ../falcon/falcon-7b/ggml-model-f16.gguf models/falcon-7b/ggml-model-f16.gguf + +python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-40b 1 +mv -v ../falcon/falcon-40b/ggml-model-f16.gguf models/falcon-40b/ggml-model-f16.gguf diff --git a/scripts/qnt-all.sh b/scripts/qnt-all.sh index 1b3d07da5..b4c2a159e 100755 --- a/scripts/qnt-all.sh +++ b/scripts/qnt-all.sh @@ -20,6 +20,9 @@ fi model="$1" out="../tmp/results-${model}" +set -o pipefail +set -e + mkdir -p ${out} for q in ${qnt[@]}; do diff --git a/scripts/run-all-perf.sh b/scripts/run-all-perf.sh index 91a6d853f..6384e364d 100755 --- a/scripts/run-all-perf.sh +++ b/scripts/run-all-perf.sh @@ -20,6 +20,9 @@ fi model="$1" out="../tmp/results-${model}" +set -o pipefail +set -e + mkdir -p ${out} mstr="" diff --git a/scripts/run-all-ppl.sh b/scripts/run-all-ppl.sh index 366d0866c..e04d61d7f 100755 --- a/scripts/run-all-ppl.sh +++ b/scripts/run-all-ppl.sh @@ -17,6 +17,9 @@ if [ ! -z "$3" ]; then args="$3" fi +set -o pipefail +set -e + model="$1" out="../tmp/results-${model}" diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2afaf86b1..ca1f39d31 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -25,8 +25,10 @@ endfunction() llama_build_and_test_executable(test-quantize-fns.cpp) llama_build_and_test_executable(test-quantize-perf.cpp) llama_build_and_test_executable(test-sampling.cpp) -llama_build_executable(test-tokenizer-0.cpp) -llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) +llama_build_executable(test-tokenizer-0-llama.cpp) +llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) +llama_build_executable(test-tokenizer-0-falcon.cpp) +#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_build_executable(test-tokenizer-1.cpp) # test-tokenizer-1 requires a BPE vocab. re-enable when we have one. 
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 75a698d73..468cde66a 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -275,14 +275,14 @@ static bool check_gradient( ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - const float f0 = ggml_get_f32_1d(f, 0); + const double f0 = ggml_get_f32_1d(f, 0); ggml_set_f32_1d(x[i], k, xm); ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - const float f1 = ggml_get_f32_1d(f, 0); - const float g0 = (f0 - f1)/(2.0f*eps); + const double f1 = ggml_get_f32_1d(f, 0); + const double g0 = (f0 - f1)/(2.0*(double) eps); ggml_set_f32_1d(x[i], k, x0); @@ -292,10 +292,10 @@ static bool check_gradient( ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); - const float g1 = ggml_get_f32_1d(x[i]->grad, k); + const double g1 = ggml_get_f32_1d(x[i]->grad, k); - const float error_abs = fabsf(g0 - g1); - const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0; + const double error_abs = fabs(g0 - g1); + const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0; if (error_abs > max_error_abs || error_rel > max_error_rel) { printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n", @@ -531,7 +531,7 @@ int main(int argc, const char ** argv) { struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0])); - check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f); + check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f); } } @@ -1345,9 +1345,18 @@ int main(int argc, const char ** argv) { x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); ggml_set_param(ctx0, x[0]); - struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0])); + float eps = 1e-6f; + // dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work + // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0) + struct ggml_tensor * f = ggml_sum(ctx0, + ggml_log(ctx0, + ggml_add1(ctx0, + ggml_scale(ctx0, + ggml_soft_max(ctx0, x[0]), + ggml_new_f32(ctx0, 1.0f - eps)), + ggml_new_f32(ctx0, eps)))); - check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); + check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY); } } @@ -1358,15 +1367,26 @@ int main(int argc, const char ** argv) { int64_t ne2[4]; get_random_dims(ne2, 4); - for (int ndims = 1; ndims <= 3; ++ndims) { - x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); + for (int ndims = 1; ndims <= 4; ++ndims) { + x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f); x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f); + // the second argument to cross_entropy_loss must sum up to 1 for each row + int nr = ggml_nrows(x[1]); + int nc = ggml_nelements(x[1]) / nr; + for (int ir = 0; ir < nr; ++ir) { + float sum = 0; + for (int ic = 0; ic < nc; ++ic) { + sum += ((float *) x[1]->data)[ic + ir*nc]; + } + for (int ic = 0; ic < nc; ++ic) { + ((float *) x[1]->data)[ic + ir*nc] /= sum; + } + } ggml_set_param(ctx0, x[0]); - struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1])); + struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]); - check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY); - // finite differences regularly fails! 
+ check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY); } } @@ -1473,7 +1493,7 @@ int main(int argc, const char ** argv) { struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); - check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f); + check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); } } } @@ -1514,7 +1534,7 @@ int main(int argc, const char ** argv) { struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); - check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f); + check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); } } } diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp new file mode 100644 index 000000000..836fb8ad2 --- /dev/null +++ b/tests/test-tokenizer-0-falcon.cpp @@ -0,0 +1,178 @@ +#include "llama.h" +#include "common.h" + +#include +#include +#include +#include +#include + +// generate using test-tokenizer-0-falcon.py +static const std::map> & k_tests() { + static std::map> _k_tests = { + { "" , { }, }, + { " " , { 204, }, }, + { " " , { 258, }, }, + { " " , { 466, }, }, + { "\t" , { 192, }, }, + { "\n" , { 193, }, }, + { "\t\n" , { 19125, }, }, + { "Hello world" , { 9856, 1079, }, }, + { " Hello world" , { 23090, 1079, }, }, + { "Hello World" , { 9856, 2889, }, }, + { " Hello World" , { 23090, 2889, }, }, + { " Hello World!" , { 23090, 2889, 12, }, }, + { "Hello, world!" , { 9856, 23, 1079, 12, }, }, + { " Hello, world!" , { 23090, 23, 1079, 12, }, }, + { " this is πŸ¦™.cpp" , { 414, 304, 3346, 111, 231, 25, 29247, }, }, + { "w048 7tuijk dsdfhu" , { 98, 55866, 204, 34, 16682, 7149, 36190, 6869, 11481, }, }, + { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ" , { 150, 133, 6207, 151, 215, 150, 134, 5052, 133, 6279, 5052, 223, 151, 216, 49679, 123, 53110, 47043, 7795, }, }, + { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰" , { 38154, 206, 38154, 126, 38154, 225, 167, 237, 217, 38154, 221, 167, 237, 208, 38154, 228, 38154, 127, 38154, 237, 167, 237, 207, 38154, 237, 38154, 107, 38154, 126, 38154, 211, 38154, 207, 38154, 233, 38154, 211, 167, 237, 207, 38154, 215, }, }, + { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", { 2571, 232, 206, 204, 19, 11003, 20, 8196, 126, 283, 219, 48778, 116, 13392, 204, 19, 51831, 732, 63209, 1741, 7955, 522, 20, 22438, 211, 204, 19, 7927, 53360, 325, 504, 701, 946, 10930, 20, }, }, + { "Hello" , { 9856, }, }, + { " Hello" , { 23090, }, }, + { " Hello" , { 204, 23090, }, }, + { " Hello" , { 258, 23090, }, }, + { " Hello" , { 466, 23090, }, }, + { " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, + }; + + return _k_tests; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(false); + + // load the vocab + { + auto lparams = llama_context_default_params(); + + lparams.vocab_only = true; + + model = llama_load_model_from_file(fname.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + 
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
+            return 1;
+        }
+    }
+
+    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_BPE) {
+        fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
+        llama_free_model(model);
+        llama_free(ctx);
+        return 2;
+    }
+
+    bool success = true;
+
+    for (const auto & test_kv : k_tests()) {
+        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
+
+        printf("\n");
+        printf("src: '%s'\n", test_kv.first.c_str());
+        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
+        printf("tok: ");
+        for (const auto & tok : res) {
+            printf("%d ", tok);
+        }
+        printf("\n");
+
+        bool correct = res.size() == test_kv.second.size();
+
+        for (int i = 0; i < (int) res.size() && correct; ++i) {
+            if (test_kv.second[i] != res[i]) {
+                correct = false;
+            }
+        }
+
+        if (!correct) {
+            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
+            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
+                llama_detokenize_bpe(ctx, res).c_str(),
+                llama_detokenize_bpe(ctx, test_kv.second).c_str());
+            fprintf(stderr, "%s : expected tokens: ", __func__);
+            for (const auto & t : test_kv.second) {
+                fprintf(stderr, "%6d, ", t);
+            }
+            fprintf(stderr, "\n");
+            fprintf(stderr, "%s : got tokens: ", __func__);
+            for (const auto & t : res) {
+                fprintf(stderr, "%6d, ", t);
+            }
+            fprintf(stderr, "\n");
+
+            success = false;
+        }
+    }
+
+    if (!fname_text.empty()) {
+        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
+
+        std::string text;
+        {
+            std::ifstream ifs(fname_text);
+            if (!ifs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                return 1;
+            }
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        {
+            const std::string fname_out = fname_text + ".tokcpp";
+
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                return 1;
+            }
+
+            for (const auto & tok : res) {
+                ofs << tok << " ";
+            }
+
+            ofs << "\n";
+        }
+
+        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
+    }
+
+    llama_free_model(model);
+    llama_free(ctx);
+
+    llama_backend_free();
+
+    return success ?
0 : 3; +} diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py new file mode 100644 index 000000000..9c8c1c7d1 --- /dev/null +++ b/tests/test-tokenizer-0-falcon.py @@ -0,0 +1,83 @@ +# tests with BPE tokenizer + +import os +import sys +import argparse + +from transformers import AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize") +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer + +tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) + +tests = [ + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is πŸ¦™.cpp", + "w048 7tuijk dsdfhu", + "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", + "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", + "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + ] + +for text in tests: + print('text: ', text) + print(tokenizer.encode(text)) + print(tokenizer.decode(tokenizer.encode(text))) + +print("\n\ntests for C++:\n") +for text in tests: + res = tokenizer.encode(text) + + k = text.replace('\n', '\\n') + k = k.replace('\t', '\\t') + k = '"' + k + '"' + print("{ %-24s, { " % k, end='') + for x in res: + print("%7d," % x, end='') + print(" }, },") + +print(tokenizer.encode('hello')) +print(tokenizer.encode('world')) +print(tokenizer.encode(' world')) +print(tokenizer.encode('hello world')) + +fname_tok = args.fname_tok +if fname_tok: + print('tokenizing file: ', fname_tok) + fname_out = fname_tok + '.tok' + with open(fname_tok, 'r') as f: + lines = f.readlines() + s = ''.join(lines) + res = tokenizer.encode(s) + # write to file + with open(fname_out, 'w') as f: + for x in res: + f.write(str(x) + ' ') + f.write('\n') + print('len(res): ', len(res)) + print('len(lines): ', len(lines)) + print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp new file mode 100644 index 000000000..8630742c6 --- /dev/null +++ b/tests/test-tokenizer-0-llama.cpp @@ -0,0 +1,182 @@ +#include "llama.h" +#include "common.h" + +#include +#include +#include +#include +#include + +// generate using test-tokenizer-0-llama.py +static const std::map> & k_tests() { + static std::map> _k_tests = { + { "" , { }, }, + { " " , { 259, }, }, + { " " , { 1678, }, }, + { " " , { 268, }, }, + { "\t" , { 29871, 12, }, }, + { "\n" , { 29871, 13, }, }, + { "\t\n" , { 29871, 12, 13, }, }, + { "Hello world" , { 15043, 3186, }, }, + { " Hello world" , { 29871, 15043, 3186, }, }, + { "Hello World" , { 15043, 2787, }, }, + { " Hello World" , { 29871, 15043, 2787, }, }, + { " Hello World!" , { 29871, 15043, 2787, 29991, }, }, + { "Hello, world!" , { 15043, 29892, 3186, 29991, }, }, + { " Hello, world!" 
, { 29871, 15043, 29892, 3186, 29991, }, }, + { " this is πŸ¦™.cpp" , { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, + { "w048 7tuijk dsdfhu" , { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, + { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ" , { 1538, 4851, 665, 1386, 29713, 1305, }, }, + { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰" , { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, 136, 228, 162, 132, 228, 161, 140, }, }, + { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, + { "Hello" , { 15043, }, }, + { " Hello" , { 29871, 15043, }, }, + { " Hello" , { 259, 15043, }, }, + { " Hello" , { 1678, 15043, }, }, + { " Hello" , { 268, 15043, }, }, + { " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, }, + }; + + return _k_tests; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(false); + + // load the vocab + { + auto lparams = llama_context_default_params(); + + lparams.vocab_only = true; + + model = llama_load_model_from_file(fname.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); + return 1; + } + } + + if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_SPM) { + fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__); + llama_free_model(model); + llama_free(ctx); + return 2; + } + + bool success = true; + + for (const auto & test_kv : k_tests()) { + const std::vector res_bos = llama_tokenize(ctx, test_kv.first, true); + const std::vector res_nobos = llama_tokenize(ctx, test_kv.first, false); + + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str()); + printf("tok: "); + for (const auto & tok : res_bos) { + printf("%d ", tok); + } + printf("\n"); + + bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1; + + for (int i = 0; i < (int) res_nobos.size() && correct; ++i) { + if (test_kv.second[i] != res_bos[i + 1]) { + correct = false; + } + if (test_kv.second[i] != res_nobos[i]) { + correct = false; + } + } + + if (!correct) { + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); + fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, + llama_detokenize_spm(ctx, res_nobos).c_str(), + llama_detokenize_spm(ctx, test_kv.second).c_str()); + fprintf(stderr, "%s : expected tokens: ", __func__); + for (const auto & t : test_kv.second) { + fprintf(stderr, "%6d, ", t); + } + 
fprintf(stderr, "\n"); + fprintf(stderr, "%s : got tokens: ", __func__); + for (const auto & t : res_nobos) { + fprintf(stderr, "%6d, ", t); + } + fprintf(stderr, "\n"); + + success = false; + } + } + + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); + return 1; + } + text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); + } + + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); + + const std::vector res = llama_tokenize(ctx, text, true); + + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); + + { + const std::string fname_out = fname_text + ".tokcpp"; + + std::ofstream ofs(fname_out); + if (!ofs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return 1; + } + + for (const auto & tok : res) { + ofs << tok << " "; + } + + ofs << "\n"; + } + + fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); + } + + llama_free_model(model); + llama_free(ctx); + + llama_backend_free(); + + return success ? 0 : 3; +} diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py new file mode 100644 index 000000000..bc164ee29 --- /dev/null +++ b/tests/test-tokenizer-0-llama.py @@ -0,0 +1,95 @@ +# tests with SPM tokenizer + +import os +import sys +import argparse + +from sentencepiece import SentencePieceProcessor + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize") +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer + +tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') + +tests = [ + "", + " ", + " ", + " ", + "\t", + "\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is πŸ¦™.cpp", + "w048 7tuijk dsdfhu", + "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", + "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", + "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + ] + + +for text in tests: + print('text: ', text) + print('\nwith bos:') + print(tokenizer.encode(text, add_bos=True)) + print(tokenizer.decode(tokenizer.encode(text, add_bos=True))) + print('\nwithout bos:') + print(tokenizer.encode(text, add_bos=False)) + print(tokenizer.decode(tokenizer.encode(text, add_bos=False))) + +print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello' +print("'" + tokenizer.id_to_piece(29871) + "'") # '_' +print("'" + tokenizer.decode([15043]) + "'") # 'Hello' +print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello' +print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello' +print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello' + +print("\n\ntests for C++:\n") +for text in tests: + res = tokenizer.encode(text, add_bos=False) + + k = text.replace('\n', '\\n') + k = k.replace('\t', '\\t') + k = '"' + k + '"' + print("{ %-24s, { " % k, end='') + for x in res: + print("%7d," % x, end='') + print(" }, },") + +print(tokenizer.encode('hello')) +print(tokenizer.encode('world')) +print(tokenizer.encode(' world')) 
+print(tokenizer.encode('hello world')) + +fname_tok = args.fname_tok +if fname_tok: + print('tokenizing file: ', fname_tok) + fname_out = fname_tok + '.tok' + with open(fname_tok, 'r') as f: + lines = f.readlines() + s = ''.join(lines) + res = tokenizer.encode(s, add_bos=True) + # write to file + with open(fname_out, 'w') as f: + for x in res: + f.write(str(x) + ' ') + f.write('\n') + print('len(res): ', len(res)) + print('len(lines): ', len(lines)) + print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp deleted file mode 100644 index f3ee851a3..000000000 --- a/tests/test-tokenizer-0.cpp +++ /dev/null @@ -1,140 +0,0 @@ -#include "llama.h" -#include "common.h" - -#include -#include -#include -#include - -static std::string unescape_whitespace(llama_context* ctx, const std::vector& tokens) { - std::string result; - for (size_t i = 0; i < tokens.size(); ++i) { - result += llama_token_to_str(ctx, tokens[i]); - } - return result; -} - -static const std::map> & k_tests() { - static std::map> _k_tests = { - { " ", {1, 259, }, }, - { " ", { 1, 1678, }, }, - { " ", { 1, 268, }, }, - { "\t", { 1, 29871, 12, }, }, - { "\n", { 1, 29871, 13, }, }, - { "\t\n", { 1, 29871, 12, 13, }, }, - { "Hello world", { 1, 15043, 3186, }, }, - { " Hello world", { 1, 29871, 15043, 3186, }, }, - { "Hello World", { 1, 15043, 2787, }, }, - { " Hello World", { 1, 29871, 15043, 2787, }, }, - { " Hello World!", { 1, 29871, 15043, 2787, 29991, }, }, - { " this is πŸ¦™.cpp", { 1, 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, - { "w048 7tuijk dsdfhu", { 1, 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, - { "Π½Π΅Ρ‰ΠΎ Π½Π° Π‘ΡŠΠ»Π³Π°Ρ€ΡΠΊΠΈ", { 1, 1538, 4851, 665, 1386, 29713, 1305, }, }, - { "αž€αžΆαž“αŸ‹αžαŸ‚αž–αž·αžŸαŸαžŸαž’αžΆαž…αžαž›αž…αŸαž‰", { 1, 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, - 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, - 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, - 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, - 136, 228, 162, 132, 228, 161, 140, }, }, - { "πŸš€ (normal) πŸ˜Άβ€πŸŒ«οΈ (multiple emojis concatenated) βœ… (only emoji that has its own token)", - { 1, 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, - 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, - 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, - 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, - { "Hello", { 1, 15043 }, }, - { " Hello", { 1, 29871, 15043 }, }, - { " Hello", { 1, 259, 15043 }, }, - { " Hello", { 1, 1678, 15043 }, }, - { " Hello", { 1, 268, 15043 }, }, - { " Hello\n Hello", { 1, 268, 15043, 13, 1678, 15043 }, }, - }; - - return _k_tests; -} - -int main(int argc, char **argv) { - if (argc < 2) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return 1; - } - - const std::string fname = argv[1]; - - fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); - - llama_model * model; - llama_context * ctx; - - llama_backend_init(false); - - // load the vocab - { - auto lparams = llama_context_default_params(); - - lparams.vocab_only = true; - - model = llama_load_model_from_file(fname.c_str(), lparams); - - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - return 1; - } - - ctx = llama_new_context_with_model(model, lparams); - - if (ctx == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - llama_free_model(model); - return 1; - } 
- } - - const int n_vocab = llama_n_vocab(ctx); - - if (n_vocab != 32000) { - fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab); - llama_free_model(model); - llama_free(ctx); - return 2; - } - - bool success = true; - - for (const auto & test_kv : k_tests()) { - std::vector res = llama_tokenize(ctx, test_kv.first, true); - fprintf(stderr, "%s : '%s' tokenized to '%s'\n", - __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str()); - - bool correct = res.size() == test_kv.second.size(); - - for (int i = 0; i < (int) res.size() && correct; ++i) { - if (res[i] != test_kv.second[i]) { - correct = false; - } - } - - if (!correct) { - fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); - fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, - unescape_whitespace(ctx, res).c_str(), unescape_whitespace(ctx, test_kv.second).c_str()); - fprintf(stderr, "%s : expected tokens: ", __func__); - for (const auto & t : test_kv.second) { - fprintf(stderr, "%6d, ", t); - } - fprintf(stderr, "\n"); - fprintf(stderr, "%s : got tokens: ", __func__); - for (const auto & t : res) { - fprintf(stderr, "%6d, ", t); - } - fprintf(stderr, "\n"); - - success = false; - } - } - - llama_free_model(model); - llama_free(ctx); - - llama_backend_free(); - - return success ? 0 : 3; -} diff --git a/tests/test-tokenizer-1.cpp b/tests/test-tokenizer-1.cpp index bd607d12b..ce4f2898c 100644 --- a/tests/test-tokenizer-1.cpp +++ b/tests/test-tokenizer-1.cpp @@ -22,14 +22,6 @@ static std::string escape_whitespace(const std::string& text) { return result; } -static std::string unescape_whitespace(llama_context * ctx, const std::vector & tokens) { - std::string result; - for (size_t i = 0; i < tokens.size(); ++i) { - result += llama_token_to_str(ctx, tokens[i]); - } - return result; -} - int main(int argc, char **argv) { if (argc < 2) { fprintf(stderr, "Usage: %s \n", argv[0]); @@ -72,13 +64,13 @@ int main(int argc, char **argv) { const int n_vocab = llama_n_vocab(ctx); for (int i = 0; i < n_vocab; ++i) { - std::string forward = llama_token_to_str(ctx, i); + std::string forward = llama_token_to_piece(ctx, i); std::vector tokens = llama_tokenize(ctx, forward, false); if (tokens.size() == 1) { if (i != tokens[0]) { - std::string backward = llama_token_to_str(ctx, tokens[0]); + std::string backward = llama_token_to_piece(ctx, tokens[0]); fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n", - __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str()); + __func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str()); return 2; } }
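Example usage of the new run_with_preset.py wrapper (an illustrative sketch, not part of the patch above): the property names come from the CLI_ARGS_MAIN_PERPLEXITY list, the "binary" property selects which executable is launched, and the preset file name, model path, and parameter values are hypothetical placeholders. Any extra CLI arguments that the wrapper does not recognize are appended to the command line and forwarded to the binary, overriding the preset values.

# write a hypothetical preset file; hyphens and underscores in property names are interchangeable
cat > preset-7b.yml << 'EOF'
binary: main
model: models/llama-7b-v2/ggml-model-f16.gguf   # placeholder path, adjust to your setup
ctx-size: 2048
threads: 8
interactive: true        # flags are written as "<flag_name>: true" and become bare switches (--interactive)
reverse-prompt:          # multiple reverse prompts are passed as a list of strings
  - "User:"
EOF

# run ./main with the preset; --temp 0.7 is unknown to the wrapper and is forwarded to the binary
python3 run_with_preset.py preset-7b.yml --temp 0.7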