diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile
index 2a7da586a..61f671465 100644
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -6,7 +6,7 @@ ARG CUDA_VERSION=11.7.1
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 
-FROM ${BASE_CUDA_DEV_CONTAINER} as build
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 
 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile
index 5cbd2e7a1..680d1cb92 100644
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
 
-FROM ${BASE_ROCM_DEV_CONTAINER} as build
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
index 6f19afa9c..2a06f82b7 100644
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 
-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build
 
 RUN apt-get update && \
     apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile
index bff946cbc..8eda63a89 100644
--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
 # Target the CUDA runtime image
 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
 
-FROM ${BASE_CUDA_DEV_CONTAINER} as build
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
 
 # Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all @@ -25,7 +25,7 @@ ENV GGML_CUDA=1 RUN make -j$(nproc) llama-cli -FROM ${BASE_CUDA_RUN_CONTAINER} as runtime +FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime RUN apt-get update && \ apt-get install -y libgomp1 diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile index bd816f9f5..79dba06a7 100644 --- a/.devops/llama-cli-intel.Dockerfile +++ b/.devops/llama-cli-intel.Dockerfile @@ -1,6 +1,6 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 -FROM intel/oneapi-basekit:$ONEAPI_VERSION as build +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build ARG GGML_SYCL_F16=OFF RUN apt-get update && \ @@ -14,10 +14,12 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ echo "GGML_SYCL_F16 is set" && \ export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ - cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ + echo "Building with static libs" && \ + cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \ + ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \ cmake --build build --config Release --target llama-cli -FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime COPY --from=build /app/build/bin/llama-cli /llama-cli diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile index caa507b08..c3d1ab067 100644 --- a/.devops/llama-cli-rocm.Dockerfile +++ b/.devops/llama-cli-rocm.Dockerfile @@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6 # Target the CUDA build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete -FROM ${BASE_ROCM_DEV_CONTAINER} as build +FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/llama-cli-vulkan.Dockerfile index 6155d5881..9b0dad8bf 100644 --- a/.devops/llama-cli-vulkan.Dockerfile +++ b/.devops/llama-cli-vulkan.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=jammy -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build # Install build tools RUN apt update && apt install -y git build-essential cmake wget libgomp1 diff --git a/.devops/llama-cli.Dockerfile b/.devops/llama-cli.Dockerfile index 38382bfc9..7f741aa46 100644 --- a/.devops/llama-cli.Dockerfile +++ b/.devops/llama-cli.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=22.04 -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ apt-get install -y build-essential git @@ -11,7 +11,7 @@ COPY . . RUN make -j$(nproc) llama-cli -FROM ubuntu:$UBUNTU_VERSION as runtime +FROM ubuntu:$UBUNTU_VERSION AS runtime RUN apt-get update && \ apt-get install -y libgomp1 diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile index d7eaa0925..67328cf1c 100644 --- a/.devops/llama-server-cuda.Dockerfile +++ b/.devops/llama-server-cuda.Dockerfile @@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER # Target the CUDA runtime image ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} -FROM ${BASE_CUDA_DEV_CONTAINER} as build +FROM ${BASE_CUDA_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. 
ARG CUDA_DOCKER_ARCH=all @@ -27,7 +27,7 @@ ENV LLAMA_CURL=1 RUN make -j$(nproc) llama-server -FROM ${BASE_CUDA_RUN_CONTAINER} as runtime +FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile index 8f8fef8c0..f525658dd 100644 --- a/.devops/llama-server-intel.Dockerfile +++ b/.devops/llama-server-intel.Dockerfile @@ -1,6 +1,6 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 -FROM intel/oneapi-basekit:$ONEAPI_VERSION as build +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build ARG GGML_SYCL_F16=OFF RUN apt-get update && \ @@ -14,10 +14,11 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ echo "GGML_SYCL_F16 is set" && \ export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ + echo "Building with dynamic libs" && \ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ cmake --build build --config Release --target llama-server -FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev curl diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile index af96c3325..763b4cd3f 100644 --- a/.devops/llama-server-rocm.Dockerfile +++ b/.devops/llama-server-rocm.Dockerfile @@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6 # Target the CUDA build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete -FROM ${BASE_ROCM_DEV_CONTAINER} as build +FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile index 49062f84b..13a61ffd8 100644 --- a/.devops/llama-server-vulkan.Dockerfile +++ b/.devops/llama-server-vulkan.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=jammy -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build # Install build tools RUN apt update && apt install -y git build-essential cmake wget diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index a53a5c999..b631d5806 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=22.04 -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ apt-get install -y build-essential git libcurl4-openssl-dev curl @@ -13,7 +13,7 @@ ENV LLAMA_CURL=1 RUN make -j$(nproc) llama-server -FROM ubuntu:$UBUNTU_VERSION as runtime +FROM ubuntu:$UBUNTU_VERSION AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix index 897fce4d3..0ecf19fc5 100644 --- a/.devops/nix/apps.nix +++ b/.devops/nix/apps.nix @@ -10,7 +10,6 @@ "llama-embedding" "llama-server" "llama-quantize" - "llama-train-text-from-scratch" ]; mkApp = name: { type = "app"; diff --git a/.devops/tools.sh b/.devops/tools.sh index cf0e8f32d..24dcfd350 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then ./llama-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then ./llama-cli "$@" -elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then - ./llama-finetune "$@" elif [[ "$arg1" == '--all-in-one' || 
"$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." for i in `ls $1/$2/ggml-model-f16.bin*`; do @@ -36,8 +34,6 @@ else echo " ex: --outtype f16 \"/models/7B/\" " echo " --quantize (-q): Optimize with quantization process ggml" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" - echo " --finetune (-f): Run finetune command to create a lora finetune of the model" - echo " See documentation for finetune for command-line parameters" echo " --all-in-one (-a): Execute --convert & --quantize" echo " ex: \"/models/\" 7B" echo " --server (-s): Run a model on the server" diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1f275603a..b9246659a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -860,6 +860,7 @@ jobs: mkdir build cd build cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON + cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} - name: Determine tag name diff --git a/.gitignore b/.gitignore index 7c7dee0c6..c9b4d9983 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,7 @@ build* !docs/build.md /libllama.so /llama-* +/vulkan-shaders-gen android-ndk-* arm_neon.h cmake-build-* diff --git a/CMakeLists.txt b/CMakeLists.txt index 1afc63c63..856be961d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,7 @@ llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) +llama_option_depr(WARNING LLAMA_CANN GGML_CANN) llama_option_depr(WARNING LLAMA_QNN GGML_QNN) # diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9ad516049..b688f78ec 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,13 +1,17 @@ -# Pull requests +# Pull requests (for contributors) -- Always squash-merge the PR before merging -- Use the following format for your final commit: ` : (#)`. For example: `utils : fix typo in utils.py (#1234)` - Test your changes: - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library - Execute [the full CI locally on your machine](ci/README.md) before publishing -- If the pull request contains only documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs. - - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience + - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience +- If your PR becomes stale, don't hesitate to ping the maintainers in the comments + +# Pull requests (for collaborators) + +- Squash-merge PRs +- Use the following format for the squashed commit title: ` : (#)`. 
For example: `utils : fix typo in utils.py (#1234)` +- Optionally, pick a `` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules # Coding guidelines diff --git a/Makefile b/Makefile index 4584594af..c82f4268a 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,6 @@ BUILD_TARGETS = \ llama-embedding \ llama-eval-callback \ llama-export-lora \ - llama-finetune \ llama-gbnf-validator \ llama-gguf \ llama-gguf-hash \ @@ -37,7 +36,6 @@ BUILD_TARGETS = \ llama-simple \ llama-speculative \ llama-tokenize \ - llama-train-text-from-scratch \ llama-vdot \ llama-cvector-generator \ tests/test-c.o @@ -64,13 +62,13 @@ TEST_TARGETS = \ tests/test-tokenizer-1-spm # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned -LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ +LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ - retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm + retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them. # We don't want to clutter things too much, so we only build replacements for the most commonly used binaries. -LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune +LEGACY_TARGETS_BUILD = main quantize perplexity embedding server # Deprecation aliases ifdef LLAMA_CUBLAS @@ -327,9 +325,9 @@ ifdef LLAMA_DEBUG endif else MK_CPPFLAGS += -DNDEBUG - MK_CFLAGS += -O3 - MK_CXXFLAGS += -O3 - MK_NVCCFLAGS += -O3 + MK_CFLAGS += -O3 -g + MK_CXXFLAGS += -O3 -g + MK_NVCCFLAGS += -O3 -g endif ifdef LLAMA_SANITIZE_THREAD @@ -530,10 +528,21 @@ ifndef GGML_NO_ACCELERATE endif endif # GGML_NO_ACCELERATE +ifdef GGML_MUSA + CC := clang + CXX := clang++ + GGML_CUDA := 1 + MK_CPPFLAGS += -DGGML_USE_MUSA +endif + ifndef GGML_NO_OPENMP MK_CPPFLAGS += -DGGML_USE_OPENMP MK_CFLAGS += -fopenmp MK_CXXFLAGS += -fopenmp + ifdef GGML_MUSA + MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp + MK_LDFLAGS += -L/usr/lib/llvm-10/lib + endif # GGML_MUSA endif # GGML_NO_OPENMP ifdef GGML_OPENBLAS @@ -584,15 +593,27 @@ else endif # GGML_CUDA_FA_ALL_QUANTS ifdef GGML_CUDA - ifneq ('', '$(wildcard /opt/cuda)') - CUDA_PATH ?= /opt/cuda - else - CUDA_PATH ?= /usr/local/cuda - endif + ifdef GGML_MUSA + ifneq ('', '$(wildcard /opt/musa)') + CUDA_PATH ?= /opt/musa + else + CUDA_PATH ?= /usr/local/musa + endif - MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS - MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib - MK_NVCCFLAGS += -use_fast_math + MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include + MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64 + MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22 + else + ifneq ('', '$(wildcard /opt/cuda)') + CUDA_PATH ?= /opt/cuda + else + CUDA_PATH ?= /usr/local/cuda + endif + + 
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS + MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib + MK_NVCCFLAGS += -use_fast_math + endif # GGML_MUSA OBJ_GGML += ggml/src/ggml-cuda.o OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) @@ -602,9 +623,11 @@ ifdef LLAMA_FATAL_WARNINGS MK_NVCCFLAGS += -Werror all-warnings endif # LLAMA_FATAL_WARNINGS +ifndef GGML_MUSA ifndef JETSON_EOL_MODULE_DETECT MK_NVCCFLAGS += --forward-unknown-to-host-compiler endif # JETSON_EOL_MODULE_DETECT +endif # GGML_MUSA ifdef LLAMA_DEBUG MK_NVCCFLAGS += -lineinfo @@ -617,8 +640,12 @@ endif # GGML_CUDA_DEBUG ifdef GGML_CUDA_NVCC NVCC = $(CCACHE) $(GGML_CUDA_NVCC) else - NVCC = $(CCACHE) nvcc -endif #GGML_CUDA_NVCC + ifdef GGML_MUSA + NVCC = $(CCACHE) mcc + else + NVCC = $(CCACHE) nvcc + endif # GGML_MUSA +endif # GGML_CUDA_NVCC ifdef CUDA_DOCKER_ARCH MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) @@ -689,9 +716,15 @@ define NVCC_COMPILE $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ endef # NVCC_COMPILE else + ifdef GGML_MUSA +define NVCC_COMPILE + $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@ +endef # NVCC_COMPILE + else define NVCC_COMPILE $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ endef # NVCC_COMPILE + endif # GGML_MUSA endif # JETSON_EOL_MODULE_DETECT ggml/src/ggml-cuda/%.o: \ @@ -876,6 +909,9 @@ OBJ_GGML += \ OBJ_LLAMA = \ src/llama.o \ + src/llama-vocab.o \ + src/llama-grammar.o \ + src/llama-sampling.o \ src/unicode.o \ src/unicode-data.o @@ -943,6 +979,7 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1)) ifdef GGML_CUDA $(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') +ifndef GGML_MUSA ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) ifndef CUDA_DOCKER_ARCH @@ -952,6 +989,7 @@ endif # CUDA_POWER_ARCH endif # CUDA_DOCKER_ARCH endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) +endif # GGML_MUSA endif # GGML_CUDA $(info ) @@ -1055,6 +1093,10 @@ src/unicode-data.o: \ src/llama.o: \ src/llama.cpp \ + src/llama-impl.h \ + src/llama-vocab.h \ + src/llama-grammar.h \ + src/llama-sampling.h \ src/unicode.h \ include/llama.h \ ggml/include/ggml-cuda.h \ @@ -1064,6 +1106,29 @@ src/llama.o: \ ggml/include/ggml-backend.h $(CXX) $(CXXFLAGS) -c $< -o $@ +src/llama-vocab.o: \ + src/llama-vocab.cpp \ + src/llama-vocab.h \ + src/llama-impl.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +src/llama-grammar.o: \ + src/llama-grammar.cpp \ + src/llama-grammar.h \ + src/llama-impl.h \ + src/llama-vocab.h \ + src/llama-sampling.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +src/llama-sampling.o: \ + src/llama-sampling.cpp \ + src/llama-sampling.h \ + src/llama-impl.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + $(LIB_LLAMA): \ $(OBJ_LLAMA) \ $(LIB_GGML) @@ -1266,11 +1331,6 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) 
-o $@ $(LDFLAGS) -llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ $(OBJ_GGML) $(OBJ_LLAMA) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) @@ -1286,13 +1346,8 @@ llama-baby-llama: examples/baby-llama/baby-llama.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-finetune: examples/finetune/finetune.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - llama-export-lora: examples/export-lora/export-lora.cpp \ - $(OBJ_GGML) common/log.h + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1439,7 +1494,7 @@ run-benchmark-matmult: llama-benchmark-matmult .PHONY: run-benchmark-matmult swift tests/test-llama-grammar: tests/test-llama-grammar.cpp \ - $(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1548,7 +1603,7 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed. # # Mark legacy binary targets as .PHONY so that they are always checked. -.PHONY: main quantize perplexity embedding server finetune +.PHONY: main quantize perplexity embedding server # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate. # Eventually we will want to remove these target from building all the time. @@ -1591,13 +1646,3 @@ ifneq (,$(wildcard embedding)) @echo " Remove the 'embedding' binary to remove this warning." @echo "#########" endif - -finetune: examples/deprecation-warning/deprecation-warning.cpp -ifneq (,$(wildcard finetune)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - @echo "#########" - @echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead." - @echo " Remove the 'finetune' binary to remove this warning." 
- @echo "#########" -endif diff --git a/Package.swift b/Package.swift index d40a48385..1d90b47bf 100644 --- a/Package.swift +++ b/Package.swift @@ -4,6 +4,9 @@ import PackageDescription var sources = [ "src/llama.cpp", + "src/llama-vocab.cpp", + "src/llama-grammar.cpp", + "src/llama-sampling.cpp", "src/unicode.cpp", "src/unicode-data.cpp", "ggml/src/ggml.c", diff --git a/README.md b/README.md index 058919068..775ce2c88 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) -[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) +[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) [![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) @@ -138,6 +138,7 @@ Typically finetunes of the base models below are supported as well. Unless otherwise noted these projects are open-source with permissive licensing: +- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) - [iohub/collama](https://github.com/iohub/coLLaMA) - [janhq/jan](https://github.com/janhq/jan) (AGPL) - [nat/openplayground](https://github.com/nat/openplayground) @@ -181,6 +182,9 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp +**Games:** +- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. + ## Demo
@@ -405,6 +409,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md) | [BLAS](./docs/build.md#blas-build) | All | | [BLIS](./docs/backend/BLIS.md) | All | | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU | +| [MUSA](./docs/build.md#musa) | Moore Threads GPU | | [CUDA](./docs/build.md#cuda) | Nvidia GPU | | [hipBLAS](./docs/build.md#hipblas) | AMD GPU | | [Vulkan](./docs/build.md#vulkan) | GPU | diff --git a/common/common.cpp b/common/common.cpp index dbb724fbb..60c7eac75 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -694,11 +694,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); return true; } - if (arg == "--lora-base") { - CHECK_ARG - params.lora_base = argv[i]; - return true; - } if (arg == "--control-vector") { CHECK_ARG params.control_vectors.push_back({ 1.0f, argv[i], }); @@ -1274,6 +1269,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa CHECK_ARG params.out_file = argv[i]; params.cvector_outfile = argv[i]; + params.lora_outfile = argv[i]; return true; } if (arg == "-ofreq" || arg == "--output-frequency") { @@ -1328,6 +1324,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa else { invalid_param = true; } return true; } + if (arg == "--no-warmup") { + params.warmup = false; + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1450,6 +1450,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); + options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" }); options.push_back({ "server infill", " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" }); @@ -1583,9 +1584,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); - options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" }); - options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" }); - options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" }); + options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" }); + options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); options.push_back({ "*", " --control-vector FNAME", "add a control vector\n" "note: this argument can be repeated to add multiple control vectors" }); options.push_back({ "*", " --control-vector-scaled FNAME SCALE", @@ -1676,6 +1676,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" }); + options.push_back({ "export-lora" }); + options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() }); + options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" }); + options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); + options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads }); + options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() }); + printf("usage: %s [options]\n", argv[0]); for (const auto & o : options) { @@ -2721,7 +2728,7 @@ std::string llama_chat_format_single(const struct llama_model * model, const llama_chat_msg & new_msg, bool add_ass) { std::ostringstream ss; - auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false); + auto fmt_past_msg = past_msg.empty() ? 
"" : llama_chat_apply_template(model, tmpl, past_msg, false); std::vector chat_new(past_msg); // if the past_msg ends with a newline, we must preserve it in the formatted version if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { @@ -3166,7 +3173,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l } fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); } - fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); diff --git a/common/common.h b/common/common.h index 184a53dc0..8240ff99b 100644 --- a/common/common.h +++ b/common/common.h @@ -128,7 +128,6 @@ struct gpt_params { // TODO: avoid tuple, use struct std::vector> lora_adapter; // lora adapter path with user defined scale - std::string lora_base = ""; // base model path for the lora adapter std::vector control_vectors; // control vector with user defined scale @@ -255,6 +254,8 @@ struct gpt_params { std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; bool spm_infill = false; // suffix/prefix/middle pattern for infill + + std::string lora_outfile = "ggml-lora-merged-f16.gguf"; }; void gpt_params_handle_hf_token(gpt_params & params); diff --git a/common/ngram-cache.h b/common/ngram-cache.h index e4fa4cbd1..ab4c9b376 100644 --- a/common/ngram-cache.h +++ b/common/ngram-cache.h @@ -37,11 +37,18 @@ struct llama_ngram { } }; +struct llama_token_hash_function { + size_t operator()(const llama_token token) const { + // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ + return token * 11400714819323198485llu; + } +}; + struct llama_ngram_hash_function { size_t operator()(const llama_ngram & ngram) const { - size_t hash = 0; - for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { - hash ^= std::hash{}(ngram.tokens[i]); + size_t hash = llama_token_hash_function{}(ngram.tokens[0]); + for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) { + hash ^= llama_token_hash_function{}(ngram.tokens[i]); } return hash; } diff --git a/common/sampling.cpp b/common/sampling.cpp index 6a483c815..079e40516 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -330,7 +330,7 @@ static llama_token llama_sampling_sample_impl( llama_token_data_array single_token_data_array = { &single_token_data, 1, false }; // Apply grammar constraints to the single token - llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar); + llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array); // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY bool is_valid = single_token_data_array.data[0].logit != -INFINITY; @@ -421,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl( // apply grammar checks before sampling logic if (apply_grammar && ctx_sampling->grammar != NULL) { - llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); + llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p); } return cur_p; @@ -455,6 +455,6 @@ void llama_sampling_accept( ctx_sampling->prev.push_back(id); if (ctx_sampling->grammar != NULL && apply_grammar) { - llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id); + llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id); 
} } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c2aba9097..8ba3c5844 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -48,34 +48,39 @@ class Model: dir_model: Path ftype: gguf.LlamaFileType + fname_out: Path is_big_endian: bool endianess: gguf.GGUFEndian use_temp_file: bool lazy: bool - model_name: str | None part_names: list[str] is_safetensors: bool hparams: dict[str, Any] block_count: int tensor_map: gguf.TensorNameMap tensor_names: set[str] | None - fname_out: Path gguf_writer: gguf.GGUFWriter + model_name: str | None + metadata_override: Path | None + dir_model_card: Path # subclasses should define this! model_arch: gguf.MODEL_ARCH - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, - model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, + use_temp_file: bool = False, eager: bool = False, + metadata_override: Path | None = None, model_name: str | None = None, + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") + self.dir_model = dir_model self.ftype = ftype + self.fname_out = fname_out self.is_big_endian = is_big_endian self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.use_temp_file = use_temp_file self.lazy = not eager - self.model_name = model_name self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors") self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: @@ -84,6 +89,11 @@ class Model: self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None + self.metadata_override = metadata_override + self.model_name = model_name + self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + + # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. 
_, first_tensor = next(self.get_tensors()) @@ -93,10 +103,8 @@ class Model: else: logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") self.ftype = gguf.LlamaFileType.MOSTLY_BF16 - ftype_up: str = self.ftype.name.partition("_")[2].upper() - ftype_lw: str = ftype_up.lower() - # allow templating the file name with the output ftype, useful with the "auto" ftype - self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) + + # Configure GGUF Writer self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) @@ -193,7 +201,6 @@ class Model: return new_name def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.block_count) if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: @@ -232,6 +239,10 @@ class Model: self.gguf_writer.add_expert_used_count(n_experts_used) logger.info(f"gguf: experts used count = {n_experts_used}") + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + self.gguf_writer.add_file_type(self.ftype) logger.info(f"gguf: file type = {self.ftype}") @@ -250,7 +261,7 @@ class Model: return False - def write_tensors(self): + def prepare_tensors(self): max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") for name, data_torch in self.get_tensors(): @@ -333,9 +344,62 @@ class Model: self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.MODEL) + + def prepare_metadata(self, vocab_only: bool): + + total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count() + + self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params) + + # Fallback to model directory name if metadata name is still missing + if self.metadata.name is None: + self.metadata.name = self.dir_model.name + + # Generate parameter weight class (useful for leader boards) if not yet determined + if self.metadata.size_label is None and total_params > 0: + self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) + + # Extract the encoding scheme from the file type name. e.g. 
'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + output_type: str = self.ftype.name.partition("_")[2] + + # Filename Output + if self.fname_out.is_dir(): + # Generate default filename based on model specification and available metadata + if not vocab_only: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) + else: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") + + # Use the default filename + self.fname_out = self.fname_out / f"{fname_default}.gguf" + else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + + # Process templated file name with the output ftype, useful with the "auto" ftype + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) + + self.set_type() + + logger.info("Set meta model") + self.metadata.set_gguf_meta_model(self.gguf_writer) + + logger.info("Set model parameters") + self.set_gguf_parameters() + + logger.info("Set model tokenizer") + self.set_vocab() + + logger.info("Set model quantization version") + self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + def write(self): - self.write_tensors() - self.gguf_writer.write_header_to_file(self.fname_out) + self.prepare_tensors() + self.prepare_metadata(vocab_only=False) + self.gguf_writer.write_header_to_file(path=self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.close() @@ -343,7 +407,9 @@ class Model: def write_vocab(self): if len(self.gguf_writer.tensors) != 1: raise ValueError('Splitting the vocabulary is not supported') - self.gguf_writer.write_header_to_file(self.fname_out) + + self.prepare_metadata(vocab_only=True) + self.gguf_writer.write_header_to_file(path=self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.close() @@ -528,6 +594,15 @@ class Model: if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": # ref: https://huggingface.co/core42/jais-13b res = "jais" + if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f": + # ref: https://huggingface.co/WisdomShell/CodeShell-7B + res = "codeshell" + if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e": + # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407 + res = "tekken" + if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": + # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M + res = "smollm" if res is None: logger.warning("\n") @@ -668,7 +743,7 @@ class Model: added_tokens_json = json.load(f) for key in added_tokens_json: token_id = added_tokens_json[key] - if (token_id >= vocab_size): + if token_id >= vocab_size: logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue @@ -685,7 +760,8 @@ class Model: token_id = int(token_id) token: str = token_data["content"] if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token.encode("utf-8") + if tokens[token_id] != token.encode("utf-8"): + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} 
-> {token!r}') if token_data.get("special") or self.does_token_look_special(token): toktypes[token_id] = SentencePieceTokenTypes.CONTROL else: @@ -780,7 +856,6 @@ class GPTNeoXModel(Model): def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -836,7 +911,6 @@ class BloomModel(Model): model_arch = gguf.MODEL_ARCH.BLOOM def set_gguf_parameters(self): - self.gguf_writer.add_name("Bloom") n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) @@ -913,7 +987,6 @@ class MPTModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) self.gguf_writer.add_embedding_length(self.hparams["d_model"]) self.gguf_writer.add_block_count(block_count) @@ -952,7 +1025,6 @@ class OrionModel(Model): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") ctx_length = 0 if "max_sequence_length" in self.hparams: @@ -965,8 +1037,6 @@ class OrionModel(Model): raise ValueError("gguf: can not find ctx length parameter.") self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -990,7 +1060,6 @@ class BaichuanModel(Model): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") ctx_length = 0 if "max_sequence_length" in self.hparams: @@ -1002,8 +1071,6 @@ class BaichuanModel(Model): else: raise ValueError("gguf: can not find ctx length parameter.") - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1117,7 +1184,6 @@ class XverseModel(Model): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") ctx_length = 0 if "max_sequence_length" in self.hparams: @@ -1129,8 +1195,6 @@ class XverseModel(Model): else: raise ValueError("gguf: can not find ctx length parameter.") - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") 
self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1189,7 +1253,6 @@ class FalconModel(Model): if n_head_kv is None: n_head_kv = self.hparams.get("n_head_kv", 1) # old name - self.gguf_writer.add_name("Falcon") self.gguf_writer.add_context_length(2048) # not in config.json self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1234,7 +1297,6 @@ class StarCoderModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layer"] - self.gguf_writer.add_name("StarCoder") self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) @@ -1258,6 +1320,7 @@ class RefactModel(Model): special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("middle", 2) + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -1269,7 +1332,6 @@ class RefactModel(Model): block_count = self.hparams["n_layer"] - self.gguf_writer.add_name("Refact") # refact uses Alibi. So this is from config.json which might be used by training. self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -1324,7 +1386,6 @@ class StableLMModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -1386,8 +1447,8 @@ class StableLMModel(Model): return [(new_name, data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._q_norms is not None or self._k_norms is not None: # flatten two `list[dict[str, Tensor]]` into a single `list[str]` @@ -1430,7 +1491,12 @@ class LlamaModel(Model): super().set_gguf_parameters() hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "linear": @@ -1503,8 +1569,36 @@ class LlamaModel(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): + if rope_scaling.get("rope_type", '').lower() == "llama3": + base = self.hparams.get("rope_theta", 10000.0) + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_scaling.get("factor", 8.0) + low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + old_context_len = 
self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) + + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -1567,7 +1661,6 @@ class GrokModel(Model): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_name("Grok") _experts: list[dict[str, Tensor]] | None = None @@ -1616,7 +1709,6 @@ class DbrxModel(Model): def set_gguf_parameters(self): ffn_config = self.hparams["ffn_config"] attn_config = self.hparams["attn_config"] - self.gguf_writer.add_name(self.hparams["model_type"]) self.gguf_writer.add_block_count(self.hparams["n_layers"]) self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) @@ -1685,7 +1777,6 @@ class MiniCPMModel(Model): def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_name("MiniCPM") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -1755,7 +1846,6 @@ class QwenModel(Model): self._set_vocab_qwen() def set_gguf_parameters(self): - self.gguf_writer.add_name("Qwen") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1831,8 +1921,8 @@ class Qwen2MoeModel(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -1846,7 +1936,6 @@ class GPT2Model(Model): model_arch = gguf.MODEL_ARCH.GPT2 def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_context_length(self.hparams["n_ctx"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -1889,7 +1978,6 @@ class Phi2Model(Model): n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_name("Phi2") self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) self.gguf_writer.add_embedding_length(n_embd) @@ -1951,7 +2039,7 @@ class Phi3MiniModel(Model): for key in added_tokens_json: token_id = added_tokens_json[key] - if (token_id >= vocab_size): + if token_id >= vocab_size: logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue @@ -1968,7 +2056,8 @@ class Phi3MiniModel(Model): token_id = int(token_id) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == 
token + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -1984,7 +2073,8 @@ class Phi3MiniModel(Model): token_id = int(foken_data["id"]) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2011,7 +2101,6 @@ class Phi3MiniModel(Model): orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) rope_dims = n_embd // n_head - self.gguf_writer.add_name("Phi3") self.gguf_writer.add_context_length(max_pos_embds) self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) self.gguf_writer.add_embedding_length(n_embd) @@ -2023,10 +2112,11 @@ class Phi3MiniModel(Model): self.gguf_writer.add_rope_dimension_count(rope_dims) self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"])) # write rope scaling for long context (128k) model rope_scaling = self.find_hparam(['rope_scaling'], True) - if (rope_scaling is None): + if rope_scaling is None: return scale = max_pos_embds / orig_max_pos_embds @@ -2068,7 +2158,6 @@ class PlamoModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name("PLaMo") self.gguf_writer.add_context_length(4096) # not in config.json self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) @@ -2113,7 +2202,6 @@ class CodeShellModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layer"] - self.gguf_writer.add_name("CodeShell") self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) @@ -2226,7 +2314,8 @@ class InternLM2Model(Model): chat_eos_token_id = token_id token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert(tokens[token_id] == token) + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2245,7 +2334,8 @@ class InternLM2Model(Model): chat_eos_token_id = token_id token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert(tokens[token_id] == token) + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2272,7 +2362,6 @@ class InternLM2Model(Model): special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): - self.gguf_writer.add_name("InternLM2") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) 
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -2432,6 +2521,7 @@ class GemmaModel(Model): special_vocab._set_special_token("middle", 68) special_vocab._set_special_token("fsep", 70) special_vocab._set_special_token("eot", 107) + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) self.gguf_writer.add_add_space_prefix(False) @@ -2440,7 +2530,6 @@ class GemmaModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -2481,7 +2570,6 @@ class Gemma2Model(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -2556,7 +2644,6 @@ class MambaModel(Model): # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading @@ -2676,7 +2763,7 @@ class JinaBertV2Model(BertModel): yield name, data - def set_vocab(self, *args, **kwargs): + def set_vocab(self): tokenizer_class = 'BertTokenizer' with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: tokenizer_class = json.load(f)['tokenizer_class'] @@ -2735,7 +2822,6 @@ class OpenELMModel(Model): assert self.block_count == len(self._num_query_heads) assert self.block_count == len(self._ffn_dims) - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_context_length(self.hparams["max_context_length"]) self.gguf_writer.add_embedding_length(n_embd) @@ -2825,7 +2911,7 @@ class ArcticModel(Model): added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] for token_id, token_json in added_tokens_decoder.items(): token_id = int(token_id) - if (token_id >= vocab_size): + if token_id >= vocab_size: logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue @@ -2909,8 +2995,8 @@ class ArcticModel(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -2988,8 +3074,8 @@ class DeepseekV2Model(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -3074,7 +3160,7 @@ class T5Model(Model): added_tokens_json = json.load(f) for key in added_tokens_json: token_id = added_tokens_json[key] - if (token_id >= vocab_size): + if token_id >= vocab_size: logger.warning(f'ignore token {token_id}: id 
is out of range, max={vocab_size - 1}') continue @@ -3107,7 +3193,6 @@ class T5Model(Model): self.gguf_writer.add_add_eos_token(True) def set_gguf_parameters(self): - self.gguf_writer.add_name("T5") if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: logger.warning("Couldn't find context length in config.json, assuming default value of 512") n_ctx = 512 @@ -3181,7 +3266,6 @@ class JaisModel(Model): self._set_vocab_gpt2() def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -3227,8 +3311,8 @@ class JaisModel(Model): return tensors - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) @@ -3387,7 +3471,6 @@ class ChatGLMModel(Model): special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): - self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_head_kv = self.hparams.get("multi_query_group_num", n_head) @@ -3539,6 +3622,10 @@ def parse_args() -> argparse.Namespace: "--no-tensor-first-split", action="store_true", help="do not add tensors to the first split (disabled by default)" ) + parser.add_argument( + "--metadata", type=Path, + help="Specify the path for an authorship metadata override file" + ) return parser.parse_args() @@ -3564,7 +3651,10 @@ def split_str_to_n_bytes(split_str: str) -> int: def main() -> None: args = parse_args() - logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) dir_model = args.model @@ -3588,34 +3678,30 @@ def main() -> None: if args.outfile is not None: fname_out = args.outfile else: - # output in the same directory as the model by default - fname_out = dir_model / 'ggml-model-{ftype}.gguf' + fname_out = dir_model logger.info(f"Loading model: {dir_model.name}") hparams = Model.load_hparams(dir_model) with torch.inference_mode(): + output_type = ftype_map[args.outtype] + model_architecture = hparams["architectures"][0] + try: - model_class = Model.from_model_architecture(hparams["architectures"][0]) + model_class = Model.from_model_architecture(model_architecture) except NotImplementedError: - logger.error(f"Model {hparams['architectures'][0]} is not supported") + logger.error(f"Model {model_architecture} is not supported") sys.exit(1) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, - args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors, + model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out, + is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, + eager=args.no_lazy, + metadata_override=args.metadata, model_name=args.model_name, + split_max_tensors=args.split_max_tensors, split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split) - logger.info("Set model parameters") - model_instance.gguf_writer.add_type(gguf.GGUFType.MODEL) - model_instance.set_gguf_parameters() - - 
logger.info("Set model tokenizer") - model_instance.set_vocab() - - model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - if args.vocab_only: logger.info("Exporting model vocab...") model_instance.write_vocab() diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index e4165ae2d..d5a2d925e 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum): # TODO: this string has to exercise as much pre-tokenizer functionality as possible # will be updated with time - contributions welcome -chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' +CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' if len(sys.argv) == 2: token = sys.argv[1] @@ -91,6 +91,9 @@ models = [ {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", }, + {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", }, + {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", }, + {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", }, ] @@ -99,8 +102,8 @@ def download_file_with_auth(url, token, save_path): response = sess.get(url, headers=headers) response.raise_for_status() os.makedirs(os.path.dirname(save_path), exist_ok=True) - with open(save_path, 'wb') as f: - f.write(response.content) + with open(save_path, 'wb') as downloaded_file: + downloaded_file.write(response.content) logger.info(f"File {save_path} downloaded successfully") @@ -159,7 +162,7 @@ for model in models: logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. 
Error: {e}") continue # Skip to the next model if the tokenizer can't be loaded - chktok = tokenizer.encode(chktxt) + chktok = tokenizer.encode(CHK_TXT) chkhsh = sha256(str(chktok).encode()).hexdigest() logger.info(f"model: {name}") @@ -191,7 +194,7 @@ src_func = f""" # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can # use in llama.cpp to implement the same pre-tokenizer - chktxt = {repr(chktxt)} + chktxt = {repr(CHK_TXT)} chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() @@ -287,7 +290,7 @@ tests = [ "333333333", "Cửa Việt", # llama-bpe fails on this " discards", - chktxt, + CHK_TXT, ] # write the tests to ./models/ggml-vocab-{name}.gguf.inp diff --git a/convert_llama_ggml_to_gguf.py b/convert_llama_ggml_to_gguf.py index 95ea831a5..7b00b4398 100755 --- a/convert_llama_ggml_to_gguf.py +++ b/convert_llama_ggml_to_gguf.py @@ -132,6 +132,10 @@ class Tensor: class GGMLModel: + + file_format: GGMLFormat + format_version: int + def __init__(self): self.hyperparameters = None self.vocab = None @@ -290,7 +294,7 @@ class GGMLToGGUF: if self.vocab_override is not None: vo = self.vocab_override logger.info('* Adding vocab item(s)') - for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): + for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): tokens.append(vbytes) scores.append(score) toktypes.append(ttype) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 4bb939d45..a88d0d4a9 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -251,6 +251,10 @@ def parse_args() -> argparse.Namespace: "--verbose", action="store_true", help="increase output verbosity", ) + parser.add_argument( + "--dry-run", action="store_true", + help="only print out what will be done, without writing any new files", + ) parser.add_argument( "--base", type=Path, required=True, help="directory containing base model file", @@ -286,7 +290,7 @@ if __name__ == '__main__': fname_out = args.outfile else: # output in the same directory as the model by default - fname_out = dir_lora / 'ggml-lora-{ftype}.gguf' + fname_out = dir_lora if os.path.exists(input_model): # lazy import load_file only if lora is in safetensors format. 
@@ -310,6 +314,23 @@ if __name__ == '__main__': class LoraModel(model_class): model_arch = model_class.model_arch + lora_alpha: float + + def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs): + + super().__init__(*args, **kwargs) + + self.dir_model_card = dir_lora_model + self.lora_alpha = float(lora_alpha) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.ADAPTER) + self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") + + def set_gguf_parameters(self): + self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) + super().set_gguf_parameters() + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_map: dict[str, PartialLoraTensor] = {} @@ -350,6 +371,11 @@ if __name__ == '__main__': yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_b", lora_b) + with open(lora_config, "r") as f: + lparams: dict[str, Any] = json.load(f) + + alpha: float = lparams["lora_alpha"] + model_instance = LoraModel( dir_base_model, ftype, @@ -357,18 +383,11 @@ if __name__ == '__main__': is_big_endian=args.bigendian, use_temp_file=False, eager=args.no_lazy, - model_name=None, + dry_run=args.dry_run, + dir_lora_model=dir_lora, + lora_alpha=alpha, ) - with open(lora_config, "r") as f: - lparams: dict[str, Any] = json.load(f) - - alpha = lparams["lora_alpha"] - - model_instance.gguf_writer.add_string(gguf.Keys.General.TYPE, gguf.GGUFType.ADAPTER) - model_instance.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") - model_instance.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, float(alpha)) - model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) logger.info("Exporting model...") model_instance.write() logger.info(f"Model successfully exported to {model_instance.fname_out}") diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 885983e92..d36ac0a15 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow ```sh ./build/bin/llama-ls-sycl-device ``` -A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following: +This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: ``` -found 6 SYCL devices: +found 2 SYCL devices: + | | | |Compute |Max compute|Max work|Max sub| | |ID| Device Type| Name|capability|units |group |group |Global mem size| |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| -| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| -| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| -| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| -| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` -| Attribute | Note | -|------------------------|-------------------------------------------------------------| -| compute capability 1.3 | Level-zero driver/runtime, recommended | -| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases | 4. 
Launch inference There are two device selection modes: - Single device: Use one device target specified by the user. -- Multiple devices: Automatically select the devices with the same largest Max compute-units. +- Multiple devices: Automatically choose the devices with the same backend. + +In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR. | Device selection | Parameter | |------------------|----------------------------------------| @@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow build\bin\ls-sycl-device.exe ``` -The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following: +This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: ``` -found 6 SYCL devices: +found 2 SYCL devices: | | | |Compute |Max compute|Max work|Max sub| | |ID| Device Type| Name|capability|units |group |group |Global mem size| |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| -| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| -| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| -| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| -| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` -| Attribute | Note | -|------------------------|-----------------------------------------------------------| -| compute capability 1.3 | Level-zero running time, recommended | -| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases | - 4. Launch inference There are two device selection modes: -- Single device: Use one device assigned by user. -- Multiple devices: Automatically choose the devices with the same biggest Max compute units. +- Single device: Use one device assigned by user. Default device id is 0. +- Multiple devices: Automatically choose the devices with the same backend. + +In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR. | Device selection | Parameter | |------------------|----------------------------------------| diff --git a/docs/build.md b/docs/build.md index 916fcf22d..cfe42ebbf 100644 --- a/docs/build.md +++ b/docs/build.md @@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options. make ``` - - On Windows: + - On Windows (x86/x64 only, arm64 requires cmake): 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). 2. Extract `w64devkit` on your pc. @@ -60,6 +60,17 @@ In order to build llama.cpp you have four different options. cmake -B build -G "Xcode" cmake --build build --config Debug ``` + - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers: + - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). 
In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...): + - Tab Workload: Desktop-development with C++ + - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang) + - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test + - For Windows on ARM (arm64, WoA) build with: + ```bash + cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF + cmake --build build-arm64-windows-llvm-release + ``` + Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels. - Using `gmake` (FreeBSD): @@ -181,6 +192,19 @@ The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/c | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | +### MUSA + +- Using `make`: + ```bash + make GGML_MUSA=1 + ``` +- Using `CMake`: + + ```bash + cmake -B build -DGGML_MUSA=ON + cmake --build build --config Release + ``` + ### hipBLAS This provides BLAS acceleration on HIP-supported AMD GPUs. diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 155743639..67b3d2774 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,7 +21,6 @@ else() add_subdirectory(embedding) add_subdirectory(eval-callback) add_subdirectory(export-lora) - add_subdirectory(finetune) add_subdirectory(gbnf-validator) add_subdirectory(gguf-hash) add_subdirectory(gguf-split) @@ -53,5 +52,4 @@ else() add_subdirectory(simple) add_subdirectory(speculative) add_subdirectory(tokenize) - add_subdirectory(train-text-from-scratch) endif() diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 2442e954d..53fbfb0a8 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -31,7 +31,7 @@ int main(int argc, char ** argv) { int n_parallel = params.n_parallel; // total length of the sequences including the prompt - int n_predict = 32; + int n_predict = params.n_predict; // init LLM diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py index c2c73e8ad..9ab9ab06e 100755 --- a/examples/convert_legacy_llama.py +++ b/examples/convert_legacy_llama.py @@ -24,7 +24,7 @@ from abc import ABC, abstractmethod from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional +from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar import numpy as np @@ -346,42 +346,6 @@ class Params: return params -@dataclass -class Metadata: - name: Optional[str] = None - author: Optional[str] = None - version: Optional[str] = None - url: Optional[str] = None - description: Optional[str] = None - license: Optional[str] = None - source_url: 
Optional[str] = None - source_hf_repo: Optional[str] = None - - @staticmethod - def load(metadata_path: Path) -> Metadata: - if metadata_path is None or not metadata_path.exists(): - return Metadata() - - with open(metadata_path, 'r') as file: - data = json.load(file) - - # Create a new Metadata instance - metadata = Metadata() - - # Assigning values to Metadata attributes if they exist in the JSON file - # This is based on LLM_KV_NAMES mapping in llama.cpp - metadata.name = data.get("general.name") - metadata.author = data.get("general.author") - metadata.version = data.get("general.version") - metadata.url = data.get("general.url") - metadata.description = data.get("general.description") - metadata.license = data.get("general.license") - metadata.source_url = data.get("general.source.url") - metadata.source_hf_repo = data.get("general.source.huggingface.repository") - - return metadata - - # # data loading # TODO: reuse (probably move to gguf.py?) @@ -806,7 +770,7 @@ class OutputFile: def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) - def add_meta_model(self, params: Params, metadata: Metadata | None) -> None: + def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None: # Metadata About The Model And Its Provenence name = "LLaMA" if metadata is not None and metadata.name is not None: @@ -824,16 +788,73 @@ class OutputFile: self.gguf.add_author(metadata.author) if metadata.version is not None: self.gguf.add_version(metadata.version) - if metadata.url is not None: - self.gguf.add_url(metadata.url) + if metadata.organization is not None: + self.gguf.add_organization(metadata.organization) + + if metadata.finetune is not None: + self.gguf.add_finetune(metadata.finetune) + if metadata.basename is not None: + self.gguf.add_basename(metadata.basename) + if metadata.description is not None: self.gguf.add_description(metadata.description) + if metadata.quantized_by is not None: + self.gguf.add_quantized_by(metadata.quantized_by) + + if metadata.size_label is not None: + self.gguf.add_size_label(metadata.size_label) + if metadata.license is not None: - self.gguf.add_licence(metadata.license) + self.gguf.add_license(metadata.license) + if metadata.license_name is not None: + self.gguf.add_license_name(metadata.license_name) + if metadata.license_link is not None: + self.gguf.add_license_link(metadata.license_link) + + if metadata.url is not None: + self.gguf.add_url(metadata.url) + if metadata.doi is not None: + self.gguf.add_doi(metadata.doi) + if metadata.uuid is not None: + self.gguf.add_uuid(metadata.uuid) + if metadata.repo_url is not None: + self.gguf.add_repo_url(metadata.repo_url) + if metadata.source_url is not None: self.gguf.add_source_url(metadata.source_url) - if metadata.source_hf_repo is not None: - self.gguf.add_source_hf_repo(metadata.source_hf_repo) + if metadata.source_doi is not None: + self.gguf.add_source_doi(metadata.source_doi) + if metadata.source_uuid is not None: + self.gguf.add_source_uuid(metadata.source_uuid) + if metadata.source_repo_url is not None: + self.gguf.add_source_repo_url(metadata.source_repo_url) + + if metadata.base_models is not None: + self.gguf.add_base_model_count(len(metadata.base_models)) + for key, base_model_entry in enumerate(metadata.base_models): + if "name" in base_model_entry: + self.gguf.add_base_model_name(key, base_model_entry["name"]) + if "author" in base_model_entry: + 
self.gguf.add_base_model_author(key, base_model_entry["author"]) + if "version" in base_model_entry: + self.gguf.add_base_model_version(key, base_model_entry["version"]) + if "organization" in base_model_entry: + self.gguf.add_base_model_organization(key, base_model_entry["organization"]) + if "url" in base_model_entry: + self.gguf.add_base_model_url(key, base_model_entry["url"]) + if "doi" in base_model_entry: + self.gguf.add_base_model_doi(key, base_model_entry["doi"]) + if "uuid" in base_model_entry: + self.gguf.add_base_model_uuid(key, base_model_entry["uuid"]) + if "repo_url" in base_model_entry: + self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"]) + + if metadata.tags is not None: + self.gguf.add_tags(metadata.tags) + if metadata.languages is not None: + self.gguf.add_languages(metadata.languages) + if metadata.datasets is not None: + self.gguf.add_datasets(metadata.datasets) def add_meta_arch(self, params: Params) -> None: # Metadata About The Neural Architecture Itself @@ -944,7 +965,7 @@ class OutputFile: @staticmethod def write_vocab_only( fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata | None = None, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) @@ -978,7 +999,7 @@ class OutputFile: fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, - metadata: Metadata | None = None, + metadata: gguf.Metadata | None = None, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) @@ -1021,35 +1042,32 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT raise ValueError(f"Unexpected combination of types: {name_to_type}") -def model_parameter_count(model: LazyModel) -> int: - total_model_parameters = 0 - for i, (name, lazy_tensor) in enumerate(model.items()): - sum_weights_in_tensor = 1 +def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]: + total_params = 0 + shared_params = 0 + expert_params = 0 + + for name, lazy_tensor in tensors: + # We don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + continue + + # Got A Tensor + sum_weights_in_tensor: int = 1 + + # Tensor Volume for dim in lazy_tensor.shape: sum_weights_in_tensor *= dim - total_model_parameters += sum_weights_in_tensor - return total_model_parameters + if ".experts." in name: + if ".experts.0." 
in name: + expert_params += sum_weights_in_tensor + else: + shared_params += sum_weights_in_tensor -def model_parameter_count_rounded_notation(model_params_count: int) -> str: - if model_params_count > 1e12 : - # Trillions Of Parameters - scaled_model_params = model_params_count * 1e-12 - scale_suffix = "T" - elif model_params_count > 1e9 : - # Billions Of Parameters - scaled_model_params = model_params_count * 1e-9 - scale_suffix = "B" - elif model_params_count > 1e6 : - # Millions Of Parameters - scaled_model_params = model_params_count * 1e-6 - scale_suffix = "M" - else: - # Thousands Of Parameters - scaled_model_params = model_params_count * 1e-3 - scale_suffix = "K" + total_params += sum_weights_in_tensor - return f"{round(scaled_model_params)}{scale_suffix}" + return total_params, shared_params, expert_params def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: @@ -1231,34 +1249,24 @@ class VocabFactory: return vocab, special_vocab -def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str: - quantization = { +def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str: + name = metadata.name if metadata.name is not None else None + basename = metadata.basename if metadata.basename is not None else None + finetune = metadata.finetune if metadata.finetune is not None else None + version = metadata.version if metadata.version is not None else None + size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0) + + output_type = { GGMLFileType.AllF32: "F32", GGMLFileType.MostlyF16: "F16", GGMLFileType.MostlyQ8_0: "Q8_0", }[file_type] - parameters = model_parameter_count_rounded_notation(model_params_count) - - expert_count = "" - if params.n_experts is not None: - expert_count = f"{params.n_experts}x" - - version = "" - if metadata is not None and metadata.version is not None: - version = f"-{metadata.version}" - - name = "ggml-model" - if metadata is not None and metadata.name is not None: - name = metadata.name - elif params.path_model is not None: - name = params.path_model.name - - return f"{name}{version}-{expert_count}{parameters}-{quantization}" + return gguf.naming_convention(name, basename, finetune, version, size_label, output_type) -def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path: - default_filename = default_convention_outfile(file_type, params, model_params_count, metadata) +def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path: + default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata) ret = model_paths[0].parent / f"{default_filename}.gguf" if ret in model_paths: logger.error( @@ -1297,8 +1305,9 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") parser.add_argument("--verbose", action="store_true", help="increase output verbosity") - parser.add_argument("--metadata", type=Path, help="Specify the path 
for a metadata file") + parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file") parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name") + parser.add_argument("--model-name", type=str, default=None, help="name of the model") args = parser.parse_args(args_in) @@ -1310,32 +1319,36 @@ def main(args_in: list[str] | None = None) -> None: else: logging.basicConfig(level=logging.INFO) - metadata = Metadata.load(args.metadata) + model_name = args.model_name + dir_model = args.model + + metadata = gguf.Metadata.load(args.metadata, dir_model, model_name) if args.get_outfile: - model_plus = load_some_model(args.model) + model_plus = load_some_model(dir_model) params = Params.load(model_plus) - model = convert_model_names(model_plus.model, params, args.skip_unknown) - model_params_count = model_parameter_count(model_plus.model) - ftype = pick_output_type(model, args.outtype) - print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100 + model = convert_model_names(model_plus.model, params, args.skip_unknown) + model_params_count = per_model_weight_count_estimation(model_plus.model.items()) + ftype = pick_output_type(model, args.outtype) + + if (metadata is None or metadata.name is None) and params.path_model is not None: + metadata.name = params.path_model.name + + print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100 return if args.no_vocab and args.vocab_only: raise ValueError("--vocab-only does not make sense with --no-vocab") if args.dump_single: - model_plus = lazy_load_file(args.model) + model_plus = lazy_load_file(dir_model) do_dump_model(model_plus) return if not args.vocab_only: - model_plus = load_some_model(args.model) + model_plus = load_some_model(dir_model) else: - model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) - - model_params_count = model_parameter_count(model_plus.model) - logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})") + model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None) if args.dump: do_dump_model(model_plus) @@ -1368,7 +1381,7 @@ def main(args_in: list[str] | None = None) -> None: logger.info(f"params = {params}") model_parent_path = model_plus.paths[0].parent - vocab_path = Path(args.vocab_dir or args.model or model_parent_path) + vocab_path = Path(args.vocab_dir or dir_model or model_parent_path) vocab_factory = VocabFactory(vocab_path) vocab_types = None if args.no_vocab else args.vocab_type.split(",") vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path) @@ -1399,13 +1412,21 @@ def main(args_in: list[str] | None = None) -> None: assert params is not None + if metadata.name is None and params.path_model is not None: + metadata.name = params.path_model.name + + model_params_count = per_model_weight_count_estimation(model_plus.model.items()) + logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})") + logger.info(f"Vocab info: {vocab}") logger.info(f"Special vocab info: {special_vocab}") model = model_plus.model model = convert_model_names(model, params, args.skip_unknown) ftype = pick_output_type(model, args.outtype) model = convert_to_output_type(model, ftype) - outfile = args.outfile or default_outfile(model_plus.paths, 
ftype, params, model_params_count, metadata) + outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata) + + metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0) params.ftype = ftype logger.info(f"Writing {outfile}, format {ftype}") diff --git a/examples/deprecation-warning/README.md b/examples/deprecation-warning/README.md index 1e20feb4a..59918ec2b 100644 --- a/examples/deprecation-warning/README.md +++ b/examples/deprecation-warning/README.md @@ -13,7 +13,6 @@ Please update all scripts and workflows to use the new binary names. | server | llama-server | | llama-bench | llama-bench | | embedding | llama-embedding | -| finetune | llama-finetune | | quantize | llama-quantize | | tokenize | llama-tokenize | | export-lora | llama-export-lora | @@ -45,7 +44,6 @@ Please update all scripts and workflows to use the new binary names. | save-load-state | llama-save-load-state | | simple | llama-simple | | speculative | llama-speculative | -| train-text-from-scratch | llama-train-text-from-scratch | | vdot | llama-vdot | | tests/test-c.o | tests/test-c.o | diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index c8a3016a4..37d30ab8c 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne } else if (type == GGML_TYPE_I8) { v = (float) *(int8_t *) &data[i]; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } printf("%12.4f", v); sum += v; diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md index 1fb17feec..91c33c34a 100644 --- a/examples/export-lora/README.md +++ b/examples/export-lora/README.md @@ -6,12 +6,11 @@ Apply LORA adapters to base model and export the resulting model. usage: llama-export-lora [options] options: - -h, --help show this help message and exit - -m FNAME, --model-base FNAME model path from which to load base model (default '') - -o FNAME, --model-out FNAME path to save exported model (default '') - -l FNAME, --lora FNAME apply LoRA adapter - -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S - -t N, --threads N number of threads to use during computation (default: 4) + -m, --model model path from which to load base model (default '') + --lora FNAME path to LoRA adapter (can be repeated to use multiple adapters) + --lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters) + -t, --threads N number of threads to use during computation (default: 4) + -o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf') ``` For example: @@ -20,7 +19,15 @@ For example: ./bin/llama-export-lora \ -m open-llama-3b-v2-q8_0.gguf \ -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ - -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin + --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf ``` -Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters. 
+Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters: + +```bash +./bin/llama-export-lora \ + -m your_base_model.gguf \ + -o your_merged_model.gguf \ + --lora-scaled lora_task_A.gguf 0.5 \ + --lora-scaled lora_task_B.gguf 0.5 +``` diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 16f27aa77..150f7e8d5 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -1,465 +1,420 @@ - #include "common.h" #include "ggml.h" #include "ggml-alloc.h" +#include #include #include #include +#include -struct lora_info { - std::string filename; +static bool g_verbose = false; + +static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){ + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); +} + +static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) { + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id); +} + +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} + +static std::string ggml_ne_string(const ggml_tensor * t) { + std::string str; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + str += std::to_string(t->ne[i]); + if (i + 1 < GGML_MAX_DIMS) { + str += ", "; + } + } + return str; +} + +static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) { + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ ctx_ggml, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params); + if (!ctx_gguf) { + throw std::runtime_error("failed to load input GGUF from " + fname); + } + return ctx_gguf; +} + +static void replace_all(std::string & s, const std::string & search, const std::string & replace) { + std::string result; + for (size_t pos = 0; ; pos += search.length()) { + auto new_pos = s.find(search, pos); + if (new_pos == std::string::npos) { + result += s.substr(pos, s.size() - pos); + break; + } + result += s.substr(pos, new_pos - pos) + replace; + pos = new_pos; + } + s = std::move(result); +} + +struct file_input { + struct ggml_context * ctx_meta = nullptr; + struct gguf_context * ctx_gguf = nullptr; + std::ifstream f_in; + std::map tensors; + float alpha; float scale; + + file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) { + if (!f_in.is_open()) { + throw std::runtime_error("failed to open input gguf from " + fname); + } + + ctx_gguf = load_gguf(fname, &ctx_meta); + alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha"); + printf("%s: loaded gguf from %s\n", __func__, fname.c_str()); + + for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) { + std::string name(cur->name); + tensors[name] = cur; + if (g_verbose) { + printf("%s: %s\n", __func__, cur->name); + } + } + } + + ggml_tensor * get_tensor(std::string name) { + if (tensors.find(name) == tensors.end()) { + return nullptr; + } + return tensors[name]; + } + + void read_tensor_data(std::string name, std::vector & buf) { + if (tensors.find(name) == tensors.end()) { + throw std::runtime_error("cannot find tensor with name: " + name); + } + auto len = ggml_nbytes(tensors[name]); + if (buf.size() < len) { + buf.resize(len); + } + auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx 
of tensor in the input file + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); + f_in.seekg(offset); + f_in.read((char* )buf.data(), len); + } + + ~file_input() { + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + } }; -struct export_lora_params { - std::string fn_model_base; - std::string fn_model_out; - std::vector lora; +struct lora_merge_ctx { + // input base model + adapters + file_input base_model; + std::vector> adapters; + + // for computing merged tensor int n_threads; -}; + ggml_backend_t backend = nullptr; + ggml_gallocr_t allocr = nullptr; + std::vector read_buf; -struct lora_data { - struct lora_info info; - std::vector data; - struct ggml_context * ctx; + // output file + struct gguf_context * ctx_out; + struct ggml_context * ctx_out_ggml; + std::ofstream fout; - uint32_t lora_r; - uint32_t lora_alpha; -}; + lora_merge_ctx( + std::string & base_fname, + std::vector> & lora_files, + std::string & outfile, + int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { + fout.exceptions(std::ofstream::failbit); // fail fast on write errors -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; + if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) { + throw std::runtime_error("split model is not yet supported"); + } - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; + for (auto lora_inp : lora_files) { + auto fname = std::get<0>(lora_inp); + auto scale = std::get<1>(lora_inp); + std::unique_ptr adapter(new file_input(fname, scale)); + check_metadata_lora(adapter.get()); + adapters.push_back(std::move(adapter)); + } + + ctx_out = gguf_init_empty(); + struct ggml_init_params params = { + /*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_out_ggml = ggml_init(params); + backend = ggml_backend_cpu_init(); + allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + } + + void check_metadata_lora(file_input * adapter) { + auto general_type = get_kv_str(adapter->ctx_gguf, "general.type"); + if (general_type != "adapter") { + throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); + } + + auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type"); + if (adapter_type != "lora") { + throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); + } + + auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture"); + auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture"); + if (general_arch_base != general_arch_lora) { + throw std::runtime_error("model arch and LoRA arch mismatch"); + } + } + + ggml_type get_out_tensor_type(struct ggml_tensor * t) { + if (t->type == GGML_TYPE_F32) { + return GGML_TYPE_F32; } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); + return GGML_TYPE_F16; } } - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } + void run_merge() { + // prepare metadata + gguf_set_kv(ctx_out, base_model.ctx_gguf); + // output is forced to f16 for now + gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16); - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = 
_fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; + // check if all lora adapters have the same tensors + // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777 + static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once."; + if (adapters.size() > 1) { + for (size_t i = 1; i < adapters.size(); ++i) { + if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) { + throw std::runtime_error(err_no_subset_adapter); + } + for (auto & it : adapters[i]->tensors) { + if (adapters[0]->get_tensor(it.first) == nullptr) { + throw std::runtime_error(err_no_subset_adapter); + } + } + } } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - die_fmt("read error: %s", strerror(errno)); + + // mapping base tensor to out tensor (same shape with base, but different type) + // if out_tensor == nullptr, we only copy it + std::vector> base_to_out_tensors; + for (auto & it : base_model.tensors) { + bool t_a = true; + bool t_b = true; + for (auto & adapter : adapters) { + t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a"); + t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b"); + } + auto base_tensor = it.second; + if (!t_a && !t_b) { + // only copy + struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); + ggml_set_name(cpy_tensor, base_tensor->name); + base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr)); + gguf_add_tensor(ctx_out, cpy_tensor); + } else if (t_a && t_b) { + // need merging + struct ggml_tensor * out_tensor = ggml_new_tensor( + ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne); + ggml_set_name(out_tensor, base_tensor->name); + base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor)); + gguf_add_tensor(ctx_out, out_tensor); + } else { + throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b"); + } } - if (ret != 1) { - die("unexpectedly reached end of file"); + + // placeholder for the meta data + { + size_t meta_size = gguf_get_meta_size(ctx_out); + zeros(fout, meta_size); } - } - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; + // process base model tensors + size_t n_merged = 0; + for (auto & it : base_to_out_tensors) { + if (it.second != nullptr) { + merge_tensor(it.first, it.second); + n_merged++; + } else { + copy_tensor(it.first); + } } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - die_fmt("write error: %s", strerror(errno)); + + // write output metadata + { + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.seekp(0); + fout.write((const char *)data.data(), data.size()); } + + printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged); + printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size()); } - void write_u32(std::uint32_t val) { - 
write_raw(&val, sizeof(val)); + void copy_tensor(struct ggml_tensor * base) { + printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str()); + size_t len = ggml_nbytes(base); + base_model.read_tensor_data(base->name, read_buf); + fout.write((char* )read_buf.data(), len); + zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); } - bool eof() { - return tell() >= size; - } + void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) { + std::string name_base(base->name); + std::string name_lora_a = name_base + ".lora_a"; + std::string name_lora_b = name_base + ".lora_b"; - ~llama_file() { - if (fp) { - std::fclose(fp); + printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str()); + + // context for input tensor + std::vector inp_a(adapters.size()); + std::vector inp_b(adapters.size()); + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + + // alloc tensors + struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne); + for (size_t i = 0; i < adapters.size(); ++i) { + auto t_a = adapters[i]->get_tensor(name_lora_a); + auto t_b = adapters[i]->get_tensor(name_lora_b); + inp_a[i] = ggml_dup_tensor(ctx, t_a); + inp_b[i] = ggml_dup_tensor(ctx, t_b); } + ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + + // load base tensor to backend buffer + base_model.read_tensor_data(name_base, read_buf); + if (base->type != GGML_TYPE_F32) { + // optionally dequantize it + printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type)); + auto nels = ggml_nelements(inp_base); + ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type); + std::vector dequant_buf(nels * sizeof(float)); + qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels); + ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size()); + } else { + ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base)); + } + + // load lora tensors to backend buffer + for (size_t i = 0; i < adapters.size(); ++i) { + adapters[i]->read_tensor_data(name_lora_a, read_buf); + ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i])); + adapters[i]->read_tensor_data(name_lora_b, read_buf); + ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i])); + } + + // build graph + struct ggml_cgraph * gf; + { + static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static std::vector buf(buf_size); + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx0 = ggml_init(params0); + gf = ggml_new_graph(ctx0); + struct ggml_tensor * cur = inp_base; + for (size_t i = 0; i < adapters.size(); ++i) { + struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))); + struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32)); + // scale + const float alpha = adapters[i]->alpha; + const float rank = (float) inp_b[i]->ne[0]; + const float scale = alpha ? 
adapters[i]->scale * alpha / rank : adapters[i]->scale; + delta = ggml_scale(ctx0, delta, scale); + cur = ggml_add(ctx0, delta, cur); + printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type)); + printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]); + } + cur = ggml_cast(ctx0, cur, out->type); + printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type)); + ggml_build_forward_expand(gf, cur); + ggml_free(ctx0); + } + + // compute + { + ggml_gallocr_alloc_graph(allocr, gf); + ggml_backend_cpu_set_n_threads(backend, n_threads); + ggml_backend_graph_compute(backend, gf); + } + + // write data to output file + { + auto result = gf->nodes[gf->n_nodes - 1]; + size_t len = ggml_nbytes(result); + if (read_buf.size() < len) { + read_buf.resize(len); + } + ggml_backend_tensor_get(result, read_buf.data(), 0, len); + fout.write((char* )read_buf.data(), len); + zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); + } + + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + } + + ~lora_merge_ctx() { + ggml_gallocr_free(allocr); + ggml_backend_free(backend); + gguf_free(ctx_out); + ggml_free(ctx_out_ggml); } }; -static struct export_lora_params get_default_export_lora_params() { - struct export_lora_params result; - result.fn_model_base = ""; - result.fn_model_out = ""; - result.n_threads = GGML_DEFAULT_N_THREADS; - return result; -} +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); -static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -m FNAME, --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base.c_str()); - fprintf(stderr, " -o FNAME, --model-out FNAME path to save exported model (default '%s')\n", params->fn_model_out.c_str()); - fprintf(stderr, " -l FNAME, --lora FNAME apply LoRA adapter\n"); - fprintf(stderr, " -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params->n_threads); -} - -static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) { - bool invalid_param = false; - std::string arg; - struct export_lora_params default_params = get_default_export_lora_params(); - const std::string arg_prefix = "--"; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - - if (arg == "-m" || arg == "--model-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_model_base = argv[i]; - } else if (arg == "-o" || arg == "--model-out") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_model_out = argv[i]; - } else if (arg == "-l" || arg == "--lora") { - if (++i >= argc) { - invalid_param = true; - break; - } - struct lora_info lora; - lora.filename = argv[i]; - lora.scale = 1.0f; - params->lora.push_back(lora); - } else if (arg == "-s" || arg == "--lora-scaled") { - if (++i >= argc) { - invalid_param = true; - break; - } - struct lora_info lora; - lora.filename = argv[i]; - if (++i >= 
argc) { - invalid_param = true; - break; - } - lora.scale = std::stof(argv[i]); - params->lora.push_back(lora); - } else if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_threads = std::stoi(argv[i]); - if (params->n_threads <= 0) { - params->n_threads = std::thread::hardware_concurrency(); - } - } else if (arg == "-h" || arg == "--help") { - export_lora_print_usage(argc, argv, &default_params); - exit(0); - } else { - fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str()); - export_lora_print_usage(argc, argv, &default_params); - exit(1); - } - } - - if (params->fn_model_base == default_params.fn_model_base) { - fprintf(stderr, "error: please specify a filename for model-base.\n"); - export_lora_print_usage(argc, argv, &default_params); - exit(1); - } - if (params->fn_model_out == default_params.fn_model_out) { - fprintf(stderr, "error: please specify a filename for model-out.\n"); - export_lora_print_usage(argc, argv, &default_params); - exit(1); - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str()); - export_lora_print_usage(argc, argv, &default_params); - exit(1); - } - return true; -} - -static void free_lora(struct lora_data * lora) { - if (lora->ctx != NULL) { - ggml_free(lora->ctx); - } - delete lora; -} - -static struct lora_data * load_lora(struct lora_info * info) { - struct lora_data * result = new struct lora_data; - result->info = *info; - result->ctx = NULL; - result->lora_r = 1; - result->lora_alpha = 1; - - struct llama_file file(info->filename.c_str(), "rb"); - if (file.fp == NULL) { - fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n", - info->filename.c_str()); - free_lora(result); - return NULL; - } - - struct ggml_init_params params_ggml; - params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE; - params_ggml.mem_buffer = NULL; - params_ggml.no_alloc = true; - result->ctx = ggml_init(params_ggml); - - uint32_t magic = file.read_u32(); - if (magic != LLAMA_FILE_MAGIC_GGLA) { - die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str()); - } - uint32_t version = file.read_u32(); - if (version != 1) { - die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str()); - } - result->lora_r = file.read_u32(); - result->lora_alpha = file.read_u32(); - // read tensor infos from file - std::vector name_buf; - std::vector tensors; - std::vector tensors_offset; - size_t total_nbytes_pad = 0; - while(!file.eof()) { - int64_t ne[4] = {1,1,1,1}; - uint32_t n_dims = file.read_u32(); - uint32_t namelen = file.read_u32(); - uint32_t type = file.read_u32(); - for (uint32_t k = 0; k < n_dims; ++k) { - ne[k] = (int64_t)file.read_u32(); - } - name_buf.clear(); - name_buf.resize(namelen + 1, '\0'); - file.read_raw(name_buf.data(), namelen); - file.seek((0-file.tell()) & 31, SEEK_CUR); - size_t offset = file.tell(); - struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne); - ggml_set_name(tensor, name_buf.data()); - size_t nbytes = ggml_nbytes(tensor); - size_t nbytes_pad = ggml_nbytes_pad(tensor); - total_nbytes_pad += nbytes_pad; - tensors.push_back(tensor); - tensors_offset.push_back(offset); - file.seek(nbytes, SEEK_CUR); - } - // read tensor data - result->data.resize(total_nbytes_pad); - size_t data_offset = 0; - for (size_t i = 0; i < tensors.size(); ++i) { - struct ggml_tensor * tensor = tensors[i]; - size_t offset = 
tensors_offset[i]; - size_t nbytes = ggml_nbytes(tensor); - size_t nbytes_pad = ggml_nbytes_pad(tensor); - file.seek(offset, SEEK_SET); - tensor->data = result->data.data() + data_offset; - file.read_raw(tensor->data, nbytes); - data_offset += nbytes_pad; - } - return result; -} - - -static struct ggml_cgraph * build_graph_lora( - struct ggml_context * ctx, - struct ggml_tensor * tensor, - struct ggml_tensor * lora_a, - struct ggml_tensor * lora_b, - float scaling -) { - struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b); - if (scaling != 1.0f) { - ab = ggml_scale(ctx, ab, scaling); - } - struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab); - - struct ggml_cgraph * gf = ggml_new_graph(ctx); - ggml_build_forward_expand (gf, res); - return gf; -} - -static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) { - if (lora->ctx == NULL) { - return false; - } - std::string name = ggml_get_name(tensor); - std::string name_a = name + std::string(".loraA"); - std::string name_b = name + std::string(".loraB"); - struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str()); - struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str()); - if (lora_a == NULL || lora_b == NULL) { - return false; - } - - float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r; - - struct ggml_init_params params; - params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5; - params.mem_buffer = NULL; - params.no_alloc = true; - struct ggml_context * ctx = NULL; - struct ggml_gallocr * alloc = NULL; - struct ggml_cgraph * gf = NULL; - - ctx = ggml_init(params); - alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); - gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); - - ggml_gallocr_alloc_graph(alloc, gf); - - struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads); - static std::vector data_work; - data_work.resize(cplan.work_size); - cplan.work_data = data_work.data(); - - ggml_graph_compute(gf, &cplan); - - ggml_gallocr_free(alloc); - ggml_free(ctx); - return true; -} - -static void export_lora(struct export_lora_params * params) { - // load all loras - std::vector loras; - for (size_t i = 0; i < params->lora.size(); ++i) { - struct lora_data * lora = load_lora(¶ms->lora[i]); - if (lora != NULL) { - loras.push_back(lora); - } - } - if (loras.size() == 0) { - fprintf(stderr, "warning: no lora adapters will be applied.\n"); - } - - // open input file - struct llama_file fin(params->fn_model_base.c_str(), "rb"); - if (!fin.fp) { - die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str()); - } - - // open base model gguf, read tensors without their data - struct ggml_context * ctx_in; - struct gguf_init_params params_gguf; - params_gguf.no_alloc = true; - params_gguf.ctx = &ctx_in; - struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf); - - // create new gguf - struct gguf_context * gguf_out = gguf_init_empty(); - - // copy meta data from base model: kv and tensors - gguf_set_kv(gguf_out, gguf_in); - int n_tensors = gguf_get_n_tensors(gguf_in); - for (int i=0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(gguf_in, i); - struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name); - gguf_add_tensor(gguf_out, tensor); - } - - // create output file - struct llama_file fout(params->fn_model_out.c_str(), "wb"); - if (!fout.fp) { - die_fmt("Could not create file '%s'\n", 
params->fn_model_out.c_str()); - } - - // write gguf meta data - std::vector meta; - meta.resize(gguf_get_meta_size(gguf_out)); - gguf_get_meta_data(gguf_out, meta.data()); - fout.write_raw(meta.data(), meta.size()); - - std::vector data; - std::vector padding; - for (int i=0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(gguf_in, i); - struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name); - - // read tensor data - data.resize(ggml_nbytes(tensor)); - tensor->data = data.data(); - size_t offset = gguf_get_tensor_offset(gguf_in, i); - fin.seek(offset + meta.size(), SEEK_SET); - fin.read_raw(data.data(), data.size()); - - // apply all loras - for (size_t k = 0; k < loras.size(); ++k) { - apply_lora(tensor, loras[k], params->n_threads); - } - - // write tensor data + padding - padding.clear(); - padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0); - - GGML_ASSERT(fout.tell() == offset + meta.size()); - // fout.seek(offset + meta.size(), SEEK_SET); - fout.write_raw(data.data(), data.size()); - fout.write_raw(padding.data(), padding.size()); - - if (i % 2 == 0) { - printf("."); - } - } + printf("\nexample usage:\n"); + printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]); + printf("\nNOTE: output model is F16\n"); printf("\n"); - - // close gguf - gguf_free(gguf_out); - gguf_free(gguf_in); - - // free loras - for (size_t i = 0; i < loras.size(); ++i) { - free_lora(loras[i]); - } } int main(int argc, char ** argv) { - struct export_lora_params params = get_default_export_lora_params(); + gpt_params params; - if (!export_lora_params_parse(argc, argv, ¶ms)) { + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); return 1; } - export_lora(¶ms); + g_verbose = (params.verbosity == 1); + try { + lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads); + ctx.run_merge(); + } catch (const std::exception & err) { + fprintf(stderr, "%s\n", err.what()); + exit(EXIT_FAILURE); + } + + printf("done, output file is %s\n", params.lora_outfile.c_str()); return 0; } diff --git a/examples/finetune/CMakeLists.txt b/examples/finetune/CMakeLists.txt deleted file mode 100644 index 64afe6ddc..000000000 --- a/examples/finetune/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-finetune) -add_executable(${TARGET} finetune.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/finetune/README.md b/examples/finetune/README.md deleted file mode 100644 index 1c27df053..000000000 --- a/examples/finetune/README.md +++ /dev/null @@ -1,90 +0,0 @@ -# finetune - -Basic usage instructions: - -```bash -# get training data -wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt - -# finetune LORA adapter -./bin/llama-finetune \ - --model-base open-llama-3b-v2-q8_0.gguf \ - --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \ - --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \ - --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \ - --train-data "shakespeare.txt" \ - --save-every 10 \ - --threads 6 --adam-iter 30 --batch 4 --ctx 64 \ - --use-checkpointing - -# predict -./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin -``` - -**Only llama based models are supported!** The 
output files will be saved every N iterations (config with `--save-every N`). -The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output. -So in above example after 10 iterations these files will be written: -- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf -- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf -- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin -- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin - -After 10 more iterations: -- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf -- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf -- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin -- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin - -Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter. - -llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`. -These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above. - -In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together. - -For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this: - -```bash -./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \ - --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \ - --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin -``` - -You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`. - -For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one: - -```bash -./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \ - --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \ - --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \ - --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin -``` - -The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values. - -Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime. -If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`. - -The default LORA rank can be specified with `--lora-r N`. -The LORA rank can be configured for each model tensor type separately with these command line options: - -```bash - --lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. 
(default 4) - --rank-att-norm N LORA rank for attention norm tensor (default 1) - --rank-ffn-norm N LORA rank for feed-forward norm tensor (default 1) - --rank-out-norm N LORA rank for output norm tensor (default 1) - --rank-tok-embd N LORA rank for token embeddings tensor (default 4) - --rank-out N LORA rank for output tensor (default 4) - --rank-wq N LORA rank for wq tensor (default 4) - --rank-wk N LORA rank for wk tensor (default 4) - --rank-wv N LORA rank for wv tensor (default 4) - --rank-wo N LORA rank for wo tensor (default 4) - --rank-ffn_gate N LORA rank for ffn_gate tensor (default 4) - --rank-ffn_down N LORA rank for ffn_down tensor (default 4) - --rank-ffn_up N LORA rank for ffn_up tensor (default 4) -``` - -The LORA rank of 'norm' tensors should always be 1. - -To see all available options use `llama-finetune --help`. diff --git a/examples/finetune/convert_finetune_checkpoint_to_gguf.py b/examples/finetune/convert_finetune_checkpoint_to_gguf.py deleted file mode 100644 index 1b79d6995..000000000 --- a/examples/finetune/convert_finetune_checkpoint_to_gguf.py +++ /dev/null @@ -1,487 +0,0 @@ -#!/usr/bin/env python3 -# finetune checkpoint --> gguf conversion - -import argparse -import gguf -import struct -import numpy as np -from pathlib import Path - -# gguf constants -LLM_KV_OPTIMIZER_TYPE = "optimizer.type" -LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" -LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs" -LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version" -LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count" -LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count" -LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count" -LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized" -LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss" -LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss" -LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count" -LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count" -LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end" -LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count" - -LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments" -LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments" -LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values" - -LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters" -LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters" -LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients" -LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients" -LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction" -LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y" - -LLM_KV_TRAINING_TYPE_TRAIN_MODEL = 
"train_model" -LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora" -LLM_KV_TRAINING_TYPE = "training.type" -LLM_KV_TRAINING_FILE_VERSION = "training.file_version" -LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" -LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" -LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" - -LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd" -LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm" -LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output" -LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm" -LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q" -LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k" -LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v" -LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output" -LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm" -LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate" -LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down" -LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up" - -class Tensor: - def __init__(self, dtype='f', ne=None): - if ne is None: - ne = [] - self.dtype = dtype - self.ne = ne - self.nbytes = 0 - if self.dtype == 'f': - if len(self.ne) == 0: - self.nbytes = 0 - else: - self.nbytes = int(np.prod(self.ne)) * 4 - else: - raise ValueError(f"Unhandled data type '{self.dtype}'") - - def load(self, data, offset): - nd = struct.unpack(' 0 else []) - - self.lbfgs_x = Tensor('f', [self.nx]) - self.lbfgs_xp = Tensor('f', [self.nx]) - self.lbfgs_g = Tensor('f', [self.nx]) - self.lbfgs_gp = Tensor('f', [self.nx]) - self.lbfgs_d = Tensor('f', [self.nx]) - self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) - self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) - self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) - self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) - self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m]) - - # forgot to save type in version 1: - # guess self.type from number of remaining bytes - size_type_0 = 12 + sum([t.max_storage_size() for t in - [self.adam_m, self.adam_v] - +([self.adam_pf] if (self.past > 0) else [])]) - size_type_1 = 24 + sum([t.max_storage_size() for t in - [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g, - self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf, - self.lbfgs_lmal, self.lbfgs_lmys, - self.lbfgs_lms, self.lbfgs_lmy] - +([self.lbfgs_pf] if (self.past > 0) else [])]) - # due to alignment padding the size might not by exact - # but the difference in size for both types is significant, - # so we can just use whichever is closest - remaining = len(data) - offset - if abs(remaining - size_type_0) < abs(remaining - size_type_1): - self.type = 0 - else: - self.type = 1 - - if self.type == 0: - offset = self.adam_m.load(data, offset) - offset = self.adam_v.load(data, offset) - offset = self.adam_pf.load(data,offset) - - self.adam_fx_best = struct.unpack(' 0: - self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES) - - elif self.type == 1: - gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS) - gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m) - gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best) - gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step) - gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j) - 
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k) - gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end) - gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement) - - self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS) - self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS) - self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS) - self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS) - self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION) - if self.past > 0: - self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES) - self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA) - self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS) - self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S) - self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y) - else: - raise ValueError('Unknown optimizer type') - -class LoraParams: - def __init__(self): - pass - - def load(self, data, offset): - self.n_rank_attention_norm = struct.unpack(' -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -struct my_llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; - uint32_t n_embd = 4096; - uint32_t n_ff = 11008; - uint32_t n_head = 32; - uint32_t n_head_kv = 32; - uint32_t n_layer = 32; - - // float f_norm_eps = 1e-5f; // falcon - float f_norm_rms_eps = 1e-5f; // llama - - float rope_freq_base = 10000.0f; - float rope_freq_scale = 1.0f; - - uint32_t n_gqa() const { - return n_head/n_head_kv; - } - - uint32_t n_embd_head() const { - return n_embd/n_head; - } - - uint32_t n_embd_gqa() const { - return n_embd/n_gqa(); - } - - bool operator!=(const my_llama_hparams& other) const { - return memcmp(this, &other, sizeof(other)); - } -}; - -struct my_llama_layer { - // normalization - struct ggml_tensor * attention_norm; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - - // normalization - struct ggml_tensor * ffn_norm; - - // ff - struct ggml_tensor * ffn_gate; // w1 - struct ggml_tensor * ffn_down; // w2 - struct ggml_tensor * ffn_up; // w3 -}; - -struct my_llama_model { - struct my_llama_hparams hparams; - - struct ggml_tensor * tok_embeddings; - - struct ggml_tensor * norm; - struct ggml_tensor * output; - - std::vector layers; -}; - -struct my_llama_lora_hparams { - uint32_t lora_r = 1; - uint32_t lora_alpha = 1; - uint32_t n_rank_attention_norm = 1; - uint32_t n_rank_wq = 4; - uint32_t n_rank_wk = 4; - uint32_t n_rank_wv = 4; - uint32_t n_rank_wo = 4; - uint32_t n_rank_ffn_norm = 1; - uint32_t n_rank_ffn_gate = 4; - uint32_t n_rank_ffn_down = 4; - uint32_t n_rank_ffn_up = 4; - uint32_t n_rank_tok_embeddings = 4; - uint32_t n_rank_norm = 1; - uint32_t n_rank_output = 4; - - bool operator!=(const my_llama_lora_hparams& other) const { - return memcmp(this, &other, sizeof(other)); - } -}; - -struct my_llama_lora_layer { - // normalization - struct ggml_tensor * attention_norm_a; - struct ggml_tensor * attention_norm_b; - - // attention - struct ggml_tensor * wq_a; - struct ggml_tensor * wq_b; - struct ggml_tensor * wk_a; - struct ggml_tensor * wk_b; - struct 
ggml_tensor * wv_a; - struct ggml_tensor * wv_b; - struct ggml_tensor * wo_a; - struct ggml_tensor * wo_b; - - // normalization - struct ggml_tensor * ffn_norm_a; - struct ggml_tensor * ffn_norm_b; - - // ff - struct ggml_tensor * ffn_gate_a; - struct ggml_tensor * ffn_gate_b; - struct ggml_tensor * ffn_down_a; - struct ggml_tensor * ffn_down_b; - struct ggml_tensor * ffn_up_a; - struct ggml_tensor * ffn_up_b; -}; - -struct my_llama_lora { - struct ggml_context * ctx = NULL; - ggml_backend_buffer_t data; - - my_llama_lora_hparams hparams; - - struct ggml_tensor * tok_embeddings_a; - struct ggml_tensor * tok_embeddings_b; - - struct ggml_tensor * norm_a; - struct ggml_tensor * norm_b; - struct ggml_tensor * output_a; - struct ggml_tensor * output_b; - - std::vector layers; -}; - -// gguf constants -static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; -static const char * LLM_KV_TRAINING_TYPE = "training.type"; - -static const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"; -static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"; -static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"; -static const char * LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"; -static const char * LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"; -static const char * LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"; -static const char * LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"; - -// gguf constants (sync with gguf.py) - -static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; -static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; - -static const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; -static const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; -static const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; -static const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; -static const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; -static const char * LLM_KV_ATTENTION_HEAD_COUNT_KV = "%s.attention.head_count_kv"; -static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; -static const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; -static const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp -static const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; - -static const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; -static const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; -static const char * LLM_TENSOR_OUTPUT = "output"; -static const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; -static const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; -static const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; -static const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; -static const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; -static const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; -static const char * LLM_TENSOR_FFN_GATE = 
"blk.%d.ffn_gate"; -static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; -static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; - -static void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab : %u\n", __func__, params->n_vocab); - printf("%s: n_ctx : %u\n", __func__, params->n_ctx); - printf("%s: n_embd : %u\n", __func__, params->n_embd); - printf("%s: n_ff : %u\n", __func__, params->n_ff); - printf("%s: n_head : %u\n", __func__, params->n_head); - printf("%s: n_head_kv : %u\n", __func__, params->n_head_kv); - printf("%s: n_layer : %u\n", __func__, params->n_layer); - printf("%s: norm_rms_eps : %f\n", __func__, params->f_norm_rms_eps); - printf("%s: rope_freq_base : %f\n", __func__, params->rope_freq_base); - printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale); -} - -static void print_lora_params(struct my_llama_lora_hparams * params) { - printf("%s: n_rank_attention_norm : %u\n", __func__, params->n_rank_attention_norm); - printf("%s: n_rank_wq : %u\n", __func__, params->n_rank_wq); - printf("%s: n_rank_wk : %u\n", __func__, params->n_rank_wk); - printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv); - printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo); - printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm); - printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate); - printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down); - printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up); - printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings); - printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm); - printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); -} - -#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ -{ \ - const std::string skey(key); \ - const int kid = gguf_find_key(ctx, skey.c_str()); \ - if (kid >= 0) { \ - enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ - if (ktype != (type)) { \ - die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ - } \ - (dst) = func(ctx, kid); \ - } else if (req) { \ - die_fmt("key not found in model: %s", skey.c_str()); \ - } \ -} - -static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_hparams * hparams, const char * expected_arch) { - std::string arch; - - GGUF_GET_KEY(ctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); - if (expected_arch != NULL) { - if (arch != expected_arch) { - printf("%s: arch=%s expected_arch=%s\n", __func__, arch.c_str(), expected_arch); - } - GGML_ASSERT(arch == expected_arch); - } - - std::vector keybuf; - keybuf.resize(512); - auto kv = [&arch, &keybuf](const char * key) -> const char * { - snprintf(keybuf.data(), keybuf.size(), key, arch.c_str()); - return keybuf.data(); - }; - - GGUF_GET_KEY(ctx, hparams->n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); - GGUF_GET_KEY(ctx, hparams->n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); - GGUF_GET_KEY(ctx, hparams->n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); - GGUF_GET_KEY(ctx, hparams->n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); - GGUF_GET_KEY(ctx, hparams->n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); - - // n_head_kv is optional, default to n_head - hparams->n_head_kv = hparams->n_head; - GGUF_GET_KEY(ctx, hparams->n_head_kv, gguf_get_val_u32, 
GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); - - float rope_freq_scale = 1.0f; - GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); - GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); - GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - if (rope_freq_scale != 1.0f) { - hparams->rope_freq_scale = 1.0f / rope_freq_scale; - } -} - -static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) { - auto & hparams = model->hparams; - - std::vector tn_buf; - tn_buf.resize(GGML_MAX_NAME); - auto tn = [&tn_buf](const char * key) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); - return tn_buf.data(); - }; - auto tni = [&tn_buf](const char * key, int bid) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), key, bid); - std::string s = tn_buf.data(); - snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); - return tn_buf.data(); - }; - - - // get parameters directly from gguf file - { - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ NULL, - }; - struct gguf_context * mctx = gguf_init_from_file(fn_model, params); - - load_model_hparams_gguf(mctx, &hparams, "llama"); - - gguf_free(mctx); - } - hparams.n_vocab = llama_n_vocab(input); - hparams.n_ctx = n_ctx; - - // get tensors from llama_model (possibly mmapped) - model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD)); - model->norm = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM)); - model->output = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT)); - - assert_shape_2d(model->tok_embeddings, hparams.n_embd, hparams.n_vocab); - assert_shape_1d(model->norm, hparams.n_embd); - assert_shape_2d(model->output, hparams.n_embd, hparams.n_vocab); - - model->layers.resize(hparams.n_layer); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - layer.attention_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_NORM, i)); - layer.wq = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_Q, i)); - layer.wk = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_K, i)); - layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i)); - layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i)); - layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i)); - layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); - layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); - layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); - - assert_shape_1d(layer.attention_norm, hparams.n_embd); - assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd); - assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd_gqa()); - assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa()); - assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd); - assert_shape_1d(layer.ffn_norm, hparams.n_embd); - assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff); - assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd); - assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff); - } -} - -static void set_param_lora(struct my_llama_lora * lora) { - const uint32_t n_layer = lora->layers.size(); - - struct ggml_context* ctx = lora->ctx; - - ggml_set_param(ctx, 
lora->tok_embeddings_a); - ggml_set_param(ctx, lora->tok_embeddings_b); - ggml_set_param(ctx, lora->norm_a); - ggml_set_param(ctx, lora->norm_b); - ggml_set_param(ctx, lora->output_a); - ggml_set_param(ctx, lora->output_b); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = lora->layers[i]; - - ggml_set_param(ctx, layer.attention_norm_a); - ggml_set_param(ctx, layer.attention_norm_b); - ggml_set_param(ctx, layer.wq_a); - ggml_set_param(ctx, layer.wq_b); - ggml_set_param(ctx, layer.wk_a); - ggml_set_param(ctx, layer.wk_b); - ggml_set_param(ctx, layer.wv_a); - ggml_set_param(ctx, layer.wv_b); - ggml_set_param(ctx, layer.wo_a); - ggml_set_param(ctx, layer.wo_b); - ggml_set_param(ctx, layer.ffn_norm_a); - ggml_set_param(ctx, layer.ffn_norm_b); - ggml_set_param(ctx, layer.ffn_gate_a); - ggml_set_param(ctx, layer.ffn_gate_b); - ggml_set_param(ctx, layer.ffn_down_a); - ggml_set_param(ctx, layer.ffn_down_b); - ggml_set_param(ctx, layer.ffn_up_a); - ggml_set_param(ctx, layer.ffn_up_b); - } -} - -static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) { - const auto & lparams = lora->hparams; - - const uint32_t n_embd = model->hparams.n_embd; - const uint32_t n_embd_gqa = model->hparams.n_embd_gqa(); - const uint32_t n_layer = model->hparams.n_layer; - const uint32_t n_vocab = model->hparams.n_vocab; - const uint32_t n_ff = model->hparams.n_ff; - - std::vector tn_buf; - tn_buf.resize(GGML_MAX_NAME); - auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix); - return tn_buf.data(); - }; - auto tni = [&tn_buf](const char * key, const char * suffix, int bid) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), key, bid); - std::string s = tn_buf.data(); - snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix); - return tn_buf.data(); - }; - - // context for lora tensors without their data - struct ggml_init_params ctx_lora_params; - ctx_lora_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18); - ctx_lora_params.mem_buffer = NULL; - ctx_lora_params.no_alloc = true; - - struct ggml_context * ctx = ggml_init(ctx_lora_params); - lora->ctx = ctx; - - lora->tok_embeddings_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_embd); - lora->tok_embeddings_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_vocab); - lora->norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, n_embd); - lora->norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, 1); - lora->output_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_embd); - lora->output_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_vocab); - - ggml_set_name(lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_a")); - ggml_set_name(lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_b")); - ggml_set_name(lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_a")); - ggml_set_name(lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_b")); - ggml_set_name(lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.lora_a")); - ggml_set_name(lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.lora_b")); - - lora->layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = lora->layers[i]; - - layer.attention_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, n_embd); - layer.attention_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 
lparams.n_rank_attention_norm, 1); - - layer.wq_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); - layer.wq_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); - layer.wk_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd); - layer.wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd_gqa); - layer.wv_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd); - layer.wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd_gqa); - layer.wo_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); - layer.wo_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); - - layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd); - layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1); - - layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd); - layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff); - layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff); - layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd); - layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd); - layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff); - - ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i)); - ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i)); - ggml_set_name(layer.wq_a, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_a", i)); - ggml_set_name(layer.wq_b, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_b", i)); - ggml_set_name(layer.wk_a, tni(LLM_TENSOR_ATTN_K, ".weight.lora_a", i)); - ggml_set_name(layer.wk_b, tni(LLM_TENSOR_ATTN_K, ".weight.lora_b", i)); - ggml_set_name(layer.wv_a, tni(LLM_TENSOR_ATTN_V, ".weight.lora_a", i)); - ggml_set_name(layer.wv_b, tni(LLM_TENSOR_ATTN_V, ".weight.lora_b", i)); - ggml_set_name(layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_a", i)); - ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i)); - ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i)); - ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i)); - ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); - ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); - ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); - ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); - ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); - ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); - } - - set_param_lora(lora); - - // allocate data for lora tensors - lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); -} - -static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { - const uint32_t n_layer = lora->layers.size(); - - struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); - - randomize_tensor_normal(lora->tok_embeddings_a, rnd); - ggml_set_zero(lora->tok_embeddings_b); - randomize_tensor_normal(lora->norm_a, rnd); - ggml_set_zero(lora->norm_b); - randomize_tensor_normal(lora->output_a, rnd); - ggml_set_zero(lora->output_b); 
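Side note on the initialization above (illustrative, not part of the patch): `randomize_lora` draws the `*_a` matrices from a normal distribution and zeroes the `*_b` matrices, so the initial LoRA delta `B·A` is exactly zero and the first forward pass reproduces the base model unchanged. A minimal standalone C++ sketch with made-up toy dimensions and a row-major layout, just to show the delta vanishing while `B == 0`:

```cpp
// Toy illustration only: A ~ N(0, 0.02), B = 0, so delta W = B·A is zero at init.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

int main() {
    const int n = 4, r = 2;                      // toy embedding size and LoRA rank
    std::vector<float> A(r * n), B(r * n, 0.0f); // B is zero-initialized
    std::mt19937 rng(42);
    std::normal_distribution<float> dist(0.0f, 0.02f);
    for (float & a : A) a = dist(rng);

    // delta[i][j] = sum_k B[k*n + i] * A[k*n + j] -- all zero while B == 0
    float max_abs = 0.0f;
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            float d = 0.0f;
            for (int k = 0; k < r; ++k) {
                d += B[k*n + i] * A[k*n + j];
            }
            max_abs = std::max(max_abs, std::fabs(d));
        }
    }
    printf("max |delta W| at init = %f\n", max_abs); // prints 0.000000
    return 0;
}
```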
- - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = lora->layers[i]; - randomize_tensor_normal(layer.attention_norm_a, rnd); - ggml_set_zero(layer.attention_norm_b); - - randomize_tensor_normal(layer.wq_a, rnd); - ggml_set_zero(layer.wq_b); - randomize_tensor_normal(layer.wk_a, rnd); - ggml_set_zero(layer.wk_b); - randomize_tensor_normal(layer.wv_a, rnd); - ggml_set_zero(layer.wv_b); - randomize_tensor_normal(layer.wo_a, rnd); - ggml_set_zero(layer.wo_b); - - randomize_tensor_normal(layer.ffn_norm_a, rnd); - ggml_set_zero(layer.ffn_norm_b); - - randomize_tensor_normal(layer.ffn_gate_a, rnd); - ggml_set_zero(layer.ffn_gate_b); - randomize_tensor_normal(layer.ffn_down_a, rnd); - ggml_set_zero(layer.ffn_down_b); - randomize_tensor_normal(layer.ffn_up_a, rnd); - ggml_set_zero(layer.ffn_up_b); - } - - free_random_normal_distribution(rnd); -} - -static struct ggml_tensor * llama_build_lora_finetune_graphs( - struct my_llama_model * model, - struct my_llama_lora * lora, - ggml_gallocr_t alloc, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_cgraph * gb_tmp, - struct ggml_tensor * * logits, - struct ggml_tensor * tokens_input, - struct ggml_tensor * targets, - const int n_tokens, - const int n_batch, - const bool enable_flash_attn, - const bool enable_checkpointing, - const bool measure_only) { - - ggml_set_scratch(ctx, { 0, 0, nullptr, }); - const int n_past = 0; - const int N = n_tokens; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_head_kv = hparams.n_head_kv; - const int n_ff = hparams.n_ff; - const int n_rot = hparams.n_embd_head(); - const int n_embd_head = hparams.n_embd_head(); - const int n_embd_gqa = hparams.n_embd_gqa(); - - const float rms_norm_eps = hparams.f_norm_rms_eps; - const float rope_freq_base = hparams.rope_freq_base; - const float rope_freq_scale = hparams.rope_freq_scale; - - GGML_ASSERT((size_t) n_layer == lora->layers.size()); - - auto set_name = [](struct ggml_tensor * t, const char * n) { - ggml_set_name(t, n); - if (t->grad) { - ggml_format_name(t->grad, "%s->grad", n); - } - }; - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); - ggml_set_input(KQ_pos); - - // rope has so much parameters that we make a custom function for it - auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] - (struct ggml_tensor * t) -> struct ggml_tensor * { - // not capturing these, to silcence warnings - const int rope_mode = 0; - - return ggml_rope_ext(ctx, - t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, - rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f - ); - }; - - set_name(tokens_input, "tokens_input"); - set_name(targets, "targets"); - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - - auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) { - return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); - } else if (a->type == GGML_TYPE_F32) { - return ggml_add(ctx, a, b); - } else { - die_fmt("%s: Finetuning on tensors with type '%s' is not yet supported.\n", - __func__, ggml_type_name(a->type)); - } - }; - - struct ggml_tensor * tok_embeddings = add_to_f32(ctx, model->tok_embeddings, ggml_mul_mat(ctx, 
lora->tok_embeddings_a, lora->tok_embeddings_b)); - struct ggml_tensor * norm = add_to_f32(ctx, model->norm, ggml_mul_mat(ctx, lora->norm_a, lora->norm_b)); - struct ggml_tensor * output = add_to_f32(ctx, model->output, ggml_mul_mat(ctx, lora->output_a, lora->output_b)); - - struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch); - struct ggml_tensor * t01 = ggml_get_rows(ctx, tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch); - - struct ggml_tensor * cur = t01; - - std::vector checkpoints; - if (enable_checkpointing) { - checkpoints.push_back(tokens_input); - checkpoints.push_back(targets); - checkpoints.push_back(t00); - checkpoints.push_back(t01); - } - - const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head); - - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - struct my_llama_lora_layer & llayer = lora->layers[il]; - - struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b)); - struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b)); - struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); - struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); - struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); - struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); - struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b)); - struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b)); - struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b)); - - struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); - struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); - struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch); - struct ggml_tensor * t05 = ggml_mul_mat (ctx, wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch); - struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd_head, n_head, N, n_batch); - struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd_head, n_head, N, n_batch); - struct ggml_tensor * t08 = ggml_mul_mat (ctx, wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd_gqa, N*n_batch); - struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd_head, n_head_kv, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd_head, n_head_kv, N, n_batch); - struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd_head, n_head_kv, N, n_batch); - - struct ggml_tensor * t11; - if (ggml_is_quantized(wv->type)) { - struct ggml_tensor * t11_1 = ggml_mul_mat (ctx, wv, t04); set_name(t11_1, "t11_1"); assert_shape_2d(t11_1, n_embd_gqa, N*n_batch); - struct ggml_tensor * t11_2 = ggml_transpose(ctx, t11_1); set_name(t11_2, "t11_2"); assert_shape_2d(t11_2, N*n_batch, 
n_embd_gqa); - t11 = ggml_cont (ctx, t11_2); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa); - } else { - t11 = ggml_mul_mat (ctx, t04, wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa); - } - - struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd_head, n_head_kv); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd_head, n_head_kv); - struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd_head, N, n_head, n_batch); - struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd_head, N, n_head_kv, n_batch); - struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch); - struct ggml_tensor * t16; - if (enable_flash_attn) { - GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported"); - //t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); - } else { - struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); - struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); - struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch); - struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch); - t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); - } - struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd_head, n_head, N, n_batch); - struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd_head, n_head, N, n_batch); - struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); - struct ggml_tensor * t20 = ggml_mul_mat (ctx, wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); - struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); - struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); - struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); - struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); - struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); - struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); - struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); - struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); - struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); - struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); - cur 
= t30; - if (enable_checkpointing) { - checkpoints.push_back(cur); - } - } - struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = ggml_repeat (ctx, norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); - struct ggml_tensor * t34 = ggml_mul_mat (ctx, output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); - - if (enable_checkpointing) { - checkpoints.push_back(t31); - checkpoints.push_back(t32); - checkpoints.push_back(t33); - checkpoints.push_back(t34); - checkpoints.push_back(t35); - checkpoints.push_back(t36); - } - - ggml_build_forward_expand(gf, t36); - - if (enable_checkpointing) { - ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); - } else { - ggml_graph_cpy(gf, gb); - ggml_build_backward_expand(ctx, gf, gb, true); - } - - GGML_ASSERT(alloc != NULL); - - // make sure some tensors are not reallocated by inserting new temporary nodes depending on them - int n_leafs_before = gb->n_leafs; - int n_nodes_before = gb->n_nodes; - - // output tensors - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f)); - // input gradient - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f)); - GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); - ggml_set_input(t36->grad); - // KQ_pos - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); - - // make sure base model tensors data cannot be used in viewable operations - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f)); - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f)); - } - - // allocating checkpoints in one block to reduce memory fragmentation - // note: they will be freed in reverse order - for (unsigned int i = 0; i < checkpoints.size(); ++i) { - if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { - ggml_set_input(checkpoints[i]); - } - } - - if (measure_only) { - ggml_gallocr_reserve(alloc, gb); - } else { - ggml_gallocr_alloc_graph(alloc, gb); - - // set KQ_pos - { - int * data = (int *) KQ_pos->data; 
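For context (illustrative, not from the patch): the loop above fills `KQ_pos` with `n_past + i`; since finetuning always starts without a KV cache, `n_past` is 0 and every sequence in the batch sees positions `0..N-1` for RoPE. A tiny standalone sketch of the same fill, with toy values:

```cpp
// Standalone sketch: RoPE positions for a fresh training batch are just n_past + i.
#include <cstdio>
#include <vector>

int main() {
    const int n_past = 0, N = 8;   // toy values
    std::vector<int> pos(N);
    for (int i = 0; i < N; ++i) {
        pos[i] = n_past + i;
    }
    for (int p : pos) {
        printf("%d ", p);          // 0 1 2 3 4 5 6 7
    }
    printf("\n");
    return 0;
}
```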
- for (int i = 0; i < N; ++i) { - data[i] = n_past + i; - } - } - } - - // remove the additional nodes and leafs - for (int i = n_leafs_before; i < gb->n_leafs; ++i) { - gb->leafs[i] = NULL; - } - for (int i = n_nodes_before; i < gb->n_nodes; ++i) { - gb->nodes[i] = NULL; - } - gb->n_leafs = n_leafs_before; - gb->n_nodes = n_nodes_before; - - *logits = t35; - return t36; -} - -static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) { - // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read - - std::string arch; - - std::vector keybuf; - keybuf.resize(512); - - GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); - GGML_ASSERT(arch == "llama"); - - uint32_t ftype_u; - GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE); - GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32); - - struct my_llama_hparams hparams; - load_model_hparams_gguf(fctx, &hparams, arch.c_str()); - - // parameters that define tensor shapes must match - GGML_ASSERT(hparams.n_embd == model->hparams.n_embd); - GGML_ASSERT(hparams.n_ff == model->hparams.n_ff); - GGML_ASSERT(hparams.n_head == model->hparams.n_head); - GGML_ASSERT(hparams.n_head_kv == model->hparams.n_head_kv); - GGML_ASSERT(hparams.n_layer == model->hparams.n_layer); - - GGUF_GET_KEY(fctx, lora->hparams.n_rank_tok_embeddings, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_output, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_attention_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_wq, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_Q); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_wk, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_K); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); - - init_lora(model, lora); - - copy_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a)); - copy_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b)); - copy_tensor_by_name(lora->norm_a, f_ggml_ctx, ggml_get_name(lora->norm_a)); - copy_tensor_by_name(lora->norm_b, f_ggml_ctx, ggml_get_name(lora->norm_b)); - copy_tensor_by_name(lora->output_a, f_ggml_ctx, ggml_get_name(lora->output_a)); - copy_tensor_by_name(lora->output_b, f_ggml_ctx, 
ggml_get_name(lora->output_b)); - - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - copy_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a)); - copy_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b)); - copy_tensor_by_name(layer.wq_a, f_ggml_ctx, ggml_get_name(layer.wq_a)); - copy_tensor_by_name(layer.wq_b, f_ggml_ctx, ggml_get_name(layer.wq_b)); - copy_tensor_by_name(layer.wk_a, f_ggml_ctx, ggml_get_name(layer.wk_a)); - copy_tensor_by_name(layer.wk_b, f_ggml_ctx, ggml_get_name(layer.wk_b)); - copy_tensor_by_name(layer.wv_a, f_ggml_ctx, ggml_get_name(layer.wv_a)); - copy_tensor_by_name(layer.wv_b, f_ggml_ctx, ggml_get_name(layer.wv_b)); - copy_tensor_by_name(layer.wo_a, f_ggml_ctx, ggml_get_name(layer.wo_a)); - copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); - copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); - copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); - copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a)); - copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b)); - copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a)); - copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b)); - copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a)); - copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b)); - } -} - -static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora) { - const char * arch = "llama"; - enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; - - std::vector keybuf; - keybuf.resize(512); - auto kv = [arch, &keybuf](const char * key) -> const char * { - snprintf(keybuf.data(), keybuf.size(), key, arch); - return keybuf.data(); - }; - - gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); - gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); - - gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx); - gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd); - gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff); - gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head); - gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV), model->hparams.n_head_kv); - gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer); - gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_embd_head()); - gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps); - gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), model->hparams.rope_freq_base); - gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), model->hparams.rope_freq_scale); - - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, lora->hparams.n_rank_tok_embeddings); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, lora->hparams.n_rank_norm); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT, lora->hparams.n_rank_output); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, lora->hparams.n_rank_attention_norm); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_Q, lora->hparams.n_rank_wq); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_K, lora->hparams.n_rank_wk); - gguf_set_val_u32(fctx, 
LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up); - - gguf_add_tensor(fctx, lora->tok_embeddings_a); - gguf_add_tensor(fctx, lora->tok_embeddings_b); - gguf_add_tensor(fctx, lora->norm_a); - gguf_add_tensor(fctx, lora->norm_b); - gguf_add_tensor(fctx, lora->output_a); - gguf_add_tensor(fctx, lora->output_b); - - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - - gguf_add_tensor(fctx, layer.attention_norm_a); - gguf_add_tensor(fctx, layer.attention_norm_b); - gguf_add_tensor(fctx, layer.wq_a); - gguf_add_tensor(fctx, layer.wq_b); - gguf_add_tensor(fctx, layer.wk_a); - gguf_add_tensor(fctx, layer.wk_b); - gguf_add_tensor(fctx, layer.wv_a); - gguf_add_tensor(fctx, layer.wv_b); - gguf_add_tensor(fctx, layer.wo_a); - gguf_add_tensor(fctx, layer.wo_b); - gguf_add_tensor(fctx, layer.ffn_norm_a); - gguf_add_tensor(fctx, layer.ffn_norm_b); - gguf_add_tensor(fctx, layer.ffn_gate_a); - gguf_add_tensor(fctx, layer.ffn_gate_b); - gguf_add_tensor(fctx, layer.ffn_down_a); - gguf_add_tensor(fctx, layer.ffn_down_b); - gguf_add_tensor(fctx, layer.ffn_up_a); - gguf_add_tensor(fctx, layer.ffn_up_b); - } -} - -static void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { - std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA; - GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); - GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA); - - load_train_state_gguf(fctx, f_ggml_ctx, train); - load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora); -} - -static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { - gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA); - save_llama_lora_gguf(fctx, model, lora); - save_train_state_gguf(fctx, train); -} - -static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { - struct ggml_context * f_ggml_ctx; - struct gguf_init_params params; - params.no_alloc = false; - params.ctx = &f_ggml_ctx; - struct gguf_context * fctx = gguf_init_from_file(filename, params); - if (fctx == NULL) { - return false; - } - - load_checkpoint_lora_gguf(fctx, f_ggml_ctx, model, lora, train); - - gguf_free(fctx); - return true; -} - -static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { - printf("%s: saving to %s\n", __func__, filename); - struct gguf_context * fctx = gguf_init_empty(); - - save_checkpoint_lora_gguf(fctx, model, lora, train); - - // write file - const bool only_meta = false; - gguf_write_to_file(fctx, filename, only_meta); - gguf_free(fctx); -} - -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const 
char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; - } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - die_fmt("read error: %s", strerror(errno)); - } - if (ret != 1) { - die("unexpectedly reached end of file"); - } - } - - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - die_fmt("write error: %s", strerror(errno)); - } - } - - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) { - if (tensor == NULL) { - file->write_u32(0); - file->write_u32(0); - file->write_u32(GGML_TYPE_F32); - file->seek((0-file->tell()) & 31, SEEK_CUR); - return; - } - if (name == NULL) { - name = ggml_get_name(tensor); - } - uint32_t name_len = strlen(name); - uint32_t nd = ggml_n_dims(tensor); - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - (uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - file->write_u32(nd); - file->write_u32(name_len); - file->write_u32(tensor->type); - file->write_raw(ne, sizeof(ne[0]) * nd); - file->write_raw(name, name_len); - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->write_raw(tensor->data, ggml_nbytes(tensor)); -} - -static void save_as_llama_lora(const char * filename, struct my_llama_lora * lora) { - printf("%s: saving to %s\n", __func__, filename); - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { - return; - } - - std::vector tn_buf; - tn_buf.resize(GGML_MAX_NAME); - - auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix); - return tn_buf.data(); - }; - - auto tni = [&tn_buf](const char * key, int bid, const char * suffix) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), key, bid); - std::string s = tn_buf.data(); - snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix); - return tn_buf.data(); - }; - - // write_magic - file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic - file.write_u32(1); // version - // write_hparams - file.write_u32(lora->hparams.lora_r); - file.write_u32(lora->hparams.lora_alpha); - // write tensors - write_tensor(&file, lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraA")); - write_tensor(&file, lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraB")); - write_tensor(&file, lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraA")); - write_tensor(&file, lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraB")); - 
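For readers of the removed code above: write_tensor() and save_as_llama_lora() emit the legacy binary LoRA adapter ("GGLA") layout, i.e. a 16-byte header (u32 magic, u32 version = 1, u32 lora_r, u32 lora_alpha) followed by one record per tensor: n_dims, name_len, ggml type, the dims, the name, padding up to a 32-byte boundary, then the raw data. What follows is only a rough reader sketch, not part of the patch; it assumes little-endian byte order, F32 tensor data, the 0x67676c61 'ggla' magic value, and a placeholder file name.

import math
import struct

def read_u32s(f, n):
    data = f.read(4 * n)
    return struct.unpack(f"<{n}I", data) if len(data) == 4 * n else None

def read_ggla(path):
    with open(path, "rb") as f:
        magic, version, lora_r, lora_alpha = read_u32s(f, 4)
        assert magic == 0x67676C61 and version == 1   # LLAMA_FILE_MAGIC_GGLA ('ggla'); value assumed
        print(f"lora_r={lora_r} lora_alpha={lora_alpha} scaling={lora_alpha / lora_r}")
        while (rec := read_u32s(f, 3)) is not None:
            n_dims, name_len, ggml_type = rec
            ne = read_u32s(f, n_dims) if n_dims else ()
            name = f.read(name_len).decode("utf-8")
            f.seek(-f.tell() & 31, 1)                 # skip padding to the next 32-byte boundary
            if n_dims:                                # a NULL tensor record carries no data
                assert ggml_type == 0                 # this sketch only handles GGML_TYPE_F32
                f.seek(4 * math.prod(ne), 1)          # skip the raw F32 data
            print(name or "<none>", ne)

# read_ggla("lora-ol3b-shakespeare-LATEST.bin")      # hypothetical output of the removed finetune example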
write_tensor(&file, lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.loraA")); - write_tensor(&file, lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.loraB")); - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - write_tensor(&file, layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraA")); - write_tensor(&file, layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraB")); - write_tensor(&file, layer.wq_a, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraA")); - write_tensor(&file, layer.wq_b, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraB")); - write_tensor(&file, layer.wk_a, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraA")); - write_tensor(&file, layer.wk_b, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraB")); - write_tensor(&file, layer.wv_a, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraA")); - write_tensor(&file, layer.wv_b, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraB")); - write_tensor(&file, layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraA")); - write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB")); - write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA")); - write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB")); - write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); - write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); - write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); - write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); - write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); - write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); - } -} - -struct train_params { - struct train_params_common common; - - const char * fn_model_base; - const char * fn_lora_out; - - bool only_write_lora; - - float f_norm_rms_eps; - float rope_freq_base; - float rope_freq_scale; - - bool custom_f_norm_rms_eps; - bool custom_rope_freq_base; - bool custom_rope_freq_scale; - - int32_t lora_r; - int32_t lora_alpha; - bool custom_lora_alpha; - - uint32_t n_rank_attention_norm; - uint32_t n_rank_wq; - uint32_t n_rank_wk; - uint32_t n_rank_wv; - uint32_t n_rank_wo; - uint32_t n_rank_ffn_norm; - uint32_t n_rank_ffn_gate; - uint32_t n_rank_ffn_down; - uint32_t n_rank_ffn_up; - uint32_t n_rank_tok_embeddings; - uint32_t n_rank_norm; - uint32_t n_rank_output; - - bool custom_n_rank_attention_norm; - bool custom_n_rank_wq; - bool custom_n_rank_wk; - bool custom_n_rank_wv; - bool custom_n_rank_wo; - bool custom_n_rank_ffn_norm; - bool custom_n_rank_ffn_gate; - bool custom_n_rank_ffn_down; - bool custom_n_rank_ffn_up; - bool custom_n_rank_tok_embeddings; - bool custom_n_rank_norm; - bool custom_n_rank_output; -}; - -static struct train_params get_default_train_params() { - struct train_params params; - params.common = get_default_train_params_common(); - params.fn_model_base = ""; - params.fn_lora_out = "ggml-lora-ITERATION-f32.gguf"; - - params.only_write_lora = false; - - params.f_norm_rms_eps = 1e-5f; - params.rope_freq_base = 10000.0f; - params.rope_freq_scale = 1.0f; - - params.custom_f_norm_rms_eps = false; - params.custom_rope_freq_base = false; - params.custom_rope_freq_scale = false; - - params.lora_r = 4; - params.lora_alpha = 4; - params.custom_lora_alpha = false; - - params.n_rank_attention_norm = 1; - params.n_rank_wq = 4; - params.n_rank_wk = 4; - params.n_rank_wv = 4; - params.n_rank_wo = 4; - 
params.n_rank_ffn_norm = 1; - params.n_rank_ffn_gate = 4; - params.n_rank_ffn_down = 4; - params.n_rank_ffn_up = 4; - params.n_rank_tok_embeddings = 4; - params.n_rank_norm = 1; - params.n_rank_output = 4; - - params.custom_n_rank_attention_norm = false; - params.custom_n_rank_wq = false; - params.custom_n_rank_wk = false; - params.custom_n_rank_wv = false; - params.custom_n_rank_wo = false; - params.custom_n_rank_ffn_norm = false; - params.custom_n_rank_ffn_gate = false; - params.custom_n_rank_ffn_down = false; - params.custom_n_rank_ffn_up = false; - params.custom_n_rank_tok_embeddings = false; - params.custom_n_rank_norm = false; - params.custom_n_rank_output = false; - - return params; -} - -static void train_print_usage(int argc, char ** argv, const struct train_params * params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - - fprintf(stderr, " --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base); - fprintf(stderr, " --lora-out FNAME path to save llama lora (default '%s')\n", params->fn_lora_out); - fprintf(stderr, " --only-write-lora only save llama lora, don't do any training. use this if you only want to convert a checkpoint to a lora adapter.\n"); - fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); - fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); - fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); - fprintf(stderr, " --lora-alpha N LORA alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha); - fprintf(stderr, " --lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default %d)\n", params->lora_r); - fprintf(stderr, " --rank-att-norm N LORA rank for attention norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); - fprintf(stderr, " --rank-ffn-norm N LORA rank for feed-forward norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); - fprintf(stderr, " --rank-out-norm N LORA rank for output norm tensor, overrides default rank. 
Norm tensors should generally have rank 1.\n"); - fprintf(stderr, " --rank-tok-embd N LORA rank for token embeddings tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-out N LORA rank for output tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-wq N LORA rank for wq tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n"); - - print_common_train_usage(argc, argv, ¶ms->common); -} - -static bool train_params_parse(int argc, char ** argv, struct train_params * params) { - bool invalid_param = false; - std::string arg; - struct train_params default_params = get_default_train_params(); - const std::string arg_prefix = "--"; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - - if (consume_common_train_arg(argc, argv, &i, ¶ms->common, &invalid_param)) { - if (invalid_param) { - break; - } else if (params->common.print_usage) { - train_print_usage(argc, argv, &default_params); - exit(0); - } - } else if (arg == "--model-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_model_base = argv[i]; - } else if (arg == "--lora-out") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_lora_out = argv[i]; - } else if (arg == "--only-write-lora") { - params->only_write_lora = true; - } else if (arg == "--norm-rms-eps") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->f_norm_rms_eps = std::stof(argv[i]); - params->custom_f_norm_rms_eps = true; - } else if (arg == "--rope-freq-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->rope_freq_base = std::stof(argv[i]); - params->custom_rope_freq_base = true; - } else if (arg == "--rope-freq-scale") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->rope_freq_scale = std::stof(argv[i]); - params->custom_rope_freq_scale = true; - } else if (arg == "--lora-alpha") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->lora_alpha = std::stoi(argv[i]); - params->custom_lora_alpha = true; - } else if (arg == "--lora-r") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->lora_r = std::stoi(argv[i]); - } else if (arg == "--rank-att-norm") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_attention_norm = std::stoi(argv[i]); - params->custom_n_rank_attention_norm = true; - } else if (arg == "--rank-ffn-norm") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_ffn_norm = std::stoi(argv[i]); - params->custom_n_rank_ffn_norm = true; - } else if (arg == "--rank-out-norm") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_norm = std::stoi(argv[i]); - params->custom_n_rank_norm = true; - } else if (arg == "--rank-tok-embd") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_tok_embeddings = std::stoi(argv[i]); - params->custom_n_rank_tok_embeddings = true; - } else if 
(arg == "--rank-out") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_output = std::stoi(argv[i]); - params->custom_n_rank_output = true; - } else if (arg == "--rank-wq") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_wq = std::stoi(argv[i]); - params->custom_n_rank_wq = true; - } else if (arg == "--rank-wk") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_wk = std::stoi(argv[i]); - params->custom_n_rank_wk = true; - } else if (arg == "--rank-wv") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_wv = std::stoi(argv[i]); - params->custom_n_rank_wv = true; - } else if (arg == "--rank-wo") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_wo = std::stoi(argv[i]); - params->custom_n_rank_wo = true; - } else if (arg == "--rank-ffn_gate") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_ffn_gate = std::stoi(argv[i]); - params->custom_n_rank_ffn_gate = true; - } else if (arg == "--rank-ffn_down") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_ffn_down = std::stoi(argv[i]); - params->custom_n_rank_ffn_down = true; - } else if (arg == "--rank-ffn_up") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_ffn_up = std::stoi(argv[i]); - params->custom_n_rank_ffn_up = true; - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - train_print_usage(argc, argv, &default_params); - exit(1); - } - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - train_print_usage(argc, argv, &default_params); - exit(1); - } - finish_processing_train_args(¶ms->common); - return true; -} - -struct save_train_files_data { - const char * fn_checkpoint_out; - const char * fn_lora_out; - const char * pattern_fn_it; - const char * fn_latest; - struct my_llama_model * model; - struct my_llama_lora * lora; -}; - -static void save_train_files(void * vdata, struct train_state * train) { - struct save_train_files_data * data = (struct save_train_files_data *) vdata; - - int64_t iter = train->opt->iter; - - if (strlen(data->fn_checkpoint_out) > 0) { - save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->model, data->lora, train); - save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->model, data->lora, train); - } - if (strlen(data->fn_lora_out) > 0) { - save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->lora); - save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->lora); - } -} - -static int64_t get_parameter_count(struct my_llama_lora* lora) { - int64_t nx = 0; - nx += ggml_nelements(lora->tok_embeddings_a); - nx += ggml_nelements(lora->tok_embeddings_b); - nx += ggml_nelements(lora->norm_a); - nx += ggml_nelements(lora->norm_b); - nx += ggml_nelements(lora->output_a); - nx += ggml_nelements(lora->output_b); - - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - nx += ggml_nelements(layer.attention_norm_a); - nx += ggml_nelements(layer.attention_norm_b); - nx += ggml_nelements(layer.wq_a); - nx += ggml_nelements(layer.wq_b); - nx += ggml_nelements(layer.wk_a); - nx += ggml_nelements(layer.wk_b); - nx += 
ggml_nelements(layer.wv_a); - nx += ggml_nelements(layer.wv_b); - nx += ggml_nelements(layer.wo_a); - nx += ggml_nelements(layer.wo_b); - nx += ggml_nelements(layer.ffn_norm_a); - nx += ggml_nelements(layer.ffn_norm_b); - nx += ggml_nelements(layer.ffn_gate_a); - nx += ggml_nelements(layer.ffn_gate_b); - nx += ggml_nelements(layer.ffn_down_a); - nx += ggml_nelements(layer.ffn_down_b); - nx += ggml_nelements(layer.ffn_up_a); - nx += ggml_nelements(layer.ffn_up_b); - } - return nx; -} - -int main(int argc, char ** argv) { - struct train_params params = get_default_train_params(); - - if (!train_params_parse(argc, argv, ¶ms)) { - return 1; - } - - if (params.common.seed == LLAMA_DEFAULT_SEED) { - params.common.seed = time(NULL); - } - printf("%s: seed: %u\n", __func__, params.common.seed); - srand(params.common.seed); - - struct llama_model_params llama_mparams = llama_model_default_params(); - llama_mparams.n_gpu_layers = params.common.n_gpu_layers; - llama_mparams.vocab_only = false; - - printf("%s: model base = '%s'\n", __func__, params.fn_model_base); - struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_mparams); - - struct llama_context_params llama_cparams = llama_context_default_params(); - struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_cparams); - - struct my_llama_model model; - init_model(lmodel, &model, params.fn_model_base, params.common.n_ctx); - - struct my_llama_lora lora; - - struct train_state * train = init_train_state(); - struct ggml_opt_context * opt = train->opt; - - // set params from command line - if (params.custom_f_norm_rms_eps) { - model.hparams.f_norm_rms_eps = params.f_norm_rms_eps; - } - if (params.custom_rope_freq_base) { - model.hparams.rope_freq_base = params.rope_freq_base; - } - if (params.custom_rope_freq_scale) { - model.hparams.rope_freq_scale = params.rope_freq_scale; - } - lora.hparams.lora_r = params.lora_r; - lora.hparams.lora_alpha = params.custom_lora_alpha ? params.lora_alpha : params.lora_r; - uint32_t n_rank_attention_norm = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1; - uint32_t n_rank_wq = params.custom_n_rank_wq ? params.n_rank_wq : params.lora_r; - uint32_t n_rank_wk = params.custom_n_rank_wk ? params.n_rank_wk : params.lora_r; - uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; - uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; - uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; - uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r; - uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r; - uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r; - uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; - uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; - uint32_t n_rank_output = params.custom_n_rank_output ? 
params.n_rank_output : params.lora_r; - lora.hparams.n_rank_attention_norm = n_rank_attention_norm; - lora.hparams.n_rank_wq = n_rank_wq; - lora.hparams.n_rank_wk = n_rank_wk; - lora.hparams.n_rank_wv = n_rank_wv; - lora.hparams.n_rank_wo = n_rank_wo; - lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm; - lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate; - lora.hparams.n_rank_ffn_down = n_rank_ffn_down; - lora.hparams.n_rank_ffn_up = n_rank_ffn_up; - lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings; - lora.hparams.n_rank_norm = n_rank_norm; - lora.hparams.n_rank_output = n_rank_output; - - // set opt params from command line - opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); - opt->params.print_forward_graph = false; - opt->params.print_backward_graph = false; - opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; - opt->params.n_threads = params.common.n_threads; - opt->params.past = params.common.opt_past; - opt->params.delta = params.common.opt_delta; - opt->params.max_no_improvement = params.common.opt_max_no_improvement; - opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation; - opt->params.adam.n_iter = params.common.adam_n_iter; - opt->params.adam.sched = 1.0f; - opt->params.adam.alpha = params.common.adam_alpha; - opt->params.adam.decay = params.common.adam_decay; - opt->params.adam.decay_min_ndim = params.common.adam_decay_min_ndim; - opt->params.adam.beta1 = params.common.adam_beta1; - opt->params.adam.beta2 = params.common.adam_beta2; - opt->params.adam.gclip = params.common.adam_gclip; - opt->params.adam.eps_f = params.common.adam_eps_f; - - printf("%s: init model\n", __func__); - bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train); - - if (existed) { - // overwrite last n_ctx with user provided n_ctx - if (params.common.custom_n_ctx) { - model.hparams.n_ctx = params.common.n_ctx; - } - - const bool opt_param_count_changed = ( - (lora.hparams.n_rank_attention_norm != n_rank_attention_norm) - || (lora.hparams.n_rank_wq != n_rank_wq) - || (lora.hparams.n_rank_wk != n_rank_wk) - || (lora.hparams.n_rank_wv != n_rank_wv) - || (lora.hparams.n_rank_wo != n_rank_wo) - || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm) - || (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate) - || (lora.hparams.n_rank_ffn_down != n_rank_ffn_down) - || (lora.hparams.n_rank_ffn_up != n_rank_ffn_up) - || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings) - || (lora.hparams.n_rank_norm != n_rank_norm) - || (lora.hparams.n_rank_output != n_rank_output) - ); - - const bool opt_past_changed = opt->params.past != params.common.opt_past; - - if (opt_param_count_changed) { - print_lora_params(&lora.hparams); - die("Provided rank differs from checkpoint file. To use different rank start finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting."); - // need to discard previous optimizer gradient statistics and opt_init with new shapes - // TODO - } - if (opt_past_changed) { - die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. 
Aborting"); - // need to discard previous optimizer past function value statistics and opt_init with new shapes - // TODO - } - } else { // existed == false - init_lora(&model, &lora); - randomize_lora(&lora, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f); - if (!params.only_write_lora) { - ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&lora)); - } - } - opt->iter = train->train_its; - - print_params(&model.hparams); - print_lora_params(&lora.hparams); - printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its); - printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); - printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); - printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); - printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f)); - - if (params.only_write_lora) { - save_train_files_data save_data; - save_data.fn_checkpoint_out = ""; - save_data.fn_lora_out = params.fn_lora_out; - save_data.pattern_fn_it = params.common.pattern_fn_it; - save_data.fn_latest = params.common.fn_latest; - save_data.model = &model; - save_data.lora = &lora; - - save_train_files(&save_data, train); - - free_train_state(train); - ggml_free(lora.ctx); - llama_free(lctx); - llama_free_model(lmodel); - return 0; - } - - printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); - printf("%s: opt iter %d\n", __func__, opt->iter); - - int n_tokens = model.hparams.n_ctx; - int n_vocab = model.hparams.n_vocab; - int n_batch = params.common.n_batch; - - // context for input tensors without their data - struct ggml_init_params ctx_input_params = { - ggml_tensor_overhead() * 2, // mem_size - NULL, // mem_buffer - true, // no_alloc - }; - struct ggml_context * ctx_input = ggml_init(ctx_input_params); - - // the input tensors - struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); - struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - - // allocate input tensors - // measure required memory for input tensors - ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type()); - size_t max_input_size = ggml_backend_buffer_get_size(input_data); - printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); - - // context for compute tensors without their data - const size_t estimated_compute_size_wo_data = ( - 2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() + - (params.common.use_checkpointing ? 
3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true)) - ); - struct ggml_init_params ctx_compute_params = { - estimated_compute_size_wo_data, // mem_size - NULL, // mem_buffer - true, // no_alloc - }; - struct ggml_context * ctx_compute = NULL; - - struct ggml_tensor * loss = NULL; - struct ggml_tensor * logits = NULL; - - struct ggml_cgraph * gf = NULL; - struct ggml_cgraph * gb = NULL; - struct ggml_cgraph * gb_tmp = NULL; - - // measure required memory for compute tensors - size_t best_compute_size = SIZE_MAX; - enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT; - // find best evaluation order - for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { - ctx_compute = ggml_init(ctx_compute_params); - ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); - gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); - gf->order = (enum ggml_cgraph_eval_order) order; - gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); - gb_tmp = params.common.use_checkpointing - ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) - : NULL; - loss = llama_build_lora_finetune_graphs( - &model, &lora, alloc, ctx_compute, - gf, gb, gb_tmp, - &logits, tokens_input, target_probs, - n_tokens, n_batch, - params.common.use_flash, - params.common.use_checkpointing, - true - ); - size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer - if (max_compute_size < best_compute_size) { - best_compute_size = max_compute_size; - best_order = gf->order; - } - ggml_gallocr_free(alloc); - ggml_free(ctx_compute); - } - size_t max_compute_size = best_compute_size; - printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f)); - printf("%s: evaluation order = %s\n", __func__, - (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" : - (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" : - "invalid"); - - // allocate compute tensors - ctx_compute = ggml_init(ctx_compute_params); - ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); - gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); - gf->order = best_order; - gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); - gb_tmp = params.common.use_checkpointing - ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) - : NULL; - loss = llama_build_lora_finetune_graphs( - &model, &lora, alloc, ctx_compute, - gf, gb, gb_tmp, - &logits, tokens_input, target_probs, - n_tokens, n_batch, - params.common.use_flash, - params.common.use_checkpointing, - false - ); - - // tokenize data - std::vector train_tokens; - std::vector train_samples_begin; - std::vector train_samples_size; - printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data); - printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str()); - printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? 
"true" : "false"); - tokenize_file(lctx, - params.common.fn_train_data, - params.common.sample_start, - params.common.include_sample_start, - params.common.overlapping_samples, - n_tokens, - train_tokens, - train_samples_begin, - train_samples_size); - GGML_ASSERT(train_samples_begin.size() == train_samples_size.size()); - - printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size()); - - std::vector token_noccurs; - token_noccurs.resize(model.hparams.n_vocab, 0); - for (unsigned int i = 0; i < train_tokens.size(); ++i) { - ++token_noccurs[train_tokens[i]]; - } - int n_unique_tokens = 0; - for (unsigned int i = 0; i < token_noccurs.size(); ++i) { - if (token_noccurs[i] == 0) continue; - ++n_unique_tokens; - } - printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); - - size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size()); - const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size()); - if (changed_train_data) { - printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__); - } - if (params.common.force_reshuffle) { - printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__); - } - if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) { - train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed); - train->shuffle_sample_count = train_samples_size.size(); - train->shuffle_next_sample = 0; - train->shuffle_samples_hash = shuffle_samples_hash; - } - std::vector train_shuffled_samples_offs; - std::vector train_shuffled_samples_begin; - std::vector train_shuffled_samples_size; - train_shuffled_samples_offs.resize(train_samples_begin.size()); - train_shuffled_samples_begin.resize(train_samples_begin.size()); - train_shuffled_samples_size.resize(train_samples_size.size()); - train->shuffle_rng_state_next = shuffle_samples( - train->shuffle_rng_state_current, - train_shuffled_samples_offs.data(), - train_shuffled_samples_begin.data(), - train_shuffled_samples_size.data(), - train_samples_begin.data(), - train_samples_size.data(), - train_samples_size.size()); - - printf("%s: begin training\n", __func__); - - save_train_files_data save_data; - save_data.fn_checkpoint_out = params.common.fn_checkpoint_out; - save_data.fn_lora_out = params.fn_lora_out; - save_data.pattern_fn_it = params.common.pattern_fn_it; - save_data.fn_latest = params.common.fn_latest; - save_data.model = &model; - save_data.lora = &lora; - - struct train_opt_callback_data opt_cb_data; - opt_cb_data.params = ¶ms.common; - opt_cb_data.train = train; - opt_cb_data.save_cb = &save_train_files; - opt_cb_data.save_data = &save_data; - opt_cb_data.lctx = lctx; - opt_cb_data.last_save_iter = opt->iter; - opt_cb_data.tokens_data = train_tokens.data(); - opt_cb_data.tokens_size = train_tokens.size(); - opt_cb_data.samples_begin = train_samples_begin.data(); - opt_cb_data.samples_size = train_samples_size.data(); - opt_cb_data.shuffled_samples_offs = train_shuffled_samples_offs.data(); - opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data(); - opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data(); - opt_cb_data.samples_count = train_samples_size.size(); - opt_cb_data.tokens_input = tokens_input; - opt_cb_data.target_probs = target_probs; - 
opt_cb_data.first_iter = opt->iter; - opt_cb_data.first_epoch = train->train_epochs; - opt_cb_data.iter_at_last_epoch = -1; - opt_cb_data.last_time = ggml_time_ms(); - opt_cb_data.millis_per_iter = 0.0; - - // measure required memory for work buffer - size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE; - printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); - - // context for work buffer - struct ggml_init_params ctx_work_params = { - max_work_size, // mem_size - NULL, // mem_buffer - false, // no_alloc - }; - struct ggml_context * ctx_work = ggml_init(ctx_work_params); - - int64_t t0 = ggml_time_ms(); - - ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data); - - ggml_free(ctx_work); - ggml_free(ctx_compute); - ggml_free(ctx_input); - ggml_gallocr_free(alloc); - - - int64_t t1 = ggml_time_ms(); - printf("%s: total training time: ", __func__); - print_duration((double) (t1 - t0)); - printf("\n"); - - int new_iters = opt->iter - opt_cb_data.last_save_iter; - if (new_iters > 0) { - train->train_its += new_iters; - train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens; - - save_train_files(&save_data, train); - opt_cb_data.last_save_iter = opt->iter; - } - - ggml_free(opt->ctx); - free_train_state(train); - ggml_free(lora.ctx); - llama_free(lctx); - llama_free_model(lmodel); - return 0; -} diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh deleted file mode 100644 index e3cc7f271..000000000 --- a/examples/finetune/finetune.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -cd `dirname $0` -cd ../.. - -EXE="./llama-finetune" - -if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi -if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi - -# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses. -MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing. 
- -while getopts "dg" opt; do - case $opt in - d) - DEBUGGER="gdb --args" - ;; - g) - EXE="./build/bin/Release/finetune" - GPUARG="--gpu-layers 25" - ;; - esac -done - -$DEBUGGER $EXE \ - --model-base $MODEL \ - $GPUARG \ - --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \ - --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \ - --lora-out lora-ol3b-shakespeare-ITERATION.bin \ - --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \ - --save-every 10 \ - --threads 10 --adam-iter 30 --batch 4 --ctx 64 \ - --use-checkpointing diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index dd53ba9b1..48a705e15 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -16,20 +16,25 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st auto decoded = decode_utf8(input_str, {}); const auto & code_points = decoded.first; + const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); + llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar); + size_t pos = 0; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { - auto prev_stacks = grammar->stacks; - llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks); - if (grammar->stacks.empty()) { + const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy + + llama_grammar_accept(rules, prev_stacks, *it, cur_stacks); + + if (cur_stacks.empty()) { error_pos = pos; error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'"; - grammar->stacks = prev_stacks; + cur_stacks = prev_stacks; return false; } ++pos; } - for (const auto & stack : grammar->stacks) { + for (const auto & stack : cur_stacks) { if (stack.empty()) { return true; } diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 575143771..7498f85ef 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) { struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); + if (!ctx) { + fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str()); + return false; + } + printf("%s: version: %d\n", __func__, gguf_get_version(ctx)); printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 29602881a..bb5faec94 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -1,6 +1,6 @@ # llama.cpp/examples/imatrix -Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models. +Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861 ## Usage diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 574f5ed9c..6ce1863cf 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } else if (e.values.size() != (size_t)src1->ne[0]*n_as) { fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); - exit(1); //GGML_ASSERT(false); + exit(1); //GGML_ABORT("fatal error"); } if (m_params.verbosity > 1) { printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); @@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } else if (e.values.size() != (size_t)src1->ne[0]) { fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); - exit(1); //GGML_ASSERT(false); + exit(1); //GGML_ABORT("fatal error"); } ++e.ncall; if (m_params.verbosity > 1) { diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d641a9f12..521fa8880 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -23,6 +23,10 @@ #include "ggml-cuda.h" #include "ggml-sycl.h" +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + // utils static uint64_t get_time_ns() { using clock = std::chrono::high_resolution_clock; @@ -120,6 +124,17 @@ static std::string get_gpu_info() { id += "/"; } } +#endif +#ifdef GGML_USE_CANN + uint32_t count = ggml_backend_cann_get_device_count(); + for (uint32_t i = 0; i < count; i++) { + char buf[128]; + ggml_backend_cann_get_device_description(i, buf, sizeof(buf)); + id += buf; + if (i < count - 1) { + id += "/"; + } + } #endif // TODO: other backends return id; @@ -135,7 +150,7 @@ static const char * output_format_str(output_formats format) { case JSON: return "json"; case MARKDOWN: return "md"; case SQL: return "sql"; - default: GGML_ASSERT(!"invalid output format"); + default: GGML_ABORT("invalid output format"); } } @@ -161,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) { case LLAMA_SPLIT_MODE_NONE: return "none"; case LLAMA_SPLIT_MODE_LAYER: return "layer"; case LLAMA_SPLIT_MODE_ROW: return "row"; - default: GGML_ASSERT(!"invalid split mode"); + default: GGML_ABORT("invalid split mode"); } } @@ -1311,7 +1326,7 @@ static std::unique_ptr create_printer(output_formats format) { case SQL: return std::unique_ptr(new sql_printer()); } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } int main(int argc, char ** argv) { diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 92a6b16b1..2aafe2316 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { - return env->NewStringUTF(""); + return nullptr; } auto new_token_chars = llama_token_to_piece(context, new_token_id); diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift 
index 2a3f9f758..58c32ca53 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -26,11 +26,12 @@ actor LlamaContext { private var context: OpaquePointer private var batch: llama_batch private var tokens_list: [llama_token] + var is_done: Bool = false /// This variable is used to store temporarily invalid cchars private var temporary_invalid_cchars: [CChar] - var n_len: Int32 = 64 + var n_len: Int32 = 1024 var n_cur: Int32 = 0 var n_decode: Int32 = 0 @@ -160,6 +161,7 @@ actor LlamaContext { if llama_token_is_eog(model, new_token_id) || n_cur == n_len { print("\n") + is_done = true let new_token_str = String(cString: temporary_invalid_cchars + [0]) temporary_invalid_cchars.removeAll() return new_token_str diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift index 2c1e3f61b..b8f6a31d5 100644 --- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -132,7 +132,7 @@ class LlamaState: ObservableObject { messageLog += "\(text)" Task.detached { - while await llamaContext.n_cur < llamaContext.n_len { + while await !llamaContext.is_done { let result = await llamaContext.completion_loop() await MainActor.run { self.messageLog += "\(result)" diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index d6882eec3..7cda5f10c 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -16,6 +16,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" @@ -865,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = peg_0; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -1001,6 +1005,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_TEE("%s: CLIP using Metal backend\n", __func__); #endif +#ifdef GGML_USE_CANN + new_clip->backend = ggml_backend_cann_init(0); + LOG_TEE("%s: CLIP using CANN backend\n", __func__); +#endif + if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 0b171c872..2fe67100e 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -31,7 +31,6 @@ int main(int argc, char ** argv){ // load the model std::tie(model, ctx) = llama_init_from_gpt_params(params); - GGML_ASSERT(llama_n_vocab(model) < (1 << 16)); // tokenize the prompt std::vector inp; @@ -65,7 +64,7 @@ int main(int argc, char ** argv){ } const int n_input = inp.size(); - const int n_ctx = params.n_ctx; + const int n_ctx = llama_n_ctx(ctx); int n_drafted = 0; int n_accept = 0; diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 80ecd925d..bb571bac4 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -39,7 +39,6 @@ int main(int argc, char ** argv){ // load the model std::tie(model, ctx) = llama_init_from_gpt_params(params); - GGML_ASSERT(llama_n_vocab(model) < (1 << 16)); // tokenize the prompt std::vector inp; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index a0d817b1a..61e960ea2 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -124,6 +124,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vectorchat_template, chat_msgs, new_msg, role == 
"user"); chat_msgs.push_back({role, content}); + LOG("formatted: %s\n", formatted.c_str()); return formatted; } diff --git a/examples/pydantic_models_to_grammar_examples.py b/examples/pydantic_models_to_grammar_examples.py old mode 100644 new mode 100755 index 504ed98df..eb000d5cc --- a/examples/pydantic_models_to_grammar_examples.py +++ b/examples/pydantic_models_to_grammar_examples.py @@ -1,8 +1,15 @@ -# Function calling example using pydantic models. +#!/usr/bin/env python3 + +"""Function calling example using pydantic models.""" + from __future__ import annotations +import argparse import datetime import json +import logging +import textwrap +import sys from enum import Enum from typing import Optional, Union @@ -12,30 +19,54 @@ from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation) -# Function to get completion on the llama.cpp server with grammar. -def create_completion(prompt, grammar): +def create_completion(host, prompt, gbnf_grammar): + """Calls the /completion API on llama-server. + + See + https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints + """ + print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}") headers = {"Content-Type": "application/json"} - data = {"prompt": prompt, "grammar": grammar} - - response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data) - data = response.json() - + data = {"prompt": prompt, "grammar": gbnf_grammar} + result = requests.post(f"http://{host}/completion", headers=headers, json=data).json() assert data.get("error") is None, data - - print(data["content"]) - return data["content"] + logging.info("Result: %s", result) + content = result["content"] + print(f" Model: {result['model']}") + print(f" Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), ' ')}") + return content # A function for the agent to send a message to the user. class SendMessageToUser(BaseModel): - """ - Send a message to the User. - """ + """Send a message to the User.""" chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.") message: str = Field(..., description="Message you want to send to the user.") def run(self): - print(self.message) + print(f"SendMessageToUser: {self.message}") + + +def example_rce(host): + """Minimal test case where the LLM call an arbitrary python function.""" + print("- example_rce") + tools = [SendMessageToUser] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=tools, outer_object_name="function", + outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") + system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation + user_message = "What is 42 * 42?" 
+ prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + tools_map = {tool.__name__:tool for tool in tools} + # This finds "SendMessageToUser": + tool = tools_map.get(json_data["function"]) + if not tool: + print(f"Error: unknown tool {json_data['function']}") + return 1 + tool(**json_data["function_parameters"]).run() + return 0 # Enum for the calculator tool. @@ -46,11 +77,11 @@ class MathOperation(Enum): DIVIDE = "divide" -# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt. +# Simple pydantic calculator tool for the agent that can add, subtract, +# multiply, and divide. Docstring and description of fields will be used in +# system prompt. class Calculator(BaseModel): - """ - Perform a math operation on two numbers. - """ + """Perform a math operation on two numbers.""" number_one: Union[int, float] = Field(..., description="First number.") operation: MathOperation = Field(..., description="Math operation to perform.") number_two: Union[int, float] = Field(..., description="Second number.") @@ -68,55 +99,61 @@ class Calculator(BaseModel): raise ValueError("Unknown operation.") -# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM. -# pydantic_model_list is the list of pydanitc models -# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated -# outer_object_content is the name of outer object content. -# model_prefix is the optional prefix for models in the documentation. (Default="Output Model") -# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields") -gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( - pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function", - outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") +def example_calculator(host): + """Have the LLM ask to get a calculation done. -print(gbnf_grammar) -print(documentation) + Here the grammar gets generated by passing the available function models to + generate_gbnf_grammar_and_documentation function. This also generates a + documentation usable by the LLM. -system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation + pydantic_model_list is the list of pydantic models outer_object_name is an + optional name for an outer object around the actual model object. Like a + "function" object with "function_parameters" which contains the actual model + object. If None, no outer object will be generated outer_object_content is + the name of outer object content. -user_message = "What is 42 * 42?" 
-prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant" - -text = create_completion(prompt=prompt, grammar=gbnf_grammar) -# This should output something like this: -# { -# "function": "calculator", -# "function_parameters": { -# "number_one": 42, -# "operation": "multiply", -# "number_two": 42 -# } -# } -function_dictionary = json.loads(text) -if function_dictionary["function"] == "calculator": - function_parameters = {**function_dictionary["function_parameters"]} - - print(Calculator(**function_parameters).run()) - # This should output: 1764 + model_prefix is the optional prefix for models in the documentation. (Default="Output Model") + fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields") + """ + print("- example_calculator") + tools = [SendMessageToUser, Calculator] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=tools, outer_object_name="function", + outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") + system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation + user_message1 = "What is 42 * 42?" + prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + expected = { + "function": "Calculator", + "function_parameters": { + "number_one": 42, + "operation": "multiply", + "number_two": 42 + } + } + if json_data != expected: + print(" Result is not as expected!") + tools_map = {tool.__name__:tool for tool in tools} + # This finds "Calculator": + tool = tools_map.get(json_data["function"]) + if not tool: + print(f"Error: unknown tool {json_data['function']}") + return 1 + result = tool(**json_data["function_parameters"]).run() + print(f" Call {json_data['function']} gave result {result}") + return 0 -# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text. class Category(Enum): - """ - The category of the book. - """ + """The category of the book.""" Fiction = "Fiction" NonFiction = "Non-Fiction" class Book(BaseModel): - """ - Represents an entry about a book. - """ + """Represents an entry about a book.""" title: str = Field(..., description="Title of the book.") author: str = Field(..., description="Author of the book.") published_year: Optional[int] = Field(..., description="Publishing year of the book.") @@ -125,33 +162,42 @@ class Book(BaseModel): summary: str = Field(..., description="Summary of the book.") -# We need no additional parameters other than our list of pydantic models. -gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book]) +def example_struct(host): + """A example structured output based on pydantic models. -system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation + The LLM will create an entry for a Book database out of an unstructured + text. We need no additional parameters other than our list of pydantic + models. 
+    """
+    print("- example_struct")
+    tools = [Book]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools)
+    system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation
+    text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    # In this case, there's no function nor function_parameters.
+    # Here the result will vary based on the LLM used.
+    keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"])
+    if keys != sorted(json_data.keys()):
+        print(f"Unexpected result: {sorted(json_data.keys())}")
+        return 1
+    book = Book(**json_data)
+    print(f" As a Book object: %s" % book)
+    return 0

-text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands."""
-prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
-
-text = create_completion(prompt=prompt, grammar=gbnf_grammar)
-
-json_data = json.loads(text)
-
-print(Book(**json_data))
-# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.

 def get_current_datetime(output_format: Optional[str] = None):
-    """
-    Get the current date and time in the given format.
+    """Get the current date and time in the given format.
+
     Args:
         output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
     """
-    if output_format is None:
-        output_format = '%Y-%m-%d %H:%M:%S'
-    return datetime.datetime.now().strftime(output_format)
+    return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S")

-# Example function to get the weather
+# Example function to get the weather.
 def get_current_weather(location, unit):
     """Get the current weather in a given location"""
     if "London" in location:
@@ -160,68 +206,107 @@ def get_current_weather(location, unit):
         return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
     elif "North Pole" in location:
         return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
-    else:
-        return json.dumps({"location": location, "temperature": "unknown"})
+    return json.dumps({"location": location, "temperature": "unknown"})

-# Here is a function definition in OpenAI style
-current_weather_tool = {
-    "type": "function",
-    "function": {
-        "name": "get_current_weather",
-        "description": "Get the current weather in a given location",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "location": {
-                    "type": "string",
-                    "description": "The city and state, e.g. San Francisco, CA",
+def example_concurrent(host):
+    """An example for parallel function calling with a Python function, a pydantic
+    function model and an OpenAI like function definition.
+    """
+    print("- example_concurrent")
+    # Function definition in OpenAI style.
+    current_weather_tool = {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                 },
-                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                "required": ["location"],
             },
-            "required": ["location"],
         },
-    },
-}
+    }
+    # Convert OpenAI function definition into pydantic model.
+    current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
+    # Add the actual function to a pydantic model.
+    current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)

-# Convert OpenAI function definition into pydantic model
-current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
-# Add the actual function to a pydantic model
-current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
+    # Convert normal Python function to a pydantic model.
+    current_datetime_model = create_dynamic_model_from_function(get_current_datetime)

-# Convert normal Python function to a pydantic model
-current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
-
-tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
+    tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
+    gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+        pydantic_model_list=tools, outer_object_name="function",
+        outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
+    system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
+    text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
+    prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
+    text = create_completion(host, prompt, gbnf_grammar)
+    json_data = json.loads(text)
+    expected = [
+        {
+            "function": "get_current_datetime",
+            "params": {
+                "output_format": "%Y-%m-%d %H:%M:%S"
+            }
+        },
+        {
+            "function": "get_current_weather",
+            "params": {
+                "location": "London",
+                "unit": "celsius"
+            }
+        },
+        {
+            "function": "Calculator",
+            "params": {
+                "number_one": 42,
+                "operation": "multiply",
+                "number_two": 42
+            }
+        }
+    ]
+    res = 0
+    if json_data != expected:
+        print(" Result is not as expected!")
+        print(" This can happen on highly quantized models")
+        res = 1
+    tools_map = {tool.__name__:tool for tool in tools}
+    for call in json_data:
+        tool = tools_map.get(call["function"])
+        if not tool:
+            print(f"Error: unknown tool {call['function']}")
+            return 1
+        result = tool(**call["params"]).run()
+        print(f" Call {call['function']} returned {result}")
+    # Should output something like this:
+    # Call get_current_datetime returned 2024-07-15 09:50:38
+    # Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"}
+    # Call Calculator returned 1764
+    return res

-gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
-    pydantic_model_list=tool_list, outer_object_name="function",
-    outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
-
-system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
+def main():
+    parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
+    parser.add_argument("--host", default="localhost:8080", help="llama.cpp server")
+    parser.add_argument("-v", "--verbose", action="store_true", help="enables logging")
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR)
+    ret = 0
+    # Comment out below to only run the example you want.
+    ret = ret or example_rce(args.host)
+    ret = ret or example_calculator(args.host)
+    ret = ret or example_struct(args.host)
+    ret = ret or example_concurrent(args.host)
+    return ret

-text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
-prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
-
-text = create_completion(prompt=prompt, grammar=gbnf_grammar)
-
-json_data = json.loads(text)
-
-print(json_data)
-# Should output something like this:
-# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
-
-
-for call in json_data:
-    if call["function"] == "Calculator":
-        print(Calculator(**call["params"]).run())
-    elif call["function"] == "get_current_datetime":
-        print(current_datetime_model(**call["params"]).run())  # pyright: ignore[reportAttributeAccessIssue]
-    elif call["function"] == "get_current_weather":
-        print(current_weather_tool_model(**call["params"]).run())  # pyright: ignore[reportAttributeAccessIssue]
-# Should output something like this:
-# 2024-01-14 13:36:06
-# {"location": "London", "temperature": "42", "unit": "celsius"}
-# 1764
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 00c2277ac..d8afdc141 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -47,7 +47,7 @@ int main(int argc, char ** argv) {
     // save state (rng, logits, embedding and kv_cache) to file
     {
         std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
-        const size_t written = llama_state_get_data(ctx, state_mem.data());
+        const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size());

         FILE *fp_write = fopen("dump_state.bin", "wb");
         fwrite(state_mem.data(), 1, written, fp_write);
@@ -99,13 +99,16 @@ int main(int argc, char ** argv) {

     // load state (rng, logits, embedding and kv_cache) from file
     {
-        std::vector<uint8_t> state_mem(llama_state_get_size(ctx2));
+        std::vector<uint8_t> state_mem;

         FILE * fp_read = fopen("dump_state.bin", "rb");
+        fseek(fp_read, 0, SEEK_END);
+        state_mem.resize(ftell(fp_read));
+        fseek(fp_read, 0, SEEK_SET);

         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
         fclose(fp_read);

-        if (read != llama_state_set_data(ctx2, state_mem.data())) {
+        if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
             llama_free(ctx2);
             llama_free_model(model);
@@ -159,13 +162,16 @@ int main(int argc, char ** argv) {

     // load state (rng, logits, embedding and kv_cache) from file
     {
-        std::vector<uint8_t> state_mem(llama_state_get_size(ctx3));
+        std::vector<uint8_t> state_mem;

         FILE * fp_read = fopen("dump_state.bin", "rb");
+        fseek(fp_read, 0, SEEK_END);
+        state_mem.resize(ftell(fp_read));
+        fseek(fp_read, 0, SEEK_SET);

         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
         fclose(fp_read);

-        if (read != llama_state_set_data(ctx3, state_mem.data())) {
+        if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
             llama_free(ctx3);
             llama_free_model(model);
@@ -182,7 +188,7 @@ int main(int argc, char ** argv) {

     {
         // save kv of seq 0
         std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
-        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0);
+        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0);
         if (ncopy != seq_store.size()) {
             fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
             llama_free(ctx3);
@@ -196,7 +202,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s : kv cache cleared\n", __func__);

         // restore kv into seq 1
-        const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1);
+        const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1);
         if (nset != seq_store.size()) {
             fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
             llama_free(ctx3);
diff --git a/examples/server/README.md b/examples/server/README.md
index e477d1501..33a2b95cc 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -5,7 +5,7 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/
 Set of LLM REST APIs and a simple web front end to interact with llama.cpp.

 **Features:**
- * LLM inference of F16 and quantum models on GPU and CPU
+ * LLM inference of F16 and quantized models on GPU and CPU
  * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
  * Parallel decoding with multi-user support
  * Continuous batching
@@ -444,7 +444,7 @@ node index.js

     `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.

-    `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
+    `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
     By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.

     `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js
index 987b9a3b8..36818f764 100644
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -21,7 +21,7 @@ let generation_settings = null;
 //
 export async function* llama(prompt, params = {}, config = {}) {
   let controller = config.controller;
-  const api_url = config.api_url || "";
+  const api_url = config.api_url?.replace(/\/+$/, '') || "";

   if (!controller) {
     controller = new AbortController();
@@ -196,7 +196,7 @@ export const llamaComplete = async (params, controller, callback) => {

 // Get the model info from the server. This is useful for getting the context window and so on.
 export const llamaModelInfo = async (config = {}) => {
   if (!generation_settings) {
-    const api_url = config.api_url || "";
+    const api_url = config.api_url?.replace(/\/+$/, '') || "";
     const props = await fetch(`${api_url}/props`).then(r => r.json());
     generation_settings = props.default_generation_settings;
   }
diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html
index 5513e9121..c87dd8f1e 100644
--- a/examples/server/public/index-new.html
+++ b/examples/server/public/index-new.html
@@ -14,10 +14,10 @@