diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index 2a7da586a..61f671465 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -6,7 +6,7 @@ ARG CUDA_VERSION=11.7.1 # Target the CUDA build image ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -FROM ${BASE_CUDA_DEV_CONTAINER} as build +FROM ${BASE_CUDA_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. ARG CUDA_DOCKER_ARCH=all diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile index 5cbd2e7a1..680d1cb92 100644 --- a/.devops/full-rocm.Dockerfile +++ b/.devops/full-rocm.Dockerfile @@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6 # Target the CUDA build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete -FROM ${BASE_ROCM_DEV_CONTAINER} as build +FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile index 6f19afa9c..2a06f82b7 100644 --- a/.devops/full.Dockerfile +++ b/.devops/full.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=22.04 -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile index bff946cbc..8eda63a89 100644 --- a/.devops/llama-cli-cuda.Dockerfile +++ b/.devops/llama-cli-cuda.Dockerfile @@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER # Target the CUDA runtime image ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} -FROM ${BASE_CUDA_DEV_CONTAINER} as build +FROM ${BASE_CUDA_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. ARG CUDA_DOCKER_ARCH=all @@ -25,7 +25,7 @@ ENV GGML_CUDA=1 RUN make -j$(nproc) llama-cli -FROM ${BASE_CUDA_RUN_CONTAINER} as runtime +FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime RUN apt-get update && \ apt-get install -y libgomp1 diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile index bd816f9f5..79dba06a7 100644 --- a/.devops/llama-cli-intel.Dockerfile +++ b/.devops/llama-cli-intel.Dockerfile @@ -1,6 +1,6 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 -FROM intel/oneapi-basekit:$ONEAPI_VERSION as build +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build ARG GGML_SYCL_F16=OFF RUN apt-get update && \ @@ -14,10 +14,12 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ echo "GGML_SYCL_F16 is set" && \ export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ - cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ + echo "Building with static libs" && \ + cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \ + ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \ cmake --build build --config Release --target llama-cli -FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime COPY --from=build /app/build/bin/llama-cli /llama-cli diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile index caa507b08..c3d1ab067 100644 --- a/.devops/llama-cli-rocm.Dockerfile +++ b/.devops/llama-cli-rocm.Dockerfile @@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6 # Target the CUDA build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete -FROM ${BASE_ROCM_DEV_CONTAINER} as build +FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/llama-cli-vulkan.Dockerfile index 6155d5881..9b0dad8bf 100644 --- a/.devops/llama-cli-vulkan.Dockerfile +++ b/.devops/llama-cli-vulkan.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=jammy -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build # Install build tools RUN apt update && apt install -y git build-essential cmake wget libgomp1 diff --git a/.devops/llama-cli.Dockerfile b/.devops/llama-cli.Dockerfile index 38382bfc9..7f741aa46 100644 --- a/.devops/llama-cli.Dockerfile +++ b/.devops/llama-cli.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=22.04 -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ apt-get install -y build-essential git @@ -11,7 +11,7 @@ COPY . . RUN make -j$(nproc) llama-cli -FROM ubuntu:$UBUNTU_VERSION as runtime +FROM ubuntu:$UBUNTU_VERSION AS runtime RUN apt-get update && \ apt-get install -y libgomp1 diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile index d7eaa0925..67328cf1c 100644 --- a/.devops/llama-server-cuda.Dockerfile +++ b/.devops/llama-server-cuda.Dockerfile @@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER # Target the CUDA runtime image ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} -FROM ${BASE_CUDA_DEV_CONTAINER} as build +FROM ${BASE_CUDA_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. ARG CUDA_DOCKER_ARCH=all @@ -27,7 +27,7 @@ ENV LLAMA_CURL=1 RUN make -j$(nproc) llama-server -FROM ${BASE_CUDA_RUN_CONTAINER} as runtime +FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile index 8f8fef8c0..f525658dd 100644 --- a/.devops/llama-server-intel.Dockerfile +++ b/.devops/llama-server-intel.Dockerfile @@ -1,6 +1,6 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04 -FROM intel/oneapi-basekit:$ONEAPI_VERSION as build +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build ARG GGML_SYCL_F16=OFF RUN apt-get update && \ @@ -14,10 +14,11 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ echo "GGML_SYCL_F16 is set" && \ export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ fi && \ + echo "Building with dynamic libs" && \ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ cmake --build build --config Release --target llama-server -FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev curl diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile index af96c3325..763b4cd3f 100644 --- a/.devops/llama-server-rocm.Dockerfile +++ b/.devops/llama-server-rocm.Dockerfile @@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6 # Target the CUDA build image ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete -FROM ${BASE_ROCM_DEV_CONTAINER} as build +FROM ${BASE_ROCM_DEV_CONTAINER} AS build # Unless otherwise specified, we make a fat build. # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile index 49062f84b..13a61ffd8 100644 --- a/.devops/llama-server-vulkan.Dockerfile +++ b/.devops/llama-server-vulkan.Dockerfile @@ -1,6 +1,6 @@ ARG UBUNTU_VERSION=jammy -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build # Install build tools RUN apt update && apt install -y git build-essential cmake wget diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index a53a5c999..ff558604e 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -1,9 +1,9 @@ ARG UBUNTU_VERSION=22.04 -FROM ubuntu:$UBUNTU_VERSION as build +FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ - apt-get install -y build-essential git libcurl4-openssl-dev curl + apt-get install -y build-essential git libcurl4-openssl-dev WORKDIR /app @@ -13,10 +13,10 @@ ENV LLAMA_CURL=1 RUN make -j$(nproc) llama-server -FROM ubuntu:$UBUNTU_VERSION as runtime +FROM ubuntu:$UBUNTU_VERSION AS runtime RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev libgomp1 + apt-get install -y libcurl4-openssl-dev libgomp1 curl COPY --from=build /app/llama-server /llama-server diff --git a/.devops/nix/apps.nix b/.devops/nix/apps.nix index 897fce4d3..0ecf19fc5 100644 --- a/.devops/nix/apps.nix +++ b/.devops/nix/apps.nix @@ -10,7 +10,6 @@ "llama-embedding" "llama-server" "llama-quantize" - "llama-train-text-from-scratch" ]; mkApp = name: { type = "app"; diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 911c42ecb..a87423c71 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -126,16 +126,9 @@ let ++ optionals useMetalKit [ MetalKit ]; cudaBuildInputs = with cudaPackages; [ - cuda_cccl.dev # - - # A temporary hack for reducing the closure size, remove once cudaPackages - # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792 - cuda_cudart.dev - cuda_cudart.lib - cuda_cudart.static - libcublas.dev - libcublas.lib - libcublas.static + cuda_cudart + cuda_cccl # + libcublas ]; rocmBuildInputs = with rocmPackages; [ diff --git a/.devops/tools.sh b/.devops/tools.sh index cf0e8f32d..24dcfd350 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then ./llama-quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then ./llama-cli "$@" -elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then - ./llama-finetune "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." for i in `ls $1/$2/ggml-model-f16.bin*`; do @@ -36,8 +34,6 @@ else echo " ex: --outtype f16 \"/models/7B/\" " echo " --quantize (-q): Optimize with quantization process ggml" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" - echo " --finetune (-f): Run finetune command to create a lora finetune of the model" - echo " See documentation for finetune for command-line parameters" echo " --all-in-one (-a): Execute --convert & --quantize" echo " ex: \"/models/\" 7B" echo " --server (-s): Run a model on the server" diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1f275603a..b9246659a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -860,6 +860,7 @@ jobs: mkdir build cd build cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON + cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} - name: Determine tag name diff --git a/.gitignore b/.gitignore index 7c7dee0c6..5ae030200 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,7 @@ build* !docs/build.md /libllama.so /llama-* +/vulkan-shaders-gen android-ndk-* arm_neon.h cmake-build-* @@ -78,7 +79,6 @@ models-mnt !models/ggml-vocab-*.gguf* # Zig - zig-out/ zig-cache/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 67dcf86d4..a31320635 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,7 @@ llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) +llama_option_depr(WARNING LLAMA_CANN GGML_CANN) # # build the library @@ -138,7 +139,8 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o # determining _precisely_ which defines are necessary for the llama-config # package. # -get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS) +get_target_property(GGML_DIRECTORY ggml SOURCE_DIR) +get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS) get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS) set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES}) get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9ad516049..a9e000e52 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,13 +1,18 @@ -# Pull requests +# Pull requests (for contributors) -- Always squash-merge the PR before merging -- Use the following format for your final commit: ` : (#)`. For example: `utils : fix typo in utils.py (#1234)` - Test your changes: - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library - Execute [the full CI locally on your machine](ci/README.md) before publishing -- If the pull request contains only documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs. - - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience + - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience +- Consider allowing write access to your branch for faster review +- If your PR becomes stale, don't hesitate to ping the maintainers in the comments + +# Pull requests (for collaborators) + +- Squash-merge PRs +- Use the following format for the squashed commit title: ` : (#)`. For example: `utils : fix typo in utils.py (#1234)` +- Optionally, pick a `` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules # Coding guidelines diff --git a/Makefile b/Makefile index bec332ec5..649671ed6 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,6 @@ BUILD_TARGETS = \ llama-embedding \ llama-eval-callback \ llama-export-lora \ - llama-finetune \ llama-gbnf-validator \ llama-gguf \ llama-gguf-hash \ @@ -20,6 +19,7 @@ BUILD_TARGETS = \ llama-imatrix \ llama-infill \ llama-llava-cli \ + llama-minicpmv-cli\ llama-lookahead \ llama-lookup \ llama-lookup-create \ @@ -37,7 +37,6 @@ BUILD_TARGETS = \ llama-simple \ llama-speculative \ llama-tokenize \ - llama-train-text-from-scratch \ llama-vdot \ llama-cvector-generator \ tests/test-c.o @@ -64,13 +63,13 @@ TEST_TARGETS = \ tests/test-tokenizer-1-spm # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned -LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ +LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ - retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm + retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them. # We don't want to clutter things too much, so we only build replacements for the most commonly used binaries. -LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune +LEGACY_TARGETS_BUILD = main quantize perplexity embedding server # Deprecation aliases ifdef LLAMA_CUBLAS @@ -327,9 +326,9 @@ ifdef LLAMA_DEBUG endif else MK_CPPFLAGS += -DNDEBUG - MK_CFLAGS += -O3 - MK_CXXFLAGS += -O3 - MK_NVCCFLAGS += -O3 + MK_CFLAGS += -O3 -g + MK_CXXFLAGS += -O3 -g + MK_NVCCFLAGS += -O3 -g endif ifdef LLAMA_SANITIZE_THREAD @@ -530,10 +529,21 @@ ifndef GGML_NO_ACCELERATE endif endif # GGML_NO_ACCELERATE +ifdef GGML_MUSA + CC := clang + CXX := clang++ + GGML_CUDA := 1 + MK_CPPFLAGS += -DGGML_USE_MUSA +endif + ifndef GGML_NO_OPENMP MK_CPPFLAGS += -DGGML_USE_OPENMP MK_CFLAGS += -fopenmp MK_CXXFLAGS += -fopenmp + ifdef GGML_MUSA + MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp + MK_LDFLAGS += -L/usr/lib/llvm-10/lib + endif # GGML_MUSA endif # GGML_NO_OPENMP ifdef GGML_OPENBLAS @@ -584,15 +594,27 @@ else endif # GGML_CUDA_FA_ALL_QUANTS ifdef GGML_CUDA - ifneq ('', '$(wildcard /opt/cuda)') - CUDA_PATH ?= /opt/cuda - else - CUDA_PATH ?= /usr/local/cuda - endif + ifdef GGML_MUSA + ifneq ('', '$(wildcard /opt/musa)') + CUDA_PATH ?= /opt/musa + else + CUDA_PATH ?= /usr/local/musa + endif - MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS - MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib - MK_NVCCFLAGS += -use_fast_math + MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include + MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64 + MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22 + else + ifneq ('', '$(wildcard /opt/cuda)') + CUDA_PATH ?= /opt/cuda + else + CUDA_PATH ?= /usr/local/cuda + endif + + MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS + MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib + MK_NVCCFLAGS += -use_fast_math + endif # GGML_MUSA OBJ_GGML += ggml/src/ggml-cuda.o OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) @@ -602,9 +624,11 @@ ifdef LLAMA_FATAL_WARNINGS MK_NVCCFLAGS += -Werror all-warnings endif # LLAMA_FATAL_WARNINGS +ifndef GGML_MUSA ifndef JETSON_EOL_MODULE_DETECT MK_NVCCFLAGS += --forward-unknown-to-host-compiler endif # JETSON_EOL_MODULE_DETECT +endif # GGML_MUSA ifdef LLAMA_DEBUG MK_NVCCFLAGS += -lineinfo @@ -617,8 +641,12 @@ endif # GGML_CUDA_DEBUG ifdef GGML_CUDA_NVCC NVCC = $(CCACHE) $(GGML_CUDA_NVCC) else - NVCC = $(CCACHE) nvcc -endif #GGML_CUDA_NVCC + ifdef GGML_MUSA + NVCC = $(CCACHE) mcc + else + NVCC = $(CCACHE) nvcc + endif # GGML_MUSA +endif # GGML_CUDA_NVCC ifdef CUDA_DOCKER_ARCH MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) @@ -689,9 +717,15 @@ define NVCC_COMPILE $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ endef # NVCC_COMPILE else + ifdef GGML_MUSA +define NVCC_COMPILE + $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@ +endef # NVCC_COMPILE + else define NVCC_COMPILE $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ endef # NVCC_COMPILE + endif # GGML_MUSA endif # JETSON_EOL_MODULE_DETECT ggml/src/ggml-cuda/%.o: \ @@ -795,6 +829,14 @@ ifdef GGML_CUDA_FORCE_DMMV HIPFLAGS += -DGGML_CUDA_FORCE_DMMV endif # GGML_CUDA_FORCE_DMMV +ifdef GGML_CUDA_FORCE_MMQ + HIPFLAGS += -DGGML_CUDA_FORCE_MMQ +endif # GGML_CUDA_FORCE_MMQ + +ifdef GGML_CUDA_FORCE_CUBLAS + HIPFLAGS += -DGGML_CUDA_FORCE_CUBLAS +endif # GGML_CUDA_FORCE_CUBLAS + ifdef GGML_CUDA_NO_PEER_COPY HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY endif # GGML_CUDA_NO_PEER_COPY @@ -847,15 +889,16 @@ ggml/src/ggml-metal-embed.o: \ ggml/src/ggml-common.h @echo "Embedding Metal library" @sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal - $(eval TEMP_ASSEMBLY=$(shell mktemp)) - @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) - @echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY) - @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) - @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) - @$(AS) $(TEMP_ASSEMBLY) -o $@ - @rm -f ${TEMP_ASSEMBLY} + $(eval TEMP_ASSEMBLY=$(shell mktemp -d)) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + $(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@ + @rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s + @rmdir ${TEMP_ASSEMBLY} endif endif # GGML_METAL @@ -868,6 +911,9 @@ OBJ_GGML += \ OBJ_LLAMA = \ src/llama.o \ + src/llama-vocab.o \ + src/llama-grammar.o \ + src/llama-sampling.o \ src/unicode.o \ src/unicode-data.o @@ -935,6 +981,7 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1)) ifdef GGML_CUDA $(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') +ifndef GGML_MUSA ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) ifndef CUDA_DOCKER_ARCH @@ -944,6 +991,7 @@ endif # CUDA_POWER_ARCH endif # CUDA_DOCKER_ARCH endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) +endif # GGML_MUSA endif # GGML_CUDA $(info ) @@ -1047,6 +1095,10 @@ src/unicode-data.o: \ src/llama.o: \ src/llama.cpp \ + src/llama-impl.h \ + src/llama-vocab.h \ + src/llama-grammar.h \ + src/llama-sampling.h \ src/unicode.h \ include/llama.h \ ggml/include/ggml-cuda.h \ @@ -1056,6 +1108,29 @@ src/llama.o: \ ggml/include/ggml-backend.h $(CXX) $(CXXFLAGS) -c $< -o $@ +src/llama-vocab.o: \ + src/llama-vocab.cpp \ + src/llama-vocab.h \ + src/llama-impl.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +src/llama-grammar.o: \ + src/llama-grammar.cpp \ + src/llama-grammar.h \ + src/llama-impl.h \ + src/llama-vocab.h \ + src/llama-sampling.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +src/llama-sampling.o: \ + src/llama-sampling.cpp \ + src/llama-sampling.h \ + src/llama-impl.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + $(LIB_LLAMA): \ $(OBJ_LLAMA) \ $(LIB_GGML) @@ -1132,6 +1207,7 @@ clean: rm -rvf ggml/*.dll rm -rvf ggml/*.so rm -vrf ggml/src/*.o + rm -rvf ggml/src/llamafile/*.o rm -rvf common/build-info.cpp rm -vrf ggml/src/ggml-metal-embed.metal rm -vrf ggml/src/ggml-cuda/*.o @@ -1258,11 +1334,6 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ $(OBJ_GGML) $(OBJ_LLAMA) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) @@ -1278,13 +1349,8 @@ llama-baby-llama: examples/baby-llama/baby-llama.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -llama-finetune: examples/finetune/finetune.cpp \ - $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - llama-export-lora: examples/export-lora/export-lora.cpp \ - $(OBJ_GGML) common/log.h + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1388,15 +1454,20 @@ libllava.a: examples/llava/llava.cpp \ $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual llama-llava-cli: examples/llava/llava-cli.cpp \ - examples/llava/clip.h \ - examples/llava/clip.cpp \ - examples/llava/llava.h \ examples/llava/llava.cpp \ + examples/llava/llava.h \ + examples/llava/clip.cpp \ + examples/llava/clip.h \ $(OBJ_ALL) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual - $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) - $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual + +llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \ + examples/llava/llava.cpp \ + examples/llava/llava.h \ + examples/llava/clip.cpp \ + examples/llava/clip.h \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual ifeq ($(UNAME_S),Darwin) swift: examples/batched.swift @@ -1431,7 +1502,7 @@ run-benchmark-matmult: llama-benchmark-matmult .PHONY: run-benchmark-matmult swift tests/test-llama-grammar: tests/test-llama-grammar.cpp \ - $(OBJ_GGML) $(OBJ_COMMON) src/unicode.o src/unicode-data.o + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1540,56 +1611,45 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \ # Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed. # # Mark legacy binary targets as .PHONY so that they are always checked. -.PHONY: main quantize perplexity embedding server finetune +.PHONY: main quantize perplexity embedding server + +# Define the object file target +examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate. # Eventually we will want to remove these target from building all the time. -main: examples/deprecation-warning/deprecation-warning.cpp - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +main: examples/deprecation-warning/deprecation-warning.o + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead." -server: examples/deprecation-warning/deprecation-warning.cpp - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +server: examples/deprecation-warning/deprecation-warning.o + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead." -quantize: examples/deprecation-warning/deprecation-warning.cpp +quantize: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard quantize)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead." @echo " Remove the 'quantize' binary to remove this warning." @echo "#########" endif -perplexity: examples/deprecation-warning/deprecation-warning.cpp +perplexity: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard perplexity)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead." @echo " Remove the 'perplexity' binary to remove this warning." @echo "#########" endif -embedding: examples/deprecation-warning/deprecation-warning.cpp +embedding: examples/deprecation-warning/deprecation-warning.o ifneq (,$(wildcard embedding)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS) @echo "#########" @echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead." @echo " Remove the 'embedding' binary to remove this warning." @echo "#########" endif - -finetune: examples/deprecation-warning/deprecation-warning.cpp -ifneq (,$(wildcard finetune)) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - @echo "#########" - @echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead." - @echo " Remove the 'finetune' binary to remove this warning." - @echo "#########" -endif diff --git a/Package.swift b/Package.swift index d40a48385..1d90b47bf 100644 --- a/Package.swift +++ b/Package.swift @@ -4,6 +4,9 @@ import PackageDescription var sources = [ "src/llama.cpp", + "src/llama-vocab.cpp", + "src/llama-grammar.cpp", + "src/llama-sampling.cpp", "src/unicode.cpp", "src/unicode-data.cpp", "ggml/src/ggml.c", diff --git a/README.md b/README.md index 058919068..1283f6805 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) -[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) +[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) [![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) @@ -95,8 +95,16 @@ Typically finetunes of the base models below are supported as well. - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [OLMo](https://allenai.org/olmo) +- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330) - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia) +- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520) +- [x] [Smaug](https://huggingface.co/models?search=Smaug) +- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B) +- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM) +- [x] [Flan T5](https://huggingface.co/models?search=flan-t5) +- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca) - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) +- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) @@ -138,12 +146,14 @@ Typically finetunes of the base models below are supported as well. Unless otherwise noted these projects are open-source with permissive licensing: +- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) - [iohub/collama](https://github.com/iohub/coLLaMA) - [janhq/jan](https://github.com/janhq/jan) (AGPL) - [nat/openplayground](https://github.com/nat/openplayground) - [Faraday](https://faraday.dev/) (proprietary) - [LMStudio](https://lmstudio.ai/) (proprietary) - [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary) +- [ramalama](https://github.com/containers/ramalama) (MIT) - [LocalAI](https://github.com/mudler/LocalAI) (MIT) - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) @@ -181,6 +191,9 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp +**Games:** +- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. + ## Demo
@@ -405,6 +418,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md) | [BLAS](./docs/build.md#blas-build) | All | | [BLIS](./docs/backend/BLIS.md) | All | | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU | +| [MUSA](./docs/build.md#musa) | Moore Threads GPU | | [CUDA](./docs/build.md#cuda) | Nvidia GPU | | [hipBLAS](./docs/build.md#hipblas) | AMD GPU | | [Vulkan](./docs/build.md#vulkan) | GPU | diff --git a/common/common.cpp b/common/common.cpp index 1e5fc30dd..560e20d08 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -684,21 +684,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--lora") { CHECK_ARG - params.lora_adapter.emplace_back(argv[i], 1.0f); - params.use_mmap = false; + params.lora_adapters.push_back({ + std::string(argv[i]), + 1.0, + }); return true; } if (arg == "--lora-scaled") { CHECK_ARG - const char* lora_adapter = argv[i]; + std::string lora_adapter = argv[i]; CHECK_ARG - params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); - params.use_mmap = false; + params.lora_adapters.push_back({ + lora_adapter, + std::stof(argv[i]), + }); return true; } - if (arg == "--lora-base") { - CHECK_ARG - params.lora_base = argv[i]; + if (arg == "--lora-init-without-apply") { + params.lora_init_without_apply = true; return true; } if (arg == "--control-vector") { @@ -797,6 +800,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.cont_batching = true; return true; } + if (arg == "-nocb" || arg == "--no-cont-batching") { + params.cont_batching = false; + return true; + } if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; return true; @@ -1272,6 +1279,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa CHECK_ARG params.out_file = argv[i]; params.cvector_outfile = argv[i]; + params.lora_outfile = argv[i]; return true; } if (arg == "-ofreq" || arg == "--output-frequency") { @@ -1326,6 +1334,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa else { invalid_param = true; } return true; } + if (arg == "--no-warmup") { + params.warmup = false; + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1448,6 +1460,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); + options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" }); options.push_back({ "server infill", " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" }); @@ -1538,6 +1551,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel }); options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences }); options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" }); + options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" }); options.push_back({ "multi-modality" }); options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); @@ -1580,9 +1594,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); - options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" }); - options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" }); - options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" }); + options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" }); + options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); options.push_back({ "*", " --control-vector FNAME", "add a control vector\n" "note: this argument can be repeated to add multiple control vectors" }); options.push_back({ "*", " --control-vector-scaled FNAME SCALE", @@ -1631,7 +1644,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); - options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" }); + options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" }); options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); @@ -1651,6 +1664,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY", "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); + options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"}); #ifndef LOG_DISABLE_LOGS options.push_back({ "logging" }); @@ -1673,6 +1687,13 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" }); + options.push_back({ "export-lora" }); + options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() }); + options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" }); + options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); + options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads }); + options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() }); + printf("usage: %s [options]\n", argv[0]); for (const auto & o : options) { @@ -1756,6 +1777,17 @@ std::string string_get_sortable_timestamp() { return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); } +void string_replace_all(std::string & s, const std::string & search, const std::string & replace) { + if (search.empty()) { + return; // Avoid infinite loop if 'search' is an empty string + } + size_t pos = 0; + while ((pos = s.find(search, pos)) != std::string::npos) { + s.replace(pos, search.length(), replace); + pos += replace.length(); + } +} + void string_process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -2029,8 +2061,8 @@ std::string fs_get_cache_file(const std::string & filename) { // // Model utils // - -std::tuple llama_init_from_gpt_params(gpt_params & params) { +struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { + llama_init_result iparams; auto mparams = llama_model_params_from_gpt_params(params); llama_model * model = nullptr; @@ -2045,7 +2077,7 @@ std::tuple llama_init_from_gpt_par if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); - return std::make_tuple(nullptr, nullptr); + return iparams; } auto cparams = llama_context_params_from_gpt_params(params); @@ -2054,7 +2086,7 @@ std::tuple llama_init_from_gpt_par if (lctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } if (!params.control_vectors.empty()) { @@ -2065,7 +2097,7 @@ std::tuple llama_init_from_gpt_par if (cvec.n_embd == -1) { llama_free(lctx); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } int err = llama_control_vector_apply(lctx, @@ -2077,26 +2109,26 @@ std::tuple llama_init_from_gpt_par if (err) { llama_free(lctx); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } } - for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { - const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); - float lora_scale = std::get<1>(params.lora_adapter[i]); - int err = llama_model_apply_lora_from_file(model, - lora_adapter.c_str(), - lora_scale, - ((i > 0) || params.lora_base.empty()) - ? NULL - : params.lora_base.c_str(), - params.n_threads); - if (err != 0) { - fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); + // load and optionally apply lora adapters + for (auto & la : params.lora_adapters) { + llama_lora_adapter_container loaded_la; + loaded_la.path = la.path; + loaded_la.scale = la.scale; + loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); + if (loaded_la.adapter == nullptr) { + fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); llama_free(lctx); llama_free_model(model); - return std::make_tuple(nullptr, nullptr); + return iparams; } + iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters + } + if (!params.lora_init_without_apply) { + llama_lora_adapters_apply(lctx, iparams.lora_adapters); } if (params.ignore_eos) { @@ -2130,7 +2162,18 @@ std::tuple llama_init_from_gpt_par llama_reset_timings(lctx); } - return std::make_tuple(model, lctx); + iparams.model = model; + iparams.context = lctx; + return iparams; +} + +void llama_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters) { + llama_lora_adapter_clear(ctx); + for (auto & la : lora_adapters) { + if (la.scale != 0.0f) { + llama_lora_adapter_set(ctx, la.adapter, la.scale); + } + } } struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) { @@ -2723,7 +2766,7 @@ std::string llama_chat_format_single(const struct llama_model * model, const llama_chat_msg & new_msg, bool add_ass) { std::ostringstream ss; - auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false); + auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false); std::vector chat_new(past_msg); // if the past_msg ends with a newline, we must preserve it in the formatted version if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') { @@ -3155,20 +3198,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l } fprintf(stream, "lora:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) != 1.0f) { - continue; + for (auto & la : params.lora_adapters) { + if (la.scale == 1.0f) { + fprintf(stream, " - %s\n", la.path.c_str()); } - fprintf(stream, " - %s\n", std::get<0>(la).c_str()); } fprintf(stream, "lora_scaled:\n"); - for (std::tuple la : params.lora_adapter) { - if (std::get<1>(la) == 1.0f) { - continue; + for (auto & la : params.lora_adapters) { + if (la.scale != 1.0f) { + fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale); } - fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); } - fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); + fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false"); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); diff --git a/common/common.h b/common/common.h index 184a53dc0..bbc33a499 100644 --- a/common/common.h +++ b/common/common.h @@ -33,6 +33,15 @@ #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" +struct llama_lora_adapter_info { + std::string path; + float scale; +}; + +struct llama_lora_adapter_container : llama_lora_adapter_info { + struct llama_lora_adapter * adapter; +}; + // build info extern int LLAMA_BUILD_NUMBER; extern char const * LLAMA_COMMIT; @@ -126,9 +135,8 @@ struct gpt_params { std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector kv_overrides; - // TODO: avoid tuple, use struct - std::vector> lora_adapter; // lora adapter path with user defined scale - std::string lora_base = ""; // base model path for the lora adapter + bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply) + std::vector lora_adapters; // lora adapter path with user defined scale std::vector control_vectors; // control vector with user defined scale @@ -255,6 +263,8 @@ struct gpt_params { std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; bool spm_infill = false; // suffix/prefix/middle pattern for infill + + std::string lora_outfile = "ggml-lora-merged-f16.gguf"; }; void gpt_params_handle_hf_token(gpt_params & params); @@ -276,6 +286,8 @@ std::vector string_split(std::string input, char separator); std::string string_strip(const std::string & str); std::string string_get_sortable_timestamp(); +void string_replace_all(std::string & s, const std::string & search, const std::string & replace); + template static std::vector string_split(const std::string & str, char delim) { std::vector values; @@ -307,8 +319,13 @@ std::string fs_get_cache_file(const std::string & filename); // Model utils // -// TODO: avoid tuplue, use struct -std::tuple llama_init_from_gpt_params(gpt_params & params); +struct llama_init_result { + struct llama_model * model = nullptr; + struct llama_context * context = nullptr; + std::vector lora_adapters; +}; + +struct llama_init_result llama_init_from_gpt_params(gpt_params & params); struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); @@ -316,6 +333,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params); +// clear LoRA adapters from context, then apply new list of adapters +void llama_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); + // Batch utils void llama_batch_clear(struct llama_batch & batch); diff --git a/common/ngram-cache.h b/common/ngram-cache.h index e4fa4cbd1..ab4c9b376 100644 --- a/common/ngram-cache.h +++ b/common/ngram-cache.h @@ -37,11 +37,18 @@ struct llama_ngram { } }; +struct llama_token_hash_function { + size_t operator()(const llama_token token) const { + // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ + return token * 11400714819323198485llu; + } +}; + struct llama_ngram_hash_function { size_t operator()(const llama_ngram & ngram) const { - size_t hash = 0; - for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { - hash ^= std::hash{}(ngram.tokens[i]); + size_t hash = llama_token_hash_function{}(ngram.tokens[0]); + for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) { + hash ^= llama_token_hash_function{}(ngram.tokens[i]); } return hash; } diff --git a/common/sampling.cpp b/common/sampling.cpp index 6a483c815..079e40516 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -330,7 +330,7 @@ static llama_token llama_sampling_sample_impl( llama_token_data_array single_token_data_array = { &single_token_data, 1, false }; // Apply grammar constraints to the single token - llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar); + llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array); // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY bool is_valid = single_token_data_array.data[0].logit != -INFINITY; @@ -421,7 +421,7 @@ static llama_token_data_array llama_sampling_prepare_impl( // apply grammar checks before sampling logic if (apply_grammar && ctx_sampling->grammar != NULL) { - llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); + llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p); } return cur_p; @@ -455,6 +455,6 @@ void llama_sampling_accept( ctx_sampling->prev.push_back(id); if (ctx_sampling->grammar != NULL && apply_grammar) { - llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id); + llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id); } } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 42dace219..7136db440 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -48,34 +48,39 @@ class Model: dir_model: Path ftype: gguf.LlamaFileType + fname_out: Path is_big_endian: bool endianess: gguf.GGUFEndian use_temp_file: bool lazy: bool - model_name: str | None part_names: list[str] is_safetensors: bool hparams: dict[str, Any] block_count: int tensor_map: gguf.TensorNameMap tensor_names: set[str] | None - fname_out: Path gguf_writer: gguf.GGUFWriter + model_name: str | None + metadata_override: Path | None + dir_model_card: Path # subclasses should define this! model_arch: gguf.MODEL_ARCH - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, - model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, + use_temp_file: bool = False, eager: bool = False, + metadata_override: Path | None = None, model_name: str | None = None, + split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") + self.dir_model = dir_model self.ftype = ftype + self.fname_out = fname_out self.is_big_endian = is_big_endian self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.use_temp_file = use_temp_file self.lazy = not eager - self.model_name = model_name self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors") self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: @@ -84,6 +89,11 @@ class Model: self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None + self.metadata_override = metadata_override + self.model_name = model_name + self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + + # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. _, first_tensor = next(self.get_tensors()) @@ -93,10 +103,8 @@ class Model: else: logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") self.ftype = gguf.LlamaFileType.MOSTLY_BF16 - ftype_up: str = self.ftype.name.partition("_")[2].upper() - ftype_lw: str = ftype_up.lower() - # allow templating the file name with the output ftype, useful with the "auto" ftype - self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) + + # Configure GGUF Writer self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file, split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) @@ -148,9 +156,16 @@ class Model: tensor_names_from_parts.update(model_part.keys()) for name in model_part.keys(): - data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] - if self.lazy: - data = LazyTorchTensor.from_eager(data) + if self.is_safetensors: + if self.lazy: + data = model_part.get_slice(name) + data = LazyTorchTensor.from_safetensors_slice(data) + else: + data = model_part.get_tensor(name) + else: + data = model_part[name] + if self.lazy: + data = LazyTorchTensor.from_eager(data) yield name, data # only verify tensor name presence; it doesn't matter if they are not in the right files @@ -186,7 +201,6 @@ class Model: return new_name def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.block_count) if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: @@ -225,6 +239,10 @@ class Model: self.gguf_writer.add_expert_used_count(n_experts_used) logger.info(f"gguf: experts used count = {n_experts_used}") + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + self.gguf_writer.add_file_type(self.ftype) logger.info(f"gguf: file type = {self.ftype}") @@ -233,17 +251,12 @@ class Model: return [(self.map_tensor_name(name), data_torch)] - def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: del name, new_name, bid, n_dims # unused return False - def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del name, new_name, bid, n_dims # unused - - return False - - def write_tensors(self): + def prepare_tensors(self): max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") for name, data_torch in self.get_tensors(): @@ -267,55 +280,47 @@ class Model: for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): data: np.ndarray # type hint n_dims = len(data.shape) - data_dtype = data.dtype - data_qtype: gguf.GGMLQuantizationType | None = None - - # when both are True, f32 should win - extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims) - extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims) + data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims) # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors - # Conditions should closely match those in llama_model_quantize_internal in llama.cpp - extra_f32 = any(cond for cond in ( - extra_f32, - n_dims == 1, - new_name.endswith("_norm.weight"), - )) - - # Some tensor types are always in float32 - extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in ( - gguf.MODEL_TENSOR.FFN_GATE_INP, - gguf.MODEL_TENSOR.POS_EMBD, - gguf.MODEL_TENSOR.TOKEN_TYPES, - )) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - extra_f16 = any(cond for cond in ( - extra_f16, - (name.endswith(".weight") and n_dims >= 2), - )) - - if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32: - if self.ftype == gguf.LlamaFileType.MOSTLY_BF16: - data = gguf.quantize_bf16(data) - assert data.dtype == np.int16 - data_qtype = gguf.GGMLQuantizationType.BF16 - - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data): - data = gguf.quantize_q8_0(data) - assert data.dtype == np.uint8 - data_qtype = gguf.GGMLQuantizationType.Q8_0 - - else: # default to float16 for quantized tensors - if data_dtype != np.float16: - data = data.astype(np.float16) - data_qtype = gguf.GGMLQuantizationType.F16 - - if data_qtype is None: # by default, convert to float32 - if data_dtype != np.float32: - data = data.astype(np.float32) + if n_dims <= 1 or new_name.endswith("_norm.weight"): data_qtype = gguf.GGMLQuantizationType.F32 + # Conditions should closely match those in llama_model_quantize_internal in llama.cpp + # Some tensor types are always in float32 + if data_qtype is False and ( + any( + self.match_model_tensor_name(new_name, key, bid) + for key in ( + gguf.MODEL_TENSOR.FFN_GATE_INP, + gguf.MODEL_TENSOR.POS_EMBD, + gguf.MODEL_TENSOR.TOKEN_TYPES, + ) + ) + or not name.endswith(".weight") + ): + data_qtype = gguf.GGMLQuantizationType.F32 + + # No override (data_qtype is False), or wants to be quantized (data_qtype is True) + if isinstance(data_qtype, bool): + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + data_qtype = gguf.GGMLQuantizationType.Q8_0 + else: + raise ValueError(f"Unknown file type: {self.ftype.name}") + + try: + data = gguf.quants.quantize(data, data_qtype) + except gguf.QuantError as e: + logger.warning("%s, %s", e, "falling back to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) + shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape # reverse shape to make it similar to the internal ggml dimension order @@ -326,9 +331,62 @@ class Model: self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.MODEL) + + def prepare_metadata(self, vocab_only: bool): + + total_params, shared_params, expert_params, expert_count = self.gguf_writer.get_total_parameter_count() + + self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params) + + # Fallback to model directory name if metadata name is still missing + if self.metadata.name is None: + self.metadata.name = self.dir_model.name + + # Generate parameter weight class (useful for leader boards) if not yet determined + if self.metadata.size_label is None and total_params > 0: + self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) + + # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + output_type: str = self.ftype.name.partition("_")[2] + + # Filename Output + if self.fname_out.is_dir(): + # Generate default filename based on model specification and available metadata + if not vocab_only: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) + else: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") + + # Use the default filename + self.fname_out = self.fname_out / f"{fname_default}.gguf" + else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + + # Process templated file name with the output ftype, useful with the "auto" ftype + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) + + self.set_type() + + logger.info("Set meta model") + self.metadata.set_gguf_meta_model(self.gguf_writer) + + logger.info("Set model parameters") + self.set_gguf_parameters() + + logger.info("Set model tokenizer") + self.set_vocab() + + logger.info("Set model quantization version") + self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + def write(self): - self.write_tensors() - self.gguf_writer.write_header_to_file(self.fname_out) + self.prepare_tensors() + self.prepare_metadata(vocab_only=False) + self.gguf_writer.write_header_to_file(path=self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.close() @@ -336,7 +394,9 @@ class Model: def write_vocab(self): if len(self.gguf_writer.tensors) != 1: raise ValueError('Splitting the vocabulary is not supported') - self.gguf_writer.write_header_to_file(self.fname_out) + + self.prepare_metadata(vocab_only=True) + self.gguf_writer.write_header_to_file(path=self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.close() @@ -521,6 +581,15 @@ class Model: if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": # ref: https://huggingface.co/core42/jais-13b res = "jais" + if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f": + # ref: https://huggingface.co/WisdomShell/CodeShell-7B + res = "codeshell" + if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e": + # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407 + res = "tekken" + if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": + # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M + res = "smollm" if res is None: logger.warning("\n") @@ -661,7 +730,7 @@ class Model: added_tokens_json = json.load(f) for key in added_tokens_json: token_id = added_tokens_json[key] - if (token_id >= vocab_size): + if token_id >= vocab_size: logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue @@ -678,7 +747,8 @@ class Model: token_id = int(token_id) token: str = token_data["content"] if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token.encode("utf-8") + if tokens[token_id] != token.encode("utf-8"): + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}') if token_data.get("special") or self.does_token_look_special(token): toktypes[token_id] = SentencePieceTokenTypes.CONTROL else: @@ -773,7 +843,6 @@ class GPTNeoXModel(Model): def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -829,7 +898,6 @@ class BloomModel(Model): model_arch = gguf.MODEL_ARCH.BLOOM def set_gguf_parameters(self): - self.gguf_writer.add_name("Bloom") n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) @@ -906,7 +974,6 @@ class MPTModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) self.gguf_writer.add_embedding_length(self.hparams["d_model"]) self.gguf_writer.add_block_count(block_count) @@ -945,7 +1012,6 @@ class OrionModel(Model): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") ctx_length = 0 if "max_sequence_length" in self.hparams: @@ -958,8 +1024,6 @@ class OrionModel(Model): raise ValueError("gguf: can not find ctx length parameter.") self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -983,7 +1047,6 @@ class BaichuanModel(Model): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") ctx_length = 0 if "max_sequence_length" in self.hparams: @@ -995,8 +1058,6 @@ class BaichuanModel(Model): else: raise ValueError("gguf: can not find ctx length parameter.") - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1110,7 +1171,6 @@ class XverseModel(Model): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") ctx_length = 0 if "max_sequence_length" in self.hparams: @@ -1122,8 +1182,6 @@ class XverseModel(Model): else: raise ValueError("gguf: can not find ctx length parameter.") - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) - self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1182,7 +1240,6 @@ class FalconModel(Model): if n_head_kv is None: n_head_kv = self.hparams.get("n_head_kv", 1) # old name - self.gguf_writer.add_name("Falcon") self.gguf_writer.add_context_length(2048) # not in config.json self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1227,7 +1284,6 @@ class StarCoderModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layer"] - self.gguf_writer.add_name("StarCoder") self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) @@ -1251,6 +1307,7 @@ class RefactModel(Model): special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("middle", 2) + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -1262,7 +1319,6 @@ class RefactModel(Model): block_count = self.hparams["n_layer"] - self.gguf_writer.add_name("Refact") # refact uses Alibi. So this is from config.json which might be used by training. self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -1317,7 +1373,6 @@ class StableLMModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -1379,8 +1434,8 @@ class StableLMModel(Model): return [(new_name, data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._q_norms is not None or self._k_norms is not None: # flatten two `list[dict[str, Tensor]]` into a single `list[str]` @@ -1423,7 +1478,12 @@ class LlamaModel(Model): super().set_gguf_parameters() hparams = self.hparams self.gguf_writer.add_vocab_size(hparams["vocab_size"]) - self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "linear": @@ -1496,8 +1556,36 @@ class LlamaModel(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): + if rope_scaling.get("rope_type", '').lower() == "llama3": + base = self.hparams.get("rope_theta", 10000.0) + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_scaling.get("factor", 8.0) + low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) + + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -1560,7 +1648,6 @@ class GrokModel(Model): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_name("Grok") _experts: list[dict[str, Tensor]] | None = None @@ -1609,7 +1696,6 @@ class DbrxModel(Model): def set_gguf_parameters(self): ffn_config = self.hparams["ffn_config"] attn_config = self.hparams["attn_config"] - self.gguf_writer.add_name(self.hparams["model_type"]) self.gguf_writer.add_block_count(self.hparams["n_layers"]) self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) @@ -1666,7 +1752,7 @@ class DbrxModel(Model): return [(new_name, data_torch)] - def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: del name, new_name, bid # unused return n_dims > 1 @@ -1678,7 +1764,6 @@ class MiniCPMModel(Model): def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_name("MiniCPM") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -1748,7 +1833,6 @@ class QwenModel(Model): self._set_vocab_qwen() def set_gguf_parameters(self): - self.gguf_writer.add_name("Qwen") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -1824,8 +1908,8 @@ class Qwen2MoeModel(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -1839,7 +1923,6 @@ class GPT2Model(Model): model_arch = gguf.MODEL_ARCH.GPT2 def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_context_length(self.hparams["n_ctx"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -1882,7 +1965,6 @@ class Phi2Model(Model): n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_name("Phi2") self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) self.gguf_writer.add_embedding_length(n_embd) @@ -1944,7 +2026,7 @@ class Phi3MiniModel(Model): for key in added_tokens_json: token_id = added_tokens_json[key] - if (token_id >= vocab_size): + if token_id >= vocab_size: logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue @@ -1961,7 +2043,8 @@ class Phi3MiniModel(Model): token_id = int(token_id) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -1977,7 +2060,8 @@ class Phi3MiniModel(Model): token_id = int(foken_data["id"]) token = foken_data["content"].encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert tokens[token_id] == token + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2004,7 +2088,6 @@ class Phi3MiniModel(Model): orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) rope_dims = n_embd // n_head - self.gguf_writer.add_name("Phi3") self.gguf_writer.add_context_length(max_pos_embds) self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) self.gguf_writer.add_embedding_length(n_embd) @@ -2016,10 +2099,11 @@ class Phi3MiniModel(Model): self.gguf_writer.add_rope_dimension_count(rope_dims) self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"])) # write rope scaling for long context (128k) model rope_scaling = self.find_hparam(['rope_scaling'], True) - if (rope_scaling is None): + if rope_scaling is None: return scale = max_pos_embds / orig_max_pos_embds @@ -2061,7 +2145,6 @@ class PlamoModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name("PLaMo") self.gguf_writer.add_context_length(4096) # not in config.json self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) @@ -2106,7 +2189,6 @@ class CodeShellModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layer"] - self.gguf_writer.add_name("CodeShell") self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) @@ -2219,7 +2301,8 @@ class InternLM2Model(Model): chat_eos_token_id = token_id token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert(tokens[token_id] == token) + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2238,7 +2321,8 @@ class InternLM2Model(Model): chat_eos_token_id = token_id token = token.encode("utf-8") if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: - assert(tokens[token_id] == token) + if tokens[token_id] != token: + logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}') tokens[token_id] = token scores[token_id] = -1000.0 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED @@ -2264,15 +2348,7 @@ class InternLM2Model(Model): special_vocab.add_to_gguf(self.gguf_writer) - def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - def set_gguf_parameters(self): - self.gguf_writer.add_name("InternLM2") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) @@ -2290,26 +2366,22 @@ class InternLM2Model(Model): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: num_heads = self.hparams["num_attention_heads"] num_kv_heads = self.hparams["num_key_value_heads"] - hidden_size = self.hparams["hidden_size"] + n_embd = self.hparams["hidden_size"] q_per_kv = num_heads // num_kv_heads - head_dim = hidden_size // num_heads + head_dim = n_embd // num_heads num_groups = num_heads // q_per_kv - qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" - - if re.match(qkv_pattern, name): - bid = re.findall(qkv_pattern, name)[0] + if bid is not None and f"model.layers.{bid}.attention.wqkv" in name: qkv = data_torch - # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) - qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim)) - q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] + + qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) + q, k, v = qkv[:, : q_per_kv], qkv[:, -2], qkv[:, -1] + # The model weights of q and k equire additional reshape. - # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) - q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads) - # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) - k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads) - # v = rearrange(v, " o g n i -> o (g n i)").T - v = v.reshape((v.shape[0], -1)).T + q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) + k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) + v = v.reshape((-1, v.shape[-1])) + return [ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q), (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), @@ -2421,6 +2493,112 @@ class NomicBertModel(BertModel): self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) +@Model.register("XLMRobertaModel") +class XLMRobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 + os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'sentencepiece.bpe.model' + if not tokenizer_path.is_file(): + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue] + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM + + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces + precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + # realign tokens (see HF tokenizer code) + tokens = [b'', b'', b'', b''] + tokens[3:-1] + scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] + toktypes = [ + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.CONTROL, + SentencePieceTokenTypes.UNKNOWN, + ] + toktypes[3:-1] + + self.gguf_writer.add_tokenizer_model("t5") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + self.gguf_writer.add_token_type_count(1) + self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) + if precompiled_charsmap: + self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset:,:] + + return super().modify_tensors(data_torch, name, bid) + + @Model.register("GemmaForCausalLM") class GemmaModel(Model): model_arch = gguf.MODEL_ARCH.GEMMA @@ -2436,6 +2614,7 @@ class GemmaModel(Model): special_vocab._set_special_token("middle", 68) special_vocab._set_special_token("fsep", 70) special_vocab._set_special_token("eot", 107) + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) self.gguf_writer.add_add_space_prefix(False) @@ -2444,7 +2623,6 @@ class GemmaModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -2485,7 +2663,6 @@ class Gemma2Model(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -2560,7 +2737,6 @@ class MambaModel(Model): # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading @@ -2597,18 +2773,22 @@ class MambaModel(Model): return [(new_name, data_torch)] - def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: - del n_dims # unused - - return bid is not None and new_name in ( - self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [ + def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: + if bid is not None and new_name in ( + self.format_tensor_name( + n, bid, ".weight" if name.endswith(".weight") else "" + ) + for n in [ gguf.MODEL_TENSOR.SSM_CONV1D, gguf.MODEL_TENSOR.SSM_X, gguf.MODEL_TENSOR.SSM_DT, gguf.MODEL_TENSOR.SSM_A, gguf.MODEL_TENSOR.SSM_D, ] - ) + ): + return gguf.GGMLQuantizationType.F32 + + return super().tensor_force_quant(name, new_name, bid, n_dims) @Model.register("CohereForCausalLM") @@ -2680,7 +2860,7 @@ class JinaBertV2Model(BertModel): yield name, data - def set_vocab(self, *args, **kwargs): + def set_vocab(self): tokenizer_class = 'BertTokenizer' with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: tokenizer_class = json.load(f)['tokenizer_class'] @@ -2739,7 +2919,6 @@ class OpenELMModel(Model): assert self.block_count == len(self._num_query_heads) assert self.block_count == len(self._ffn_dims) - self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_context_length(self.hparams["max_context_length"]) self.gguf_writer.add_embedding_length(n_embd) @@ -2829,7 +3008,7 @@ class ArcticModel(Model): added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] for token_id, token_json in added_tokens_decoder.items(): token_id = int(token_id) - if (token_id >= vocab_size): + if token_id >= vocab_size: logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue @@ -2913,8 +3092,8 @@ class ArcticModel(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -2992,8 +3171,8 @@ class DeepseekV2Model(Model): return [(self.map_tensor_name(name), data_torch)] - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` @@ -3078,7 +3257,7 @@ class T5Model(Model): added_tokens_json = json.load(f) for key in added_tokens_json: token_id = added_tokens_json[key] - if (token_id >= vocab_size): + if token_id >= vocab_size: logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') continue @@ -3111,7 +3290,6 @@ class T5Model(Model): self.gguf_writer.add_add_eos_token(True) def set_gguf_parameters(self): - self.gguf_writer.add_name("T5") if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None: logger.warning("Couldn't find context length in config.json, assuming default value of 512") n_ctx = 512 @@ -3185,7 +3363,6 @@ class JaisModel(Model): self._set_vocab_gpt2() def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -3231,8 +3408,8 @@ class JaisModel(Model): return tensors - def write_tensors(self): - super().write_tensors() + def prepare_tensors(self): + super().prepare_tensors() self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) @@ -3391,7 +3568,6 @@ class ChatGLMModel(Model): special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): - self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1]) # THUDM/glm4-9b-chat or THUDM/chatglm3-6b n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_head_kv = self.hparams.get("multi_query_group_num", n_head) @@ -3435,19 +3611,46 @@ class LazyTorchTensor(gguf.LazyBase): torch.float32: np.float32, } + # used for safetensors slices + # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 + # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 + _dtype_str_map: dict[str, torch.dtype] = { + "F64": torch.float64, + "F32": torch.float32, + "BF16": torch.bfloat16, + "F16": torch.float16, + # "U64": torch.uint64, + "I64": torch.int64, + # "U32": torch.uint32, + "I32": torch.int32, + # "U16": torch.uint16, + "I16": torch.int16, + "U8": torch.uint8, + "I8": torch.int8, + "BOOL": torch.bool, + "F8_E4M3": torch.float8_e4m3fn, + "F8_E5M2": torch.float8_e5m2, + } + def numpy(self) -> gguf.LazyNumpyTensor: dtype = self._dtype_map[self.dtype] return gguf.LazyNumpyTensor( meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), - lazy=self._lazy, args=(self,), - func=(lambda s: s[0].numpy()) + func=(lambda s: s.numpy()) ) @classmethod - def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor: + def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: return torch.empty(size=shape, dtype=dtype, device="meta") + @classmethod + def from_safetensors_slice(cls, st_slice: Any) -> Tensor: + dtype = cls._dtype_str_map[st_slice.get_dtype()] + shape: tuple[int, ...] = tuple(st_slice.get_shape()) + lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:]) + return cast(torch.Tensor, lazy) + @classmethod def __torch_function__(cls, func, types, args=(), kwargs=None): del types # unused @@ -3458,7 +3661,7 @@ class LazyTorchTensor(gguf.LazyBase): if func is torch.Tensor.numpy: return args[0].numpy() - return LazyTorchTensor._wrap_fn(func)(*args, **kwargs) + return cls._wrap_fn(func)(*args, **kwargs) def parse_args() -> argparse.Namespace: @@ -3516,6 +3719,10 @@ def parse_args() -> argparse.Namespace: "--no-tensor-first-split", action="store_true", help="do not add tensors to the first split (disabled by default)" ) + parser.add_argument( + "--metadata", type=Path, + help="Specify the path for an authorship metadata override file" + ) return parser.parse_args() @@ -3541,7 +3748,10 @@ def split_str_to_n_bytes(split_str: str) -> int: def main() -> None: args = parse_args() - logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) dir_model = args.model @@ -3565,33 +3775,30 @@ def main() -> None: if args.outfile is not None: fname_out = args.outfile else: - # output in the same directory as the model by default - fname_out = dir_model / 'ggml-model-{ftype}.gguf' + fname_out = dir_model logger.info(f"Loading model: {dir_model.name}") hparams = Model.load_hparams(dir_model) with torch.inference_mode(): + output_type = ftype_map[args.outtype] + model_architecture = hparams["architectures"][0] + try: - model_class = Model.from_model_architecture(hparams["architectures"][0]) + model_class = Model.from_model_architecture(model_architecture) except NotImplementedError: - logger.error(f"Model {hparams['architectures'][0]} is not supported") + logger.error(f"Model {model_architecture} is not supported") sys.exit(1) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, - args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors, + model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out, + is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, + eager=args.no_lazy, + metadata_override=args.metadata, model_name=args.model_name, + split_max_tensors=args.split_max_tensors, split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split) - logger.info("Set model parameters") - model_instance.set_gguf_parameters() - - logger.info("Set model tokenizer") - model_instance.set_vocab() - - model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - if args.vocab_only: logger.info("Exporting model vocab...") model_instance.write_vocab() diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index e4165ae2d..d5a2d925e 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum): # TODO: this string has to exercise as much pre-tokenizer functionality as possible # will be updated with time - contributions welcome -chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' +CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' if len(sys.argv) == 2: token = sys.argv[1] @@ -91,6 +91,9 @@ models = [ {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", }, {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", }, + {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", }, + {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", }, + {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", }, ] @@ -99,8 +102,8 @@ def download_file_with_auth(url, token, save_path): response = sess.get(url, headers=headers) response.raise_for_status() os.makedirs(os.path.dirname(save_path), exist_ok=True) - with open(save_path, 'wb') as f: - f.write(response.content) + with open(save_path, 'wb') as downloaded_file: + downloaded_file.write(response.content) logger.info(f"File {save_path} downloaded successfully") @@ -159,7 +162,7 @@ for model in models: logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}") continue # Skip to the next model if the tokenizer can't be loaded - chktok = tokenizer.encode(chktxt) + chktok = tokenizer.encode(CHK_TXT) chkhsh = sha256(str(chktok).encode()).hexdigest() logger.info(f"model: {name}") @@ -191,7 +194,7 @@ src_func = f""" # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can # use in llama.cpp to implement the same pre-tokenizer - chktxt = {repr(chktxt)} + chktxt = {repr(CHK_TXT)} chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() @@ -287,7 +290,7 @@ tests = [ "333333333", "Cửa Việt", # llama-bpe fails on this " discards", - chktxt, + CHK_TXT, ] # write the tests to ./models/ggml-vocab-{name}.gguf.inp diff --git a/convert_llama_ggml_to_gguf.py b/convert_llama_ggml_to_gguf.py index 95ea831a5..7b00b4398 100755 --- a/convert_llama_ggml_to_gguf.py +++ b/convert_llama_ggml_to_gguf.py @@ -132,6 +132,10 @@ class Tensor: class GGMLModel: + + file_format: GGMLFormat + format_version: int + def __init__(self): self.hyperparameters = None self.vocab = None @@ -290,7 +294,7 @@ class GGMLToGGUF: if self.vocab_override is not None: vo = self.vocab_override logger.info('* Adding vocab item(s)') - for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): + for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): tokens.append(vbytes) scores.append(score) toktypes.append(ttype) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py new file mode 100755 index 000000000..a88d0d4a9 --- /dev/null +++ b/convert_lora_to_gguf.py @@ -0,0 +1,393 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from __future__ import annotations + +from dataclasses import dataclass +import logging +import argparse +import os +import sys +import json +from math import prod +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast + +import torch + +if TYPE_CHECKING: + from torch import Tensor + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import gguf + +# reuse model definitions from convert_hf_to_gguf.py +from convert_hf_to_gguf import LazyTorchTensor, Model + +logger = logging.getLogger("lora-to-gguf") + + +@dataclass +class PartialLoraTensor: + A: Tensor | None = None + B: Tensor | None = None + + +# magic to support tensor shape modifications and splitting +class LoraTorchTensor: + _lora_A: Tensor # (n_rank, row_size) + _lora_B: Tensor # (col_size, n_rank) + _rank: int + + def __init__(self, A: Tensor, B: Tensor): + assert len(A.shape) == len(B.shape) + assert A.shape[-2] == B.shape[-1] + if A.dtype != B.dtype: + A = A.to(torch.float32) + B = B.to(torch.float32) + self._lora_A = A + self._lora_B = B + self._rank = B.shape[-1] + + def get_lora_A_B(self) -> tuple[Tensor, Tensor]: + return (self._lora_A, self._lora_B) + + def __getitem__( + self, + indices: ( + SupportsIndex + | slice + | tuple[SupportsIndex | slice | Tensor, ...] # TODO: add ellipsis in the type signature + ), + ) -> LoraTorchTensor: + shape = self.shape + if isinstance(indices, SupportsIndex): + if len(shape) > 2: + return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) + else: + raise NotImplementedError # can't return a vector + elif isinstance(indices, slice): + if len(shape) > 2: + return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) + else: + return LoraTorchTensor(self._lora_A, self._lora_B[indices]) + elif isinstance(indices, tuple): + assert len(indices) > 0 + if indices[-1] is Ellipsis: + return self[indices[:-1]] + # expand ellipsis + indices = tuple( + u + for v in ( + ( + (slice(None, None) for _ in range(len(indices) - 1)) + if i is Ellipsis + else (i,) + ) + for i in indices + ) + for u in v + ) + + if len(indices) < len(shape): + indices = (*indices, *(slice(None, None) for _ in range(len(indices), len(shape)))) + + # TODO: make sure this is correct + indices_A = ( + *( + ( + j.__index__() % self._lora_A.shape[i] + if isinstance(j, SupportsIndex) + else slice(None, None) + ) + for i, j in enumerate(indices[:-2]) + ), + slice(None, None), + indices[-1], + ) + indices_B = indices[:-1] + return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B]) + else: + raise NotImplementedError # unknown indice type + + @property + def dtype(self) -> torch.dtype: + assert self._lora_A.dtype == self._lora_B.dtype + return self._lora_A.dtype + + @property + def shape(self) -> tuple[int, ...]: + assert len(self._lora_A.shape) == len(self._lora_B.shape) + return (*self._lora_B.shape[:-1], self._lora_A.shape[-1]) + + def size(self, dim=None): + assert dim is None + return self.shape + + def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor: + if isinstance(shape[0], tuple): + new_shape: tuple[int, ...] = shape[0] + else: + new_shape = cast(tuple[int, ...], shape) + orig_shape = self.shape + if len(new_shape) < 2: + raise NotImplementedError # can't become a vector + + # expand -1 in the shape + if any(dim == -1 for dim in new_shape): + n_elems = prod(orig_shape) + n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape) + assert n_elems % n_new_elems == 0 + new_shape = (*(dim if dim != -1 else n_elems // n_new_elems for dim in new_shape),) + + if new_shape[-1] != orig_shape[-1]: + raise NotImplementedError # can't reshape the row size trivially + + shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1]) + shape_B = (*new_shape[:-1], self._rank) + return LoraTorchTensor( + self._lora_A.reshape(shape_A), + self._lora_B.reshape(shape_B), + ) + + def reshape_as(self, other: Tensor) -> LoraTorchTensor: + return self.reshape(*other.shape) + + def view(self, *size: int) -> LoraTorchTensor: + return self.reshape(*size) + + def permute(self, *dims: int) -> LoraTorchTensor: + shape = self.shape + dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims) + if dims[-1] == -1: + # TODO: support higher dimensional A shapes bigger than 1 + assert all(dim == 1 for dim in self._lora_A.shape[:-2]) + return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims)) + if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1: + return LoraTorchTensor(self._lora_B.permute(*dims), self._lora_A.permute(*dims)) + else: + # TODO: compose the above two + raise NotImplementedError + + def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor: + shape = self.shape + dims = [i for i in range(len(shape))] + dims[dim0], dims[dim1] = dims[dim1], dims[dim0] + return self.permute(*dims) + + def swapaxes(self, axis0: int, axis1: int) -> LoraTorchTensor: + return self.transpose(axis0, axis1) + + def to(self, *args, **kwargs): + return LoraTorchTensor(self._lora_A.to(*args, **kwargs), self._lora_B.to(*args, **kwargs)) + + @classmethod + def __torch_function__(cls, func: Callable, types, args=(), kwargs=None): + del types # unused + + if kwargs is None: + kwargs = {} + + if func is torch.permute: + return type(args[0]).permute(*args, **kwargs) + elif func is torch.reshape: + return type(args[0]).reshape(*args, **kwargs) + elif func is torch.stack: + assert isinstance(args[0], Sequence) + dim = kwargs.get("dim", 0) + assert dim == 0 + return LoraTorchTensor( + torch.stack([a._lora_A for a in args[0]], dim), + torch.stack([b._lora_B for b in args[0]], dim), + ) + elif func is torch.cat: + assert isinstance(args[0], Sequence) + dim = kwargs.get("dim", 0) + assert dim == 0 + if len(args[0][0].shape) > 2: + return LoraTorchTensor( + torch.cat([a._lora_A for a in args[0]], dim), + torch.cat([b._lora_B for b in args[0]], dim), + ) + elif all(torch.equal(args[0][0]._lora_A, t._lora_A) for t in args[0][1:]): + return LoraTorchTensor( + args[0][0]._lora_A, + torch.cat([b._lora_B for b in args[0]], dim), + ) + else: + raise NotImplementedError + else: + raise NotImplementedError + + +def get_base_tensor_name(lora_tensor_name: str) -> str: + base_name = lora_tensor_name.replace("base_model.model.", "") + base_name = base_name.replace(".lora_A.weight", ".weight") + base_name = base_name.replace(".lora_B.weight", ".weight") + return base_name + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert a huggingface PEFT LoRA adapter to a GGML compatible file") + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", + ) + parser.add_argument( + "--bigendian", action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "--no-lazy", action="store_true", + help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", + ) + parser.add_argument( + "--verbose", action="store_true", + help="increase output verbosity", + ) + parser.add_argument( + "--dry-run", action="store_true", + help="only print out what will be done, without writing any new files", + ) + parser.add_argument( + "--base", type=Path, required=True, + help="directory containing base model file", + ) + parser.add_argument( + "lora_path", type=Path, + help="directory containing LoRA adapter file", + ) + + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + ftype_map: dict[str, gguf.LlamaFileType] = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + "auto": gguf.LlamaFileType.GUESSED, + } + + ftype = ftype_map[args.outtype] + + dir_base_model: Path = args.base + dir_lora: Path = args.lora_path + lora_config = dir_lora / "adapter_config.json" + input_model = dir_lora / "adapter_model.safetensors" + + if args.outfile is not None: + fname_out = args.outfile + else: + # output in the same directory as the model by default + fname_out = dir_lora + + if os.path.exists(input_model): + # lazy import load_file only if lora is in safetensors format. + from safetensors.torch import load_file + + lora_model = load_file(input_model, device="cpu") + else: + input_model = os.path.join(dir_lora, "adapter_model.bin") + lora_model = torch.load(input_model, map_location="cpu", weights_only=True) + + # load base model + logger.info(f"Loading base model: {dir_base_model.name}") + hparams = Model.load_hparams(dir_base_model) + with torch.inference_mode(): + try: + model_class = Model.from_model_architecture(hparams["architectures"][0]) + except NotImplementedError: + logger.error(f"Model {hparams['architectures'][0]} is not supported") + sys.exit(1) + + class LoraModel(model_class): + model_arch = model_class.model_arch + + lora_alpha: float + + def __init__(self, *args, dir_lora_model: Path, lora_alpha: float, **kwargs): + + super().__init__(*args, **kwargs) + + self.dir_model_card = dir_lora_model + self.lora_alpha = float(lora_alpha) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.ADAPTER) + self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") + + def set_gguf_parameters(self): + self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha) + super().set_gguf_parameters() + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + tensor_map: dict[str, PartialLoraTensor] = {} + + for name, tensor in lora_model.items(): + if self.lazy: + tensor = LazyTorchTensor.from_eager(tensor) + base_name = get_base_tensor_name(name) + is_lora_a = ".lora_A.weight" in name + is_lora_b = ".lora_B.weight" in name + if not is_lora_a and not is_lora_b: + if ".base_layer.weight" in name: + continue + logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor") + sys.exit(1) + + if base_name in tensor_map: + if is_lora_a: + tensor_map[base_name].A = tensor + else: + tensor_map[base_name].B = tensor + else: + if is_lora_a: + tensor_map[base_name] = PartialLoraTensor(A=tensor) + else: + tensor_map[base_name] = PartialLoraTensor(B=tensor) + + for name, tensor in tensor_map.items(): + assert tensor.A is not None + assert tensor.B is not None + yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B))) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + dest = super().modify_tensors(data_torch, name, bid) + for dest_name, dest_data in dest: + assert isinstance(dest_data, LoraTorchTensor) + lora_a, lora_b = dest_data.get_lora_A_B() + + yield (dest_name + ".lora_a", lora_a) + yield (dest_name + ".lora_b", lora_b) + + with open(lora_config, "r") as f: + lparams: dict[str, Any] = json.load(f) + + alpha: float = lparams["lora_alpha"] + + model_instance = LoraModel( + dir_base_model, + ftype, + fname_out, + is_big_endian=args.bigendian, + use_temp_file=False, + eager=args.no_lazy, + dry_run=args.dry_run, + dir_lora_model=dir_lora, + lora_alpha=alpha, + ) + + logger.info("Exporting model...") + model_instance.write() + logger.info(f"Model successfully exported to {model_instance.fname_out}") diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 885983e92..d36ac0a15 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow ```sh ./build/bin/llama-ls-sycl-device ``` -A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following: +This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: ``` -found 6 SYCL devices: +found 2 SYCL devices: + | | | |Compute |Max compute|Max work|Max sub| | |ID| Device Type| Name|capability|units |group |group |Global mem size| |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| -| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| -| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| -| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| -| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` -| Attribute | Note | -|------------------------|-------------------------------------------------------------| -| compute capability 1.3 | Level-zero driver/runtime, recommended | -| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases | 4. Launch inference There are two device selection modes: - Single device: Use one device target specified by the user. -- Multiple devices: Automatically select the devices with the same largest Max compute-units. +- Multiple devices: Automatically choose the devices with the same backend. + +In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR. | Device selection | Parameter | |------------------|----------------------------------------| @@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow build\bin\ls-sycl-device.exe ``` -The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following: +This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: ``` -found 6 SYCL devices: +found 2 SYCL devices: | | | |Compute |Max compute|Max work|Max sub| | |ID| Device Type| Name|capability|units |group |group |Global mem size| |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| -| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| -| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| -| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| -| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` -| Attribute | Note | -|------------------------|-----------------------------------------------------------| -| compute capability 1.3 | Level-zero running time, recommended | -| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases | - 4. Launch inference There are two device selection modes: -- Single device: Use one device assigned by user. -- Multiple devices: Automatically choose the devices with the same biggest Max compute units. +- Single device: Use one device assigned by user. Default device id is 0. +- Multiple devices: Automatically choose the devices with the same backend. + +In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR. | Device selection | Parameter | |------------------|----------------------------------------| diff --git a/docs/build.md b/docs/build.md index 916fcf22d..8b16d1a35 100644 --- a/docs/build.md +++ b/docs/build.md @@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options. make ``` - - On Windows: + - On Windows (x86/x64 only, arm64 requires cmake): 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). 2. Extract `w64devkit` on your pc. @@ -60,6 +60,17 @@ In order to build llama.cpp you have four different options. cmake -B build -G "Xcode" cmake --build build --config Debug ``` + - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers: + - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...): + - Tab Workload: Desktop-development with C++ + - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang) + - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test + - For Windows on ARM (arm64, WoA) build with: + ```bash + cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF + cmake --build build-arm64-windows-llvm-release + ``` + Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels. - Using `gmake` (FreeBSD): @@ -167,7 +178,11 @@ For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](ht cmake --build build --config Release ``` -The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance: +The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. + +The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`. + +The following compilation options are also available to tweak performance: | Option | Legal values | Default | Description | |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| @@ -181,6 +196,19 @@ The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/c | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. | +### MUSA + +- Using `make`: + ```bash + make GGML_MUSA=1 + ``` +- Using `CMake`: + + ```bash + cmake -B build -DGGML_MUSA=ON + cmake --build build --config Release + ``` + ### hipBLAS This provides BLAS acceleration on HIP-supported AMD GPUs. diff --git a/docs/development/HOWTO-add-model.md b/docs/development/HOWTO-add-model.md index 2712b66c1..04c5ccbbe 100644 --- a/docs/development/HOWTO-add-model.md +++ b/docs/development/HOWTO-add-model.md @@ -9,15 +9,15 @@ Adding a model requires few steps: After following these steps, you can open PR. Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially: -- [main](../examples/main) -- [imatrix](../examples/imatrix) -- [quantize](../examples/quantize) -- [server](../examples/server) +- [main](/examples/main/) +- [imatrix](/examples/imatrix/) +- [quantize](/examples/quantize/) +- [server](/examples/server/) ### 1. Convert the model to GGUF This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library. -Depending on the model architecture, you can use either [convert_hf_to_gguf.py](../convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](../examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format). +Depending on the model architecture, you can use either [convert_hf_to_gguf.py](/convert_hf_to_gguf.py) or [examples/convert_legacy_llama.py](/examples/convert_legacy_llama.py) (for `llama/llama2` models in `.pth` format). The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors. @@ -31,7 +31,7 @@ class MyModel(Model): model_arch = gguf.MODEL_ARCH.GROK ``` -2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py) +2. Define the layout of the GGUF tensors in [constants.py](/gguf-py/gguf/constants.py) Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`. @@ -54,7 +54,7 @@ Example for `falcon` model: As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist. -Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file. +Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](/gguf-py/gguf/tensor_mapping.py) file. If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it. @@ -100,7 +100,7 @@ Have a look at existing implementation like `build_llama`, `build_dbrx` or `buil When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR. -Note: to debug the inference graph: you can use [llama-eval-callback](../examples/eval-callback). +Note: to debug the inference graph: you can use [llama-eval-callback](/examples/eval-callback/). ## GGUF specification diff --git a/docs/development/token_generation_performance_tips.md b/docs/development/token_generation_performance_tips.md index c0840cad5..41b7232c9 100644 --- a/docs/development/token_generation_performance_tips.md +++ b/docs/development/token_generation_performance_tips.md @@ -1,7 +1,7 @@ # Token generation performance troubleshooting ## Verifying that the model is running on the GPU with CUDA -Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: +Make sure you compiled llama with the correct env variables according to [this guide](/docs/build.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: ```shell ./llama-cli -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " ``` diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 155743639..67b3d2774 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,7 +21,6 @@ else() add_subdirectory(embedding) add_subdirectory(eval-callback) add_subdirectory(export-lora) - add_subdirectory(finetune) add_subdirectory(gbnf-validator) add_subdirectory(gguf-hash) add_subdirectory(gguf-split) @@ -53,5 +52,4 @@ else() add_subdirectory(simple) add_subdirectory(speculative) add_subdirectory(tokenize) - add_subdirectory(train-text-from-scratch) endif() diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 4f6c3746a..aca332e94 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1,7 +1,6 @@ #include "ggml.h" #include "train.h" -#include #include #include #include diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 718f0a61a..25e7c775a 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -69,7 +69,7 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_params_from_gpt_params(params); // ensure enough sequences are available - ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end()); + ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end()); llama_context * ctx = llama_new_context_with_model(model, ctx_params); diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 2442e954d..53fbfb0a8 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -31,7 +31,7 @@ int main(int argc, char ** argv) { int n_parallel = params.n_parallel; // total length of the sequences including the prompt - int n_predict = 32; + int n_predict = params.n_predict; // init LLM diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py index c2c73e8ad..9ab9ab06e 100755 --- a/examples/convert_legacy_llama.py +++ b/examples/convert_legacy_llama.py @@ -24,7 +24,7 @@ from abc import ABC, abstractmethod from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional +from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar import numpy as np @@ -346,42 +346,6 @@ class Params: return params -@dataclass -class Metadata: - name: Optional[str] = None - author: Optional[str] = None - version: Optional[str] = None - url: Optional[str] = None - description: Optional[str] = None - license: Optional[str] = None - source_url: Optional[str] = None - source_hf_repo: Optional[str] = None - - @staticmethod - def load(metadata_path: Path) -> Metadata: - if metadata_path is None or not metadata_path.exists(): - return Metadata() - - with open(metadata_path, 'r') as file: - data = json.load(file) - - # Create a new Metadata instance - metadata = Metadata() - - # Assigning values to Metadata attributes if they exist in the JSON file - # This is based on LLM_KV_NAMES mapping in llama.cpp - metadata.name = data.get("general.name") - metadata.author = data.get("general.author") - metadata.version = data.get("general.version") - metadata.url = data.get("general.url") - metadata.description = data.get("general.description") - metadata.license = data.get("general.license") - metadata.source_url = data.get("general.source.url") - metadata.source_hf_repo = data.get("general.source.huggingface.repository") - - return metadata - - # # data loading # TODO: reuse (probably move to gguf.py?) @@ -806,7 +770,7 @@ class OutputFile: def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) - def add_meta_model(self, params: Params, metadata: Metadata | None) -> None: + def add_meta_model(self, params: Params, metadata: gguf.Metadata | None) -> None: # Metadata About The Model And Its Provenence name = "LLaMA" if metadata is not None and metadata.name is not None: @@ -824,16 +788,73 @@ class OutputFile: self.gguf.add_author(metadata.author) if metadata.version is not None: self.gguf.add_version(metadata.version) - if metadata.url is not None: - self.gguf.add_url(metadata.url) + if metadata.organization is not None: + self.gguf.add_organization(metadata.organization) + + if metadata.finetune is not None: + self.gguf.add_finetune(metadata.finetune) + if metadata.basename is not None: + self.gguf.add_basename(metadata.basename) + if metadata.description is not None: self.gguf.add_description(metadata.description) + if metadata.quantized_by is not None: + self.gguf.add_quantized_by(metadata.quantized_by) + + if metadata.size_label is not None: + self.gguf.add_size_label(metadata.size_label) + if metadata.license is not None: - self.gguf.add_licence(metadata.license) + self.gguf.add_license(metadata.license) + if metadata.license_name is not None: + self.gguf.add_license_name(metadata.license_name) + if metadata.license_link is not None: + self.gguf.add_license_link(metadata.license_link) + + if metadata.url is not None: + self.gguf.add_url(metadata.url) + if metadata.doi is not None: + self.gguf.add_doi(metadata.doi) + if metadata.uuid is not None: + self.gguf.add_uuid(metadata.uuid) + if metadata.repo_url is not None: + self.gguf.add_repo_url(metadata.repo_url) + if metadata.source_url is not None: self.gguf.add_source_url(metadata.source_url) - if metadata.source_hf_repo is not None: - self.gguf.add_source_hf_repo(metadata.source_hf_repo) + if metadata.source_doi is not None: + self.gguf.add_source_doi(metadata.source_doi) + if metadata.source_uuid is not None: + self.gguf.add_source_uuid(metadata.source_uuid) + if metadata.source_repo_url is not None: + self.gguf.add_source_repo_url(metadata.source_repo_url) + + if metadata.base_models is not None: + self.gguf.add_base_model_count(len(metadata.base_models)) + for key, base_model_entry in enumerate(metadata.base_models): + if "name" in base_model_entry: + self.gguf.add_base_model_name(key, base_model_entry["name"]) + if "author" in base_model_entry: + self.gguf.add_base_model_author(key, base_model_entry["author"]) + if "version" in base_model_entry: + self.gguf.add_base_model_version(key, base_model_entry["version"]) + if "organization" in base_model_entry: + self.gguf.add_base_model_organization(key, base_model_entry["organization"]) + if "url" in base_model_entry: + self.gguf.add_base_model_url(key, base_model_entry["url"]) + if "doi" in base_model_entry: + self.gguf.add_base_model_doi(key, base_model_entry["doi"]) + if "uuid" in base_model_entry: + self.gguf.add_base_model_uuid(key, base_model_entry["uuid"]) + if "repo_url" in base_model_entry: + self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"]) + + if metadata.tags is not None: + self.gguf.add_tags(metadata.tags) + if metadata.languages is not None: + self.gguf.add_languages(metadata.languages) + if metadata.datasets is not None: + self.gguf.add_datasets(metadata.datasets) def add_meta_arch(self, params: Params) -> None: # Metadata About The Neural Architecture Itself @@ -944,7 +965,7 @@ class OutputFile: @staticmethod def write_vocab_only( fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata | None = None, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: gguf.Metadata | None = None, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) @@ -978,7 +999,7 @@ class OutputFile: fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, - metadata: Metadata | None = None, + metadata: gguf.Metadata | None = None, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) @@ -1021,35 +1042,32 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT raise ValueError(f"Unexpected combination of types: {name_to_type}") -def model_parameter_count(model: LazyModel) -> int: - total_model_parameters = 0 - for i, (name, lazy_tensor) in enumerate(model.items()): - sum_weights_in_tensor = 1 +def per_model_weight_count_estimation(tensors: Iterable[tuple[str, LazyTensor]]) -> tuple[int, int, int]: + total_params = 0 + shared_params = 0 + expert_params = 0 + + for name, lazy_tensor in tensors: + # We don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): + continue + + # Got A Tensor + sum_weights_in_tensor: int = 1 + + # Tensor Volume for dim in lazy_tensor.shape: sum_weights_in_tensor *= dim - total_model_parameters += sum_weights_in_tensor - return total_model_parameters + if ".experts." in name: + if ".experts.0." in name: + expert_params += sum_weights_in_tensor + else: + shared_params += sum_weights_in_tensor -def model_parameter_count_rounded_notation(model_params_count: int) -> str: - if model_params_count > 1e12 : - # Trillions Of Parameters - scaled_model_params = model_params_count * 1e-12 - scale_suffix = "T" - elif model_params_count > 1e9 : - # Billions Of Parameters - scaled_model_params = model_params_count * 1e-9 - scale_suffix = "B" - elif model_params_count > 1e6 : - # Millions Of Parameters - scaled_model_params = model_params_count * 1e-6 - scale_suffix = "M" - else: - # Thousands Of Parameters - scaled_model_params = model_params_count * 1e-3 - scale_suffix = "K" + total_params += sum_weights_in_tensor - return f"{round(scaled_model_params)}{scale_suffix}" + return total_params, shared_params, expert_params def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: @@ -1231,34 +1249,24 @@ class VocabFactory: return vocab, special_vocab -def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str: - quantization = { +def default_convention_outfile(file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> str: + name = metadata.name if metadata.name is not None else None + basename = metadata.basename if metadata.basename is not None else None + finetune = metadata.finetune if metadata.finetune is not None else None + version = metadata.version if metadata.version is not None else None + size_label = metadata.size_label if metadata.size_label is not None else gguf.size_label(*model_params_count, expert_count=expert_count or 0) + + output_type = { GGMLFileType.AllF32: "F32", GGMLFileType.MostlyF16: "F16", GGMLFileType.MostlyQ8_0: "Q8_0", }[file_type] - parameters = model_parameter_count_rounded_notation(model_params_count) - - expert_count = "" - if params.n_experts is not None: - expert_count = f"{params.n_experts}x" - - version = "" - if metadata is not None and metadata.version is not None: - version = f"-{metadata.version}" - - name = "ggml-model" - if metadata is not None and metadata.name is not None: - name = metadata.name - elif params.path_model is not None: - name = params.path_model.name - - return f"{name}{version}-{expert_count}{parameters}-{quantization}" + return gguf.naming_convention(name, basename, finetune, version, size_label, output_type) -def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path: - default_filename = default_convention_outfile(file_type, params, model_params_count, metadata) +def default_outfile(model_paths: list[Path], file_type: GGMLFileType, expert_count: int | None, model_params_count: tuple[int, int, int], metadata: gguf.Metadata) -> Path: + default_filename = default_convention_outfile(file_type, expert_count, model_params_count, metadata) ret = model_paths[0].parent / f"{default_filename}.gguf" if ret in model_paths: logger.error( @@ -1297,8 +1305,9 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") parser.add_argument("--verbose", action="store_true", help="increase output verbosity") - parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file") + parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file") parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name") + parser.add_argument("--model-name", type=str, default=None, help="name of the model") args = parser.parse_args(args_in) @@ -1310,32 +1319,36 @@ def main(args_in: list[str] | None = None) -> None: else: logging.basicConfig(level=logging.INFO) - metadata = Metadata.load(args.metadata) + model_name = args.model_name + dir_model = args.model + + metadata = gguf.Metadata.load(args.metadata, dir_model, model_name) if args.get_outfile: - model_plus = load_some_model(args.model) + model_plus = load_some_model(dir_model) params = Params.load(model_plus) - model = convert_model_names(model_plus.model, params, args.skip_unknown) - model_params_count = model_parameter_count(model_plus.model) - ftype = pick_output_type(model, args.outtype) - print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100 + model = convert_model_names(model_plus.model, params, args.skip_unknown) + model_params_count = per_model_weight_count_estimation(model_plus.model.items()) + ftype = pick_output_type(model, args.outtype) + + if (metadata is None or metadata.name is None) and params.path_model is not None: + metadata.name = params.path_model.name + + print(f"{default_convention_outfile(ftype, params.n_experts, model_params_count, metadata)}") # noqa: NP100 return if args.no_vocab and args.vocab_only: raise ValueError("--vocab-only does not make sense with --no-vocab") if args.dump_single: - model_plus = lazy_load_file(args.model) + model_plus = lazy_load_file(dir_model) do_dump_model(model_plus) return if not args.vocab_only: - model_plus = load_some_model(args.model) + model_plus = load_some_model(dir_model) else: - model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) - - model_params_count = model_parameter_count(model_plus.model) - logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})") + model_plus = ModelPlus(model = {}, paths = [dir_model / 'dummy'], format = 'none', vocab = None) if args.dump: do_dump_model(model_plus) @@ -1368,7 +1381,7 @@ def main(args_in: list[str] | None = None) -> None: logger.info(f"params = {params}") model_parent_path = model_plus.paths[0].parent - vocab_path = Path(args.vocab_dir or args.model or model_parent_path) + vocab_path = Path(args.vocab_dir or dir_model or model_parent_path) vocab_factory = VocabFactory(vocab_path) vocab_types = None if args.no_vocab else args.vocab_type.split(",") vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path) @@ -1399,13 +1412,21 @@ def main(args_in: list[str] | None = None) -> None: assert params is not None + if metadata.name is None and params.path_model is not None: + metadata.name = params.path_model.name + + model_params_count = per_model_weight_count_estimation(model_plus.model.items()) + logger.info(f"model parameters count : {model_params_count} ({gguf.model_weight_count_rounded_notation(model_params_count[0])})") + logger.info(f"Vocab info: {vocab}") logger.info(f"Special vocab info: {special_vocab}") model = model_plus.model model = convert_model_names(model, params, args.skip_unknown) ftype = pick_output_type(model, args.outtype) model = convert_to_output_type(model, ftype) - outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata) + outfile = args.outfile or default_outfile(model_plus.paths, ftype, params.n_experts, model_params_count, metadata=metadata) + + metadata.size_label = gguf.size_label(*model_params_count, expert_count=params.n_experts or 0) params.ftype = ftype logger.info(f"Writing {outfile}, format {ftype}") diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index d4e126ac2..a12e90d82 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -414,9 +414,10 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model to get hparams - llama_model * model; - llama_context * ctx; - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; // int n_ctx = llama_n_ctx(ctx); int n_layers = llama_n_layer(model); diff --git a/examples/deprecation-warning/README.md b/examples/deprecation-warning/README.md index 1e20feb4a..59918ec2b 100644 --- a/examples/deprecation-warning/README.md +++ b/examples/deprecation-warning/README.md @@ -13,7 +13,6 @@ Please update all scripts and workflows to use the new binary names. | server | llama-server | | llama-bench | llama-bench | | embedding | llama-embedding | -| finetune | llama-finetune | | quantize | llama-quantize | | tokenize | llama-tokenize | | export-lora | llama-export-lora | @@ -45,7 +44,6 @@ Please update all scripts and workflows to use the new binary names. | save-load-state | llama-save-load-state | | simple | llama-simple | | speculative | llama-speculative | -| train-text-from-scratch | llama-train-text-from-scratch | | vdot | llama-vdot | | tests/test-c.o | tests/test-c.o | diff --git a/examples/embedding/README.md b/examples/embedding/README.md index e3705b454..12b372bf1 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor ### Unix-based systems (Linux, macOS, etc.): ```bash -./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null +./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null ``` ### Windows: ```powershell -llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null +llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null ``` The above command will output space-separated float values. @@ -50,11 +50,11 @@ The above command will output space-separated float values. ### Unix-based systems (Linux, macOS, etc.): ```bash -./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` ### Windows: ```powershell -embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null +llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 1466e5b2b..cd7b448a6 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -79,11 +79,11 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model * model; - llama_context * ctx; - // load the model - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index c8a3016a4..ef35ba2c0 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne } else if (type == GGML_TYPE_I8) { v = (float) *(int8_t *) &data[i]; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } printf("%12.4f", v); sum += v; @@ -163,9 +163,10 @@ int main(int argc, char ** argv) { params.warmup = false; // init - llama_model * model; - llama_context * ctx; - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; if (model == nullptr || ctx == nullptr) { fprintf(stderr, "%s : failed to init\n", __func__); return 1; diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md index 1fb17feec..91c33c34a 100644 --- a/examples/export-lora/README.md +++ b/examples/export-lora/README.md @@ -6,12 +6,11 @@ Apply LORA adapters to base model and export the resulting model. usage: llama-export-lora [options] options: - -h, --help show this help message and exit - -m FNAME, --model-base FNAME model path from which to load base model (default '') - -o FNAME, --model-out FNAME path to save exported model (default '') - -l FNAME, --lora FNAME apply LoRA adapter - -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S - -t N, --threads N number of threads to use during computation (default: 4) + -m, --model model path from which to load base model (default '') + --lora FNAME path to LoRA adapter (can be repeated to use multiple adapters) + --lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters) + -t, --threads N number of threads to use during computation (default: 4) + -o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf') ``` For example: @@ -20,7 +19,15 @@ For example: ./bin/llama-export-lora \ -m open-llama-3b-v2-q8_0.gguf \ -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \ - -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin + --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf ``` -Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters. +Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters: + +```bash +./bin/llama-export-lora \ + -m your_base_model.gguf \ + -o your_merged_model.gguf \ + --lora-scaled lora_task_A.gguf 0.5 \ + --lora-scaled lora_task_B.gguf 0.5 +``` diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 08413f57e..3176d6e26 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -1,462 +1,406 @@ - #include "common.h" #include "ggml.h" #include "ggml-alloc.h" +#include #include #include #include +#include -struct lora_info { - std::string filename; +static bool g_verbose = false; + +static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){ + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id)); +} + +static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) { + int id = gguf_find_key(ctx_gguf, key.c_str()); + return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id); +} + +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} + +static std::string ggml_ne_string(const ggml_tensor * t) { + std::string str; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + str += std::to_string(t->ne[i]); + if (i + 1 < GGML_MAX_DIMS) { + str += ", "; + } + } + return str; +} + +static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) { + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ ctx_ggml, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params); + if (!ctx_gguf) { + throw std::runtime_error("failed to load input GGUF from " + fname); + } + return ctx_gguf; +} + +struct file_input { + struct ggml_context * ctx_meta = nullptr; + struct gguf_context * ctx_gguf = nullptr; + std::ifstream f_in; + std::map tensors; + float alpha; float scale; + + file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) { + if (!f_in.is_open()) { + throw std::runtime_error("failed to open input gguf from " + fname); + } + + ctx_gguf = load_gguf(fname, &ctx_meta); + alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha"); + printf("%s: loaded gguf from %s\n", __func__, fname.c_str()); + + for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) { + std::string name(cur->name); + tensors[name] = cur; + if (g_verbose) { + printf("%s: %s\n", __func__, cur->name); + } + } + } + + ggml_tensor * get_tensor(std::string name) { + if (tensors.find(name) == tensors.end()) { + return nullptr; + } + return tensors[name]; + } + + void read_tensor_data(std::string name, std::vector & buf) { + if (tensors.find(name) == tensors.end()) { + throw std::runtime_error("cannot find tensor with name: " + name); + } + auto len = ggml_nbytes(tensors[name]); + if (buf.size() < len) { + buf.resize(len); + } + auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); + f_in.seekg(offset); + f_in.read((char* )buf.data(), len); + } + + ~file_input() { + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + } }; -struct export_lora_params { - std::string fn_model_base; - std::string fn_model_out; - std::vector lora; +struct lora_merge_ctx { + // input base model + adapters + file_input base_model; + std::vector> adapters; + + // for computing merged tensor int n_threads; -}; + ggml_backend_t backend = nullptr; + ggml_gallocr_t allocr = nullptr; + std::vector read_buf; -struct lora_data { - struct lora_info info; - std::vector data; - struct ggml_context * ctx; + // output file + struct gguf_context * ctx_out; + struct ggml_context * ctx_out_ggml; + std::ofstream fout; - uint32_t lora_r; - uint32_t lora_alpha; -}; + lora_merge_ctx( + std::string & base_fname, + std::vector & lora_files, + std::string & outfile, + int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { + fout.exceptions(std::ofstream::failbit); // fail fast on write errors -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; + if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) { + throw std::runtime_error("split model is not yet supported"); + } - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; + for (auto & lora_inp : lora_files) { + auto fname = lora_inp.path; + auto scale = lora_inp.scale; + std::unique_ptr adapter(new file_input(fname, scale)); + check_metadata_lora(adapter.get()); + adapters.push_back(std::move(adapter)); + } + + ctx_out = gguf_init_empty(); + struct ggml_init_params params = { + /*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ctx_out_ggml = ggml_init(params); + backend = ggml_backend_cpu_init(); + allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + } + + void check_metadata_lora(file_input * adapter) { + auto general_type = get_kv_str(adapter->ctx_gguf, "general.type"); + if (general_type != "adapter") { + throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); + } + + auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type"); + if (adapter_type != "lora") { + throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type); + } + + auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture"); + auto general_arch_lora = get_kv_str(adapter->ctx_gguf, "general.architecture"); + if (general_arch_base != general_arch_lora) { + throw std::runtime_error("model arch and LoRA arch mismatch"); + } + } + + ggml_type get_out_tensor_type(struct ggml_tensor * t) { + if (t->type == GGML_TYPE_F32) { + return GGML_TYPE_F32; } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); + return GGML_TYPE_F16; } } - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } + void run_merge() { + // prepare metadata + gguf_set_kv(ctx_out, base_model.ctx_gguf); + // output is forced to f16 for now + gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16); - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; + // check if all lora adapters have the same tensors + // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777 + static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once."; + if (adapters.size() > 1) { + for (size_t i = 1; i < adapters.size(); ++i) { + if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) { + throw std::runtime_error(err_no_subset_adapter); + } + for (auto & it : adapters[i]->tensors) { + if (adapters[0]->get_tensor(it.first) == nullptr) { + throw std::runtime_error(err_no_subset_adapter); + } + } + } } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - die_fmt("read error: %s", strerror(errno)); + + // mapping base tensor to out tensor (same shape with base, but different type) + // if out_tensor == nullptr, we only copy it + std::vector> base_to_out_tensors; + for (auto & it : base_model.tensors) { + bool t_a = true; + bool t_b = true; + for (auto & adapter : adapters) { + t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a"); + t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b"); + } + auto base_tensor = it.second; + if (!t_a && !t_b) { + // only copy + struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); + ggml_set_name(cpy_tensor, base_tensor->name); + base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr)); + gguf_add_tensor(ctx_out, cpy_tensor); + } else if (t_a && t_b) { + // need merging + struct ggml_tensor * out_tensor = ggml_new_tensor( + ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne); + ggml_set_name(out_tensor, base_tensor->name); + base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor)); + gguf_add_tensor(ctx_out, out_tensor); + } else { + throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b"); + } } - if (ret != 1) { - die("unexpectedly reached end of file"); + + // placeholder for the meta data + { + size_t meta_size = gguf_get_meta_size(ctx_out); + zeros(fout, meta_size); } - } - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; + // process base model tensors + size_t n_merged = 0; + for (auto & it : base_to_out_tensors) { + if (it.second != nullptr) { + merge_tensor(it.first, it.second); + n_merged++; + } else { + copy_tensor(it.first); + } } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - die_fmt("write error: %s", strerror(errno)); + + // write output metadata + { + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.seekp(0); + fout.write((const char *)data.data(), data.size()); } + + printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged); + printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size()); } - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); + void copy_tensor(struct ggml_tensor * base) { + printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str()); + size_t len = ggml_nbytes(base); + base_model.read_tensor_data(base->name, read_buf); + fout.write((char* )read_buf.data(), len); + zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); } - bool eof() { - return tell() >= size; - } + void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) { + std::string name_base(base->name); + std::string name_lora_a = name_base + ".lora_a"; + std::string name_lora_b = name_base + ".lora_b"; - ~llama_file() { - if (fp) { - std::fclose(fp); + printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str()); + + // context for input tensor + std::vector inp_a(adapters.size()); + std::vector inp_b(adapters.size()); + struct ggml_init_params params { + /*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(params); + + // alloc tensors + struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne); + for (size_t i = 0; i < adapters.size(); ++i) { + auto t_a = adapters[i]->get_tensor(name_lora_a); + auto t_b = adapters[i]->get_tensor(name_lora_b); + inp_a[i] = ggml_dup_tensor(ctx, t_a); + inp_b[i] = ggml_dup_tensor(ctx, t_b); } + ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); + + // load base tensor to backend buffer + base_model.read_tensor_data(name_base, read_buf); + if (base->type != GGML_TYPE_F32) { + // optionally dequantize it + printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type)); + auto nels = ggml_nelements(inp_base); + ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type); + std::vector dequant_buf(nels * sizeof(float)); + qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels); + ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size()); + } else { + ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base)); + } + + // load lora tensors to backend buffer + for (size_t i = 0; i < adapters.size(); ++i) { + adapters[i]->read_tensor_data(name_lora_a, read_buf); + ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i])); + adapters[i]->read_tensor_data(name_lora_b, read_buf); + ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i])); + } + + // build graph + struct ggml_cgraph * gf; + { + static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); + static std::vector buf(buf_size); + struct ggml_init_params params0 = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx0 = ggml_init(params0); + gf = ggml_new_graph(ctx0); + struct ggml_tensor * cur = inp_base; + for (size_t i = 0; i < adapters.size(); ++i) { + struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))); + struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32)); + // scale + const float alpha = adapters[i]->alpha; + const float rank = (float) inp_b[i]->ne[0]; + const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale; + delta = ggml_scale(ctx0, delta, scale); + cur = ggml_add(ctx0, delta, cur); + printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type)); + printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]); + } + cur = ggml_cast(ctx0, cur, out->type); + printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type)); + ggml_build_forward_expand(gf, cur); + ggml_free(ctx0); + } + + // compute + { + ggml_gallocr_alloc_graph(allocr, gf); + ggml_backend_cpu_set_n_threads(backend, n_threads); + ggml_backend_graph_compute(backend, gf); + } + + // write data to output file + { + auto result = gf->nodes[gf->n_nodes - 1]; + size_t len = ggml_nbytes(result); + if (read_buf.size() < len) { + read_buf.resize(len); + } + ggml_backend_tensor_get(result, read_buf.data(), 0, len); + fout.write((char* )read_buf.data(), len); + zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); + } + + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + } + + ~lora_merge_ctx() { + ggml_gallocr_free(allocr); + ggml_backend_free(backend); + gguf_free(ctx_out); + ggml_free(ctx_out_ggml); } }; -static struct export_lora_params get_default_export_lora_params() { - struct export_lora_params result; - result.fn_model_base = ""; - result.fn_model_out = ""; - result.n_threads = GGML_DEFAULT_N_THREADS; - return result; -} +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); -static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -m FNAME, --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base.c_str()); - fprintf(stderr, " -o FNAME, --model-out FNAME path to save exported model (default '%s')\n", params->fn_model_out.c_str()); - fprintf(stderr, " -l FNAME, --lora FNAME apply LoRA adapter\n"); - fprintf(stderr, " -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params->n_threads); -} - -static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) { - bool invalid_param = false; - std::string arg; - struct export_lora_params default_params = get_default_export_lora_params(); - const std::string arg_prefix = "--"; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - - if (arg == "-m" || arg == "--model-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_model_base = argv[i]; - } else if (arg == "-o" || arg == "--model-out") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_model_out = argv[i]; - } else if (arg == "-l" || arg == "--lora") { - if (++i >= argc) { - invalid_param = true; - break; - } - struct lora_info lora; - lora.filename = argv[i]; - lora.scale = 1.0f; - params->lora.push_back(lora); - } else if (arg == "-s" || arg == "--lora-scaled") { - if (++i >= argc) { - invalid_param = true; - break; - } - struct lora_info lora; - lora.filename = argv[i]; - if (++i >= argc) { - invalid_param = true; - break; - } - lora.scale = std::stof(argv[i]); - params->lora.push_back(lora); - } else if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_threads = std::stoi(argv[i]); - if (params->n_threads <= 0) { - params->n_threads = std::thread::hardware_concurrency(); - } - } else { - fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str()); - export_lora_print_usage(argc, argv, &default_params); - exit(1); - } - } - - if (params->fn_model_base == default_params.fn_model_base) { - fprintf(stderr, "error: please specify a filename for model-base.\n"); - export_lora_print_usage(argc, argv, &default_params); - exit(1); - } - if (params->fn_model_out == default_params.fn_model_out) { - fprintf(stderr, "error: please specify a filename for model-out.\n"); - export_lora_print_usage(argc, argv, &default_params); - exit(1); - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str()); - export_lora_print_usage(argc, argv, &default_params); - exit(1); - } - return true; -} - -static void free_lora(struct lora_data * lora) { - if (lora->ctx != NULL) { - ggml_free(lora->ctx); - } - delete lora; -} - -static struct lora_data * load_lora(struct lora_info * info) { - struct lora_data * result = new struct lora_data; - result->info = *info; - result->ctx = NULL; - result->lora_r = 1; - result->lora_alpha = 1; - - struct llama_file file(info->filename.c_str(), "rb"); - if (file.fp == NULL) { - fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n", - info->filename.c_str()); - free_lora(result); - return NULL; - } - - struct ggml_init_params params_ggml; - params_ggml.mem_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE; - params_ggml.mem_buffer = NULL; - params_ggml.no_alloc = true; - result->ctx = ggml_init(params_ggml); - - uint32_t magic = file.read_u32(); - if (magic != LLAMA_FILE_MAGIC_GGLA) { - die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str()); - } - uint32_t version = file.read_u32(); - if (version != 1) { - die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str()); - } - result->lora_r = file.read_u32(); - result->lora_alpha = file.read_u32(); - // read tensor infos from file - std::vector name_buf; - std::vector tensors; - std::vector tensors_offset; - size_t total_nbytes_pad = 0; - while(!file.eof()) { - int64_t ne[4] = {1,1,1,1}; - uint32_t n_dims = file.read_u32(); - uint32_t namelen = file.read_u32(); - uint32_t type = file.read_u32(); - for (uint32_t k = 0; k < n_dims; ++k) { - ne[k] = (int64_t)file.read_u32(); - } - name_buf.clear(); - name_buf.resize(namelen + 1, '\0'); - file.read_raw(name_buf.data(), namelen); - file.seek((0-file.tell()) & 31, SEEK_CUR); - size_t offset = file.tell(); - struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne); - ggml_set_name(tensor, name_buf.data()); - size_t nbytes = ggml_nbytes(tensor); - size_t nbytes_pad = ggml_nbytes_pad(tensor); - total_nbytes_pad += nbytes_pad; - tensors.push_back(tensor); - tensors_offset.push_back(offset); - file.seek(nbytes, SEEK_CUR); - } - // read tensor data - result->data.resize(total_nbytes_pad); - size_t data_offset = 0; - for (size_t i = 0; i < tensors.size(); ++i) { - struct ggml_tensor * tensor = tensors[i]; - size_t offset = tensors_offset[i]; - size_t nbytes = ggml_nbytes(tensor); - size_t nbytes_pad = ggml_nbytes_pad(tensor); - file.seek(offset, SEEK_SET); - tensor->data = result->data.data() + data_offset; - file.read_raw(tensor->data, nbytes); - data_offset += nbytes_pad; - } - return result; -} - - -static struct ggml_cgraph * build_graph_lora( - struct ggml_context * ctx, - struct ggml_tensor * tensor, - struct ggml_tensor * lora_a, - struct ggml_tensor * lora_b, - float scaling -) { - struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b); - if (scaling != 1.0f) { - ab = ggml_scale(ctx, ab, scaling); - } - struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab); - - struct ggml_cgraph * gf = ggml_new_graph(ctx); - ggml_build_forward_expand (gf, res); - return gf; -} - -static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) { - if (lora->ctx == NULL) { - return false; - } - std::string name = ggml_get_name(tensor); - std::string name_a = name + std::string(".loraA"); - std::string name_b = name + std::string(".loraB"); - struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str()); - struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str()); - if (lora_a == NULL || lora_b == NULL) { - return false; - } - - float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r; - - struct ggml_init_params params; - params.mem_size = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5; - params.mem_buffer = NULL; - params.no_alloc = true; - struct ggml_context * ctx = NULL; - struct ggml_gallocr * alloc = NULL; - struct ggml_cgraph * gf = NULL; - - ctx = ggml_init(params); - alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); - gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); - - ggml_gallocr_alloc_graph(alloc, gf); - - struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads); - static std::vector data_work; - data_work.resize(cplan.work_size); - cplan.work_data = data_work.data(); - - ggml_graph_compute(gf, &cplan); - - ggml_gallocr_free(alloc); - ggml_free(ctx); - return true; -} - -static void export_lora(struct export_lora_params * params) { - // load all loras - std::vector loras; - for (size_t i = 0; i < params->lora.size(); ++i) { - struct lora_data * lora = load_lora(¶ms->lora[i]); - if (lora != NULL) { - loras.push_back(lora); - } - } - if (loras.size() == 0) { - fprintf(stderr, "warning: no lora adapters will be applied.\n"); - } - - // open input file - struct llama_file fin(params->fn_model_base.c_str(), "rb"); - if (!fin.fp) { - die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str()); - } - - // open base model gguf, read tensors without their data - struct ggml_context * ctx_in; - struct gguf_init_params params_gguf; - params_gguf.no_alloc = true; - params_gguf.ctx = &ctx_in; - struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf); - - // create new gguf - struct gguf_context * gguf_out = gguf_init_empty(); - - // copy meta data from base model: kv and tensors - gguf_set_kv(gguf_out, gguf_in); - int n_tensors = gguf_get_n_tensors(gguf_in); - for (int i=0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(gguf_in, i); - struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name); - gguf_add_tensor(gguf_out, tensor); - } - - // create output file - struct llama_file fout(params->fn_model_out.c_str(), "wb"); - if (!fout.fp) { - die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str()); - } - - // write gguf meta data - std::vector meta; - meta.resize(gguf_get_meta_size(gguf_out)); - gguf_get_meta_data(gguf_out, meta.data()); - fout.write_raw(meta.data(), meta.size()); - - std::vector data; - std::vector padding; - for (int i=0; i < n_tensors; ++i) { - const char * name = gguf_get_tensor_name(gguf_in, i); - struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name); - - // read tensor data - data.resize(ggml_nbytes(tensor)); - tensor->data = data.data(); - size_t offset = gguf_get_tensor_offset(gguf_in, i); - fin.seek(offset + meta.size(), SEEK_SET); - fin.read_raw(data.data(), data.size()); - - // apply all loras - for (size_t k = 0; k < loras.size(); ++k) { - apply_lora(tensor, loras[k], params->n_threads); - } - - // write tensor data + padding - padding.clear(); - padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0); - - GGML_ASSERT(fout.tell() == offset + meta.size()); - // fout.seek(offset + meta.size(), SEEK_SET); - fout.write_raw(data.data(), data.size()); - fout.write_raw(padding.data(), padding.size()); - - if (i % 2 == 0) { - printf("."); - } - } + printf("\nexample usage:\n"); + printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]); + printf("\nNOTE: output model is F16\n"); printf("\n"); - - // close gguf - gguf_free(gguf_out); - gguf_free(gguf_in); - - // free loras - for (size_t i = 0; i < loras.size(); ++i) { - free_lora(loras[i]); - } } int main(int argc, char ** argv) { - struct export_lora_params params = get_default_export_lora_params(); + gpt_params params; - if (!export_lora_params_parse(argc, argv, ¶ms)) { + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); return 1; } - export_lora(¶ms); + g_verbose = (params.verbosity == 1); + try { + lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads); + ctx.run_merge(); + } catch (const std::exception & err) { + fprintf(stderr, "%s\n", err.what()); + exit(EXIT_FAILURE); + } + + printf("done, output file is %s\n", params.lora_outfile.c_str()); return 0; } diff --git a/examples/finetune/CMakeLists.txt b/examples/finetune/CMakeLists.txt deleted file mode 100644 index 64afe6ddc..000000000 --- a/examples/finetune/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-finetune) -add_executable(${TARGET} finetune.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/finetune/README.md b/examples/finetune/README.md deleted file mode 100644 index 1c27df053..000000000 --- a/examples/finetune/README.md +++ /dev/null @@ -1,90 +0,0 @@ -# finetune - -Basic usage instructions: - -```bash -# get training data -wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt - -# finetune LORA adapter -./bin/llama-finetune \ - --model-base open-llama-3b-v2-q8_0.gguf \ - --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \ - --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \ - --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \ - --train-data "shakespeare.txt" \ - --save-every 10 \ - --threads 6 --adam-iter 30 --batch 4 --ctx 64 \ - --use-checkpointing - -# predict -./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin -``` - -**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`). -The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output. -So in above example after 10 iterations these files will be written: -- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf -- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf -- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin -- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin - -After 10 more iterations: -- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf -- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf -- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin -- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin - -Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, it will begin finetuning a new randomly initialized adapter. - -llama.cpp compatible LORA adapters will be saved with filename specified by `--lora-out FN`. -These LORA adapters can then be used by `llama-cli` together with the base model, like in the 'predict' example command above. - -In `llama-cli` you can also load multiple LORA adapters, which will then be mixed together. - -For example if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this: - -```bash -./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \ - --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \ - --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin -``` - -You can change how strong each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`. - -For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one: - -```bash -./bin/llama-cli -m open-llama-3b-v2-q8_0.gguf \ - --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \ - --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \ - --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin -``` - -The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values. - -Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime. -If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`. - -The default LORA rank can be specified with `--lora-r N`. -The LORA rank can be configured for each model tensor type separately with these command line options: - -```bash - --lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4) - --rank-att-norm N LORA rank for attention norm tensor (default 1) - --rank-ffn-norm N LORA rank for feed-forward norm tensor (default 1) - --rank-out-norm N LORA rank for output norm tensor (default 1) - --rank-tok-embd N LORA rank for token embeddings tensor (default 4) - --rank-out N LORA rank for output tensor (default 4) - --rank-wq N LORA rank for wq tensor (default 4) - --rank-wk N LORA rank for wk tensor (default 4) - --rank-wv N LORA rank for wv tensor (default 4) - --rank-wo N LORA rank for wo tensor (default 4) - --rank-ffn_gate N LORA rank for ffn_gate tensor (default 4) - --rank-ffn_down N LORA rank for ffn_down tensor (default 4) - --rank-ffn_up N LORA rank for ffn_up tensor (default 4) -``` - -The LORA rank of 'norm' tensors should always be 1. - -To see all available options use `llama-finetune --help`. diff --git a/examples/finetune/convert_finetune_checkpoint_to_gguf.py b/examples/finetune/convert_finetune_checkpoint_to_gguf.py deleted file mode 100644 index 1b79d6995..000000000 --- a/examples/finetune/convert_finetune_checkpoint_to_gguf.py +++ /dev/null @@ -1,487 +0,0 @@ -#!/usr/bin/env python3 -# finetune checkpoint --> gguf conversion - -import argparse -import gguf -import struct -import numpy as np -from pathlib import Path - -# gguf constants -LLM_KV_OPTIMIZER_TYPE = "optimizer.type" -LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" -LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs" -LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version" -LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count" -LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count" -LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count" -LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized" -LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss" -LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss" -LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count" -LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count" -LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k" -LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end" -LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count" - -LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments" -LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments" -LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values" - -LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters" -LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters" -LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients" -LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients" -LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction" -LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s" -LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y" - -LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model" -LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora" -LLM_KV_TRAINING_TYPE = "training.type" -LLM_KV_TRAINING_FILE_VERSION = "training.file_version" -LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count" -LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count" -LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count" - -LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd" -LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm" -LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output" -LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm" -LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q" -LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k" -LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v" -LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output" -LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm" -LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate" -LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down" -LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up" - -class Tensor: - def __init__(self, dtype='f', ne=None): - if ne is None: - ne = [] - self.dtype = dtype - self.ne = ne - self.nbytes = 0 - if self.dtype == 'f': - if len(self.ne) == 0: - self.nbytes = 0 - else: - self.nbytes = int(np.prod(self.ne)) * 4 - else: - raise ValueError(f"Unhandled data type '{self.dtype}'") - - def load(self, data, offset): - nd = struct.unpack(' 0 else []) - - self.lbfgs_x = Tensor('f', [self.nx]) - self.lbfgs_xp = Tensor('f', [self.nx]) - self.lbfgs_g = Tensor('f', [self.nx]) - self.lbfgs_gp = Tensor('f', [self.nx]) - self.lbfgs_d = Tensor('f', [self.nx]) - self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else []) - self.lbfgs_lmal = Tensor('f', [self.lbfgs_m]) - self.lbfgs_lmys = Tensor('f', [self.lbfgs_m]) - self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m]) - self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m]) - - # forgot to save type in version 1: - # guess self.type from number of remaining bytes - size_type_0 = 12 + sum([t.max_storage_size() for t in - [self.adam_m, self.adam_v] - +([self.adam_pf] if (self.past > 0) else [])]) - size_type_1 = 24 + sum([t.max_storage_size() for t in - [self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g, - self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf, - self.lbfgs_lmal, self.lbfgs_lmys, - self.lbfgs_lms, self.lbfgs_lmy] - +([self.lbfgs_pf] if (self.past > 0) else [])]) - # due to alignment padding the size might not by exact - # but the difference in size for both types is significant, - # so we can just use whichever is closest - remaining = len(data) - offset - if abs(remaining - size_type_0) < abs(remaining - size_type_1): - self.type = 0 - else: - self.type = 1 - - if self.type == 0: - offset = self.adam_m.load(data, offset) - offset = self.adam_v.load(data, offset) - offset = self.adam_pf.load(data,offset) - - self.adam_fx_best = struct.unpack(' 0: - self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES) - - elif self.type == 1: - gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS) - gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m) - gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best) - gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step) - gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j) - gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k) - gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end) - gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement) - - self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS) - self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS) - self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS) - self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS) - self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION) - if self.past > 0: - self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES) - self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA) - self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS) - self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S) - self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y) - else: - raise ValueError('Unknown optimizer type') - -class LoraParams: - def __init__(self): - pass - - def load(self, data, offset): - self.n_rank_attention_norm = struct.unpack(' -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -struct my_llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; - uint32_t n_embd = 4096; - uint32_t n_ff = 11008; - uint32_t n_head = 32; - uint32_t n_head_kv = 32; - uint32_t n_layer = 32; - - // float f_norm_eps = 1e-5f; // falcon - float f_norm_rms_eps = 1e-5f; // llama - - float rope_freq_base = 10000.0f; - float rope_freq_scale = 1.0f; - - uint32_t n_gqa() const { - return n_head/n_head_kv; - } - - uint32_t n_embd_head() const { - return n_embd/n_head; - } - - uint32_t n_embd_gqa() const { - return n_embd/n_gqa(); - } - - bool operator!=(const my_llama_hparams& other) const { - return memcmp(this, &other, sizeof(other)); - } -}; - -struct my_llama_layer { - // normalization - struct ggml_tensor * attention_norm; - - // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - - // normalization - struct ggml_tensor * ffn_norm; - - // ff - struct ggml_tensor * ffn_gate; // w1 - struct ggml_tensor * ffn_down; // w2 - struct ggml_tensor * ffn_up; // w3 -}; - -struct my_llama_model { - struct my_llama_hparams hparams; - - struct ggml_tensor * tok_embeddings; - - struct ggml_tensor * norm; - struct ggml_tensor * output; - - std::vector layers; -}; - -struct my_llama_lora_hparams { - uint32_t lora_r = 1; - uint32_t lora_alpha = 1; - uint32_t n_rank_attention_norm = 1; - uint32_t n_rank_wq = 4; - uint32_t n_rank_wk = 4; - uint32_t n_rank_wv = 4; - uint32_t n_rank_wo = 4; - uint32_t n_rank_ffn_norm = 1; - uint32_t n_rank_ffn_gate = 4; - uint32_t n_rank_ffn_down = 4; - uint32_t n_rank_ffn_up = 4; - uint32_t n_rank_tok_embeddings = 4; - uint32_t n_rank_norm = 1; - uint32_t n_rank_output = 4; - - bool operator!=(const my_llama_lora_hparams& other) const { - return memcmp(this, &other, sizeof(other)); - } -}; - -struct my_llama_lora_layer { - // normalization - struct ggml_tensor * attention_norm_a; - struct ggml_tensor * attention_norm_b; - - // attention - struct ggml_tensor * wq_a; - struct ggml_tensor * wq_b; - struct ggml_tensor * wk_a; - struct ggml_tensor * wk_b; - struct ggml_tensor * wv_a; - struct ggml_tensor * wv_b; - struct ggml_tensor * wo_a; - struct ggml_tensor * wo_b; - - // normalization - struct ggml_tensor * ffn_norm_a; - struct ggml_tensor * ffn_norm_b; - - // ff - struct ggml_tensor * ffn_gate_a; - struct ggml_tensor * ffn_gate_b; - struct ggml_tensor * ffn_down_a; - struct ggml_tensor * ffn_down_b; - struct ggml_tensor * ffn_up_a; - struct ggml_tensor * ffn_up_b; -}; - -struct my_llama_lora { - struct ggml_context * ctx = NULL; - ggml_backend_buffer_t data; - - my_llama_lora_hparams hparams; - - struct ggml_tensor * tok_embeddings_a; - struct ggml_tensor * tok_embeddings_b; - - struct ggml_tensor * norm_a; - struct ggml_tensor * norm_b; - struct ggml_tensor * output_a; - struct ggml_tensor * output_b; - - std::vector layers; -}; - -// gguf constants -static const char * LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"; -static const char * LLM_KV_TRAINING_TYPE = "training.type"; - -static const char * LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"; -static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"; -static const char * LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"; -static const char * LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"; -static const char * LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"; -static const char * LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"; -static const char * LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"; -static const char * LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"; - -// gguf constants (sync with gguf.py) - -static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; -static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; - -static const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length"; -static const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length"; -static const char * LLM_KV_BLOCK_COUNT = "%s.block_count"; -static const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length"; -static const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count"; -static const char * LLM_KV_ATTENTION_HEAD_COUNT_KV = "%s.attention.head_count_kv"; -static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon"; -static const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count"; -static const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp -static const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear"; - -static const char * LLM_TENSOR_TOKEN_EMBD = "token_embd"; -static const char * LLM_TENSOR_OUTPUT_NORM = "output_norm"; -static const char * LLM_TENSOR_OUTPUT = "output"; -static const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm"; -static const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q"; -static const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k"; -static const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v"; -static const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output"; -static const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm"; -static const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate"; -static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down"; -static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up"; - -static void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab : %u\n", __func__, params->n_vocab); - printf("%s: n_ctx : %u\n", __func__, params->n_ctx); - printf("%s: n_embd : %u\n", __func__, params->n_embd); - printf("%s: n_ff : %u\n", __func__, params->n_ff); - printf("%s: n_head : %u\n", __func__, params->n_head); - printf("%s: n_head_kv : %u\n", __func__, params->n_head_kv); - printf("%s: n_layer : %u\n", __func__, params->n_layer); - printf("%s: norm_rms_eps : %f\n", __func__, params->f_norm_rms_eps); - printf("%s: rope_freq_base : %f\n", __func__, params->rope_freq_base); - printf("%s: rope_freq_scale : %f\n", __func__, params->rope_freq_scale); -} - -static void print_lora_params(struct my_llama_lora_hparams * params) { - printf("%s: n_rank_attention_norm : %u\n", __func__, params->n_rank_attention_norm); - printf("%s: n_rank_wq : %u\n", __func__, params->n_rank_wq); - printf("%s: n_rank_wk : %u\n", __func__, params->n_rank_wk); - printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv); - printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo); - printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm); - printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate); - printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down); - printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up); - printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings); - printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm); - printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); -} - -#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ -{ \ - const std::string skey(key); \ - const int kid = gguf_find_key(ctx, skey.c_str()); \ - if (kid >= 0) { \ - enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \ - if (ktype != (type)) { \ - die_fmt("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype)); \ - } \ - (dst) = func(ctx, kid); \ - } else if (req) { \ - die_fmt("key not found in model: %s", skey.c_str()); \ - } \ -} - -static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_hparams * hparams, const char * expected_arch) { - std::string arch; - - GGUF_GET_KEY(ctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); - if (expected_arch != NULL) { - if (arch != expected_arch) { - printf("%s: arch=%s expected_arch=%s\n", __func__, arch.c_str(), expected_arch); - } - GGML_ASSERT(arch == expected_arch); - } - - std::vector keybuf; - keybuf.resize(512); - auto kv = [&arch, &keybuf](const char * key) -> const char * { - snprintf(keybuf.data(), keybuf.size(), key, arch.c_str()); - return keybuf.data(); - }; - - GGUF_GET_KEY(ctx, hparams->n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH)); - GGUF_GET_KEY(ctx, hparams->n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH)); - GGUF_GET_KEY(ctx, hparams->n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH)); - GGUF_GET_KEY(ctx, hparams->n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT)); - GGUF_GET_KEY(ctx, hparams->n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT)); - - // n_head_kv is optional, default to n_head - hparams->n_head_kv = hparams->n_head; - GGUF_GET_KEY(ctx, hparams->n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV)); - - float rope_freq_scale = 1.0f; - GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); - GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE)); - GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR)); - if (rope_freq_scale != 1.0f) { - hparams->rope_freq_scale = 1.0f / rope_freq_scale; - } -} - -static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) { - auto & hparams = model->hparams; - - std::vector tn_buf; - tn_buf.resize(GGML_MAX_NAME); - auto tn = [&tn_buf](const char * key) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", key); - return tn_buf.data(); - }; - auto tni = [&tn_buf](const char * key, int bid) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), key, bid); - std::string s = tn_buf.data(); - snprintf(tn_buf.data(), tn_buf.size(), "%s.weight", s.c_str()); - return tn_buf.data(); - }; - - - // get parameters directly from gguf file - { - struct gguf_init_params params = { - /*.no_alloc = */ false, - /*.ctx = */ NULL, - }; - struct gguf_context * mctx = gguf_init_from_file(fn_model, params); - - load_model_hparams_gguf(mctx, &hparams, "llama"); - - gguf_free(mctx); - } - hparams.n_vocab = llama_n_vocab(input); - hparams.n_ctx = n_ctx; - - // get tensors from llama_model (possibly mmapped) - model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD)); - model->norm = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT_NORM)); - model->output = llama_get_model_tensor(input, tn(LLM_TENSOR_OUTPUT)); - - assert_shape_2d(model->tok_embeddings, hparams.n_embd, hparams.n_vocab); - assert_shape_1d(model->norm, hparams.n_embd); - assert_shape_2d(model->output, hparams.n_embd, hparams.n_vocab); - - model->layers.resize(hparams.n_layer); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - auto & layer = model->layers[i]; - - layer.attention_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_NORM, i)); - layer.wq = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_Q, i)); - layer.wk = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_K, i)); - layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i)); - layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i)); - layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i)); - layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); - layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); - layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); - - assert_shape_1d(layer.attention_norm, hparams.n_embd); - assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd); - assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd_gqa()); - assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa()); - assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd); - assert_shape_1d(layer.ffn_norm, hparams.n_embd); - assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff); - assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd); - assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff); - } -} - -static void set_param_lora(struct my_llama_lora * lora) { - const uint32_t n_layer = lora->layers.size(); - - struct ggml_context* ctx = lora->ctx; - - ggml_set_param(ctx, lora->tok_embeddings_a); - ggml_set_param(ctx, lora->tok_embeddings_b); - ggml_set_param(ctx, lora->norm_a); - ggml_set_param(ctx, lora->norm_b); - ggml_set_param(ctx, lora->output_a); - ggml_set_param(ctx, lora->output_b); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = lora->layers[i]; - - ggml_set_param(ctx, layer.attention_norm_a); - ggml_set_param(ctx, layer.attention_norm_b); - ggml_set_param(ctx, layer.wq_a); - ggml_set_param(ctx, layer.wq_b); - ggml_set_param(ctx, layer.wk_a); - ggml_set_param(ctx, layer.wk_b); - ggml_set_param(ctx, layer.wv_a); - ggml_set_param(ctx, layer.wv_b); - ggml_set_param(ctx, layer.wo_a); - ggml_set_param(ctx, layer.wo_b); - ggml_set_param(ctx, layer.ffn_norm_a); - ggml_set_param(ctx, layer.ffn_norm_b); - ggml_set_param(ctx, layer.ffn_gate_a); - ggml_set_param(ctx, layer.ffn_gate_b); - ggml_set_param(ctx, layer.ffn_down_a); - ggml_set_param(ctx, layer.ffn_down_b); - ggml_set_param(ctx, layer.ffn_up_a); - ggml_set_param(ctx, layer.ffn_up_b); - } -} - -static void init_lora(const struct my_llama_model * model, struct my_llama_lora * lora) { - const auto & lparams = lora->hparams; - - const uint32_t n_embd = model->hparams.n_embd; - const uint32_t n_embd_gqa = model->hparams.n_embd_gqa(); - const uint32_t n_layer = model->hparams.n_layer; - const uint32_t n_vocab = model->hparams.n_vocab; - const uint32_t n_ff = model->hparams.n_ff; - - std::vector tn_buf; - tn_buf.resize(GGML_MAX_NAME); - auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix); - return tn_buf.data(); - }; - auto tni = [&tn_buf](const char * key, const char * suffix, int bid) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), key, bid); - std::string s = tn_buf.data(); - snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix); - return tn_buf.data(); - }; - - // context for lora tensors without their data - struct ggml_init_params ctx_lora_params; - ctx_lora_params.mem_size = ggml_tensor_overhead()*2*(6 + n_layer*18); - ctx_lora_params.mem_buffer = NULL; - ctx_lora_params.no_alloc = true; - - struct ggml_context * ctx = ggml_init(ctx_lora_params); - lora->ctx = ctx; - - lora->tok_embeddings_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_embd); - lora->tok_embeddings_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_tok_embeddings, n_vocab); - lora->norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, n_embd); - lora->norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_norm, 1); - lora->output_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_embd); - lora->output_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_output, n_vocab); - - ggml_set_name(lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_a")); - ggml_set_name(lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.lora_b")); - ggml_set_name(lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_a")); - ggml_set_name(lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.lora_b")); - ggml_set_name(lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.lora_a")); - ggml_set_name(lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.lora_b")); - - lora->layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = lora->layers[i]; - - layer.attention_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, n_embd); - layer.attention_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_attention_norm, 1); - - layer.wq_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); - layer.wq_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wq, n_embd); - layer.wk_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd); - layer.wk_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wk, n_embd_gqa); - layer.wv_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd); - layer.wv_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wv, n_embd_gqa); - layer.wo_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); - layer.wo_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_wo, n_embd); - - layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd); - layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1); - - layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd); - layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff); - layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff); - layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd); - layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd); - layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff); - - ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i)); - ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i)); - ggml_set_name(layer.wq_a, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_a", i)); - ggml_set_name(layer.wq_b, tni(LLM_TENSOR_ATTN_Q, ".weight.lora_b", i)); - ggml_set_name(layer.wk_a, tni(LLM_TENSOR_ATTN_K, ".weight.lora_a", i)); - ggml_set_name(layer.wk_b, tni(LLM_TENSOR_ATTN_K, ".weight.lora_b", i)); - ggml_set_name(layer.wv_a, tni(LLM_TENSOR_ATTN_V, ".weight.lora_a", i)); - ggml_set_name(layer.wv_b, tni(LLM_TENSOR_ATTN_V, ".weight.lora_b", i)); - ggml_set_name(layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_a", i)); - ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i)); - ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i)); - ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i)); - ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); - ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); - ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); - ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); - ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); - ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); - } - - set_param_lora(lora); - - // allocate data for lora tensors - lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); -} - -static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { - const uint32_t n_layer = lora->layers.size(); - - struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max); - - randomize_tensor_normal(lora->tok_embeddings_a, rnd); - ggml_set_zero(lora->tok_embeddings_b); - randomize_tensor_normal(lora->norm_a, rnd); - ggml_set_zero(lora->norm_b); - randomize_tensor_normal(lora->output_a, rnd); - ggml_set_zero(lora->output_b); - - for (uint32_t i = 0; i < n_layer; ++i) { - auto & layer = lora->layers[i]; - randomize_tensor_normal(layer.attention_norm_a, rnd); - ggml_set_zero(layer.attention_norm_b); - - randomize_tensor_normal(layer.wq_a, rnd); - ggml_set_zero(layer.wq_b); - randomize_tensor_normal(layer.wk_a, rnd); - ggml_set_zero(layer.wk_b); - randomize_tensor_normal(layer.wv_a, rnd); - ggml_set_zero(layer.wv_b); - randomize_tensor_normal(layer.wo_a, rnd); - ggml_set_zero(layer.wo_b); - - randomize_tensor_normal(layer.ffn_norm_a, rnd); - ggml_set_zero(layer.ffn_norm_b); - - randomize_tensor_normal(layer.ffn_gate_a, rnd); - ggml_set_zero(layer.ffn_gate_b); - randomize_tensor_normal(layer.ffn_down_a, rnd); - ggml_set_zero(layer.ffn_down_b); - randomize_tensor_normal(layer.ffn_up_a, rnd); - ggml_set_zero(layer.ffn_up_b); - } - - free_random_normal_distribution(rnd); -} - -static struct ggml_tensor * llama_build_lora_finetune_graphs( - struct my_llama_model * model, - struct my_llama_lora * lora, - ggml_gallocr_t alloc, - struct ggml_context * ctx, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - struct ggml_cgraph * gb_tmp, - struct ggml_tensor * * logits, - struct ggml_tensor * tokens_input, - struct ggml_tensor * targets, - const int n_tokens, - const int n_batch, - const bool enable_flash_attn, - const bool enable_checkpointing, - const bool measure_only) { - - ggml_set_scratch(ctx, { 0, 0, nullptr, }); - const int n_past = 0; - const int N = n_tokens; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_head_kv = hparams.n_head_kv; - const int n_ff = hparams.n_ff; - const int n_rot = hparams.n_embd_head(); - const int n_embd_head = hparams.n_embd_head(); - const int n_embd_gqa = hparams.n_embd_gqa(); - - const float rms_norm_eps = hparams.f_norm_rms_eps; - const float rope_freq_base = hparams.rope_freq_base; - const float rope_freq_scale = hparams.rope_freq_scale; - - GGML_ASSERT((size_t) n_layer == lora->layers.size()); - - auto set_name = [](struct ggml_tensor * t, const char * n) { - ggml_set_name(t, n); - if (t->grad) { - ggml_format_name(t->grad, "%s->grad", n); - } - }; - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); - ggml_set_input(KQ_pos); - - // rope has so much parameters that we make a custom function for it - auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] - (struct ggml_tensor * t) -> struct ggml_tensor * { - // not capturing these, to silcence warnings - const int rope_mode = 0; - - return ggml_rope_ext(ctx, - t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, - rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f - ); - }; - - set_name(tokens_input, "tokens_input"); - set_name(targets, "targets"); - - GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); - - auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) { - return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); - } else if (a->type == GGML_TYPE_F32) { - return ggml_add(ctx, a, b); - } else { - die_fmt("%s: Finetuning on tensors with type '%s' is not yet supported.\n", - __func__, ggml_type_name(a->type)); - } - }; - - struct ggml_tensor * tok_embeddings = add_to_f32(ctx, model->tok_embeddings, ggml_mul_mat(ctx, lora->tok_embeddings_a, lora->tok_embeddings_b)); - struct ggml_tensor * norm = add_to_f32(ctx, model->norm, ggml_mul_mat(ctx, lora->norm_a, lora->norm_b)); - struct ggml_tensor * output = add_to_f32(ctx, model->output, ggml_mul_mat(ctx, lora->output_a, lora->output_b)); - - struct ggml_tensor * t00 = ggml_reshape_1d(ctx, tokens_input, N*n_batch); set_name(t00, "t00"); assert_shape_1d(t00, N*n_batch); - struct ggml_tensor * t01 = ggml_get_rows(ctx, tok_embeddings, t00); set_name(t01, "t01"); assert_shape_2d(t01, n_embd, N*n_batch); - - struct ggml_tensor * cur = t01; - - std::vector checkpoints; - if (enable_checkpointing) { - checkpoints.push_back(tokens_input); - checkpoints.push_back(targets); - checkpoints.push_back(t00); - checkpoints.push_back(t01); - } - - const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head); - - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - struct my_llama_lora_layer & llayer = lora->layers[il]; - - struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b)); - struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b)); - struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); - struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); - struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); - struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); - struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b)); - struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b)); - struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b)); - - struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); - struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); - struct ggml_tensor * t04 = ggml_mul (ctx, t03, t02); set_name(t04, "t04"); assert_shape_2d(t04, n_embd, N*n_batch); - struct ggml_tensor * t05 = ggml_mul_mat (ctx, wq, t04); set_name(t05, "t05"); assert_shape_2d(t05, n_embd, N*n_batch); - struct ggml_tensor * t06 = ggml_reshape_4d (ctx, t05, n_embd_head, n_head, N, n_batch); set_name(t06, "t06"); assert_shape_4d(t06, n_embd_head, n_head, N, n_batch); - struct ggml_tensor * t07 = rope (t06); set_name(t07, "t07"); assert_shape_4d(t07, n_embd_head, n_head, N, n_batch); - struct ggml_tensor * t08 = ggml_mul_mat (ctx, wk, t04); set_name(t08, "t08"); assert_shape_2d(t08, n_embd_gqa, N*n_batch); - struct ggml_tensor * t09 = ggml_reshape_4d (ctx, t08, n_embd_head, n_head_kv, N, n_batch); set_name(t09, "t09"); assert_shape_4d(t09, n_embd_head, n_head_kv, N, n_batch); - struct ggml_tensor * t10 = rope (t09); set_name(t10, "t10"); assert_shape_4d(t10, n_embd_head, n_head_kv, N, n_batch); - - struct ggml_tensor * t11; - if (ggml_is_quantized(wv->type)) { - struct ggml_tensor * t11_1 = ggml_mul_mat (ctx, wv, t04); set_name(t11_1, "t11_1"); assert_shape_2d(t11_1, n_embd_gqa, N*n_batch); - struct ggml_tensor * t11_2 = ggml_transpose(ctx, t11_1); set_name(t11_2, "t11_2"); assert_shape_2d(t11_2, N*n_batch, n_embd_gqa); - t11 = ggml_cont (ctx, t11_2); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa); - } else { - t11 = ggml_mul_mat (ctx, t04, wv); set_name(t11, "t11"); assert_shape_2d(t11, N*n_batch, n_embd_gqa); - } - - struct ggml_tensor * t12 = ggml_reshape_4d (ctx, t11, N, n_batch, n_embd_head, n_head_kv); set_name(t12, "t12"); assert_shape_4d(t12, N, n_batch, n_embd_head, n_head_kv); - struct ggml_tensor * t13 = ggml_permute (ctx, t07, 0, 2, 1, 3); set_name(t13, "t13"); assert_shape_4d(t13, n_embd_head, N, n_head, n_batch); - struct ggml_tensor * t14 = ggml_permute (ctx, t10, 0, 2, 1, 3); set_name(t14, "t14"); assert_shape_4d(t14, n_embd_head, N, n_head_kv, n_batch); - struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch); - struct ggml_tensor * t16; - if (enable_flash_attn) { - GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported"); - //t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); - } else { - struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); - struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); - struct ggml_tensor * t16_2 = ggml_diag_mask_inf_inplace(ctx, t16_1, n_past); set_name(t16_2, "t16_2"); assert_shape_4d(t16_2, N, N, n_head, n_batch); - struct ggml_tensor * t16_3 = ggml_soft_max_inplace (ctx, t16_2); set_name(t16_3, "t16_3"); assert_shape_4d(t16_3, N, N, n_head, n_batch); - t16 = ggml_mul_mat(ctx, t15, t16_3); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); - } - struct ggml_tensor * t17 = ggml_permute (ctx, t16, 0, 2, 1, 3); set_name(t17, "t17"); assert_shape_4d(t17, n_embd_head, n_head, N, n_batch); - struct ggml_tensor * t18 = ggml_cont (ctx, t17); set_name(t18, "t18"); assert_shape_4d(t18, n_embd_head, n_head, N, n_batch); - struct ggml_tensor * t19 = ggml_reshape_2d (ctx, t18, n_embd, N*n_batch); set_name(t19, "t19"); assert_shape_2d(t19, n_embd, N*n_batch); - struct ggml_tensor * t20 = ggml_mul_mat (ctx, wo, t19); set_name(t20, "t20"); assert_shape_2d(t20, n_embd, N*n_batch); - struct ggml_tensor * t21 = ggml_add (ctx, t20, cur); set_name(t21, "t21"); assert_shape_2d(t21, n_embd, N*n_batch); - struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); - struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); - struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); - struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); - struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); - struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); - struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); - struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); - struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); - cur = t30; - if (enable_checkpointing) { - checkpoints.push_back(cur); - } - } - struct ggml_tensor * t31 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t31, "t31"); assert_shape_2d(t31, n_embd, N*n_batch); - struct ggml_tensor * t32 = ggml_repeat (ctx, norm, t31); set_name(t32, "t32"); assert_shape_2d(t32, n_embd, N*n_batch); - struct ggml_tensor * t33 = ggml_mul (ctx, t32, t31); set_name(t33, "t33"); assert_shape_2d(t33, n_embd, N*n_batch); - struct ggml_tensor * t34 = ggml_mul_mat (ctx, output, t33); set_name(t34, "t34"); assert_shape_2d(t34, n_vocab, N*n_batch); - struct ggml_tensor * t35 = ggml_reshape_3d (ctx, t34, n_vocab, N, n_batch); set_name(t35, "t35"); assert_shape_3d(t35, n_vocab, N, n_batch); - struct ggml_tensor * t36 = ggml_cross_entropy_loss(ctx, t35, targets); set_name(t36, "t36"); assert_shape_1d(t36, 1); - - if (enable_checkpointing) { - checkpoints.push_back(t31); - checkpoints.push_back(t32); - checkpoints.push_back(t33); - checkpoints.push_back(t34); - checkpoints.push_back(t35); - checkpoints.push_back(t36); - } - - ggml_build_forward_expand(gf, t36); - - if (enable_checkpointing) { - ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size()); - } else { - ggml_graph_cpy(gf, gb); - ggml_build_backward_expand(ctx, gf, gb, true); - } - - GGML_ASSERT(alloc != NULL); - - // make sure some tensors are not reallocated by inserting new temporary nodes depending on them - int n_leafs_before = gb->n_leafs; - int n_nodes_before = gb->n_nodes; - - // output tensors - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f)); - // input gradient - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f)); - GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); - ggml_set_input(t36->grad); - // KQ_pos - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); - - // make sure base model tensors data cannot be used in viewable operations - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f)); - for (int il = 0; il < n_layer; ++il) { - struct my_llama_layer & layer = model->layers[il]; - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f)); - } - - // allocating checkpoints in one block to reduce memory fragmentation - // note: they will be freed in reverse order - for (unsigned int i = 0; i < checkpoints.size(); ++i) { - if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { - ggml_set_input(checkpoints[i]); - } - } - - if (measure_only) { - ggml_gallocr_reserve(alloc, gb); - } else { - ggml_gallocr_alloc_graph(alloc, gb); - - // set KQ_pos - { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; - } - } - } - - // remove the additional nodes and leafs - for (int i = n_leafs_before; i < gb->n_leafs; ++i) { - gb->leafs[i] = NULL; - } - for (int i = n_nodes_before; i < gb->n_nodes; ++i) { - gb->nodes[i] = NULL; - } - gb->n_leafs = n_leafs_before; - gb->n_nodes = n_nodes_before; - - *logits = t35; - return t36; -} - -static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora) { - // NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read - - std::string arch; - - std::vector keybuf; - keybuf.resize(512); - - GGUF_GET_KEY(fctx, arch, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_GENERAL_ARCHITECTURE); - GGML_ASSERT(arch == "llama"); - - uint32_t ftype_u; - GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE); - GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32); - - struct my_llama_hparams hparams; - load_model_hparams_gguf(fctx, &hparams, arch.c_str()); - - // parameters that define tensor shapes must match - GGML_ASSERT(hparams.n_embd == model->hparams.n_embd); - GGML_ASSERT(hparams.n_ff == model->hparams.n_ff); - GGML_ASSERT(hparams.n_head == model->hparams.n_head); - GGML_ASSERT(hparams.n_head_kv == model->hparams.n_head_kv); - GGML_ASSERT(hparams.n_layer == model->hparams.n_layer); - - GGUF_GET_KEY(fctx, lora->hparams.n_rank_tok_embeddings, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_output, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_OUTPUT); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_attention_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_wq, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_Q); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_wk, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_K); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); - - init_lora(model, lora); - - copy_tensor_by_name(lora->tok_embeddings_a, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_a)); - copy_tensor_by_name(lora->tok_embeddings_b, f_ggml_ctx, ggml_get_name(lora->tok_embeddings_b)); - copy_tensor_by_name(lora->norm_a, f_ggml_ctx, ggml_get_name(lora->norm_a)); - copy_tensor_by_name(lora->norm_b, f_ggml_ctx, ggml_get_name(lora->norm_b)); - copy_tensor_by_name(lora->output_a, f_ggml_ctx, ggml_get_name(lora->output_a)); - copy_tensor_by_name(lora->output_b, f_ggml_ctx, ggml_get_name(lora->output_b)); - - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - copy_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a)); - copy_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b)); - copy_tensor_by_name(layer.wq_a, f_ggml_ctx, ggml_get_name(layer.wq_a)); - copy_tensor_by_name(layer.wq_b, f_ggml_ctx, ggml_get_name(layer.wq_b)); - copy_tensor_by_name(layer.wk_a, f_ggml_ctx, ggml_get_name(layer.wk_a)); - copy_tensor_by_name(layer.wk_b, f_ggml_ctx, ggml_get_name(layer.wk_b)); - copy_tensor_by_name(layer.wv_a, f_ggml_ctx, ggml_get_name(layer.wv_a)); - copy_tensor_by_name(layer.wv_b, f_ggml_ctx, ggml_get_name(layer.wv_b)); - copy_tensor_by_name(layer.wo_a, f_ggml_ctx, ggml_get_name(layer.wo_a)); - copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); - copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); - copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); - copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a)); - copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b)); - copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a)); - copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b)); - copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a)); - copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b)); - } -} - -static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora) { - const char * arch = "llama"; - enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; - - std::vector keybuf; - keybuf.resize(512); - auto kv = [arch, &keybuf](const char * key) -> const char * { - snprintf(keybuf.data(), keybuf.size(), key, arch); - return keybuf.data(); - }; - - gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); - gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); - - gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx); - gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd); - gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff); - gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head); - gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV), model->hparams.n_head_kv); - gguf_set_val_u32(fctx, kv(LLM_KV_BLOCK_COUNT), model->hparams.n_layer); - gguf_set_val_u32(fctx, kv(LLM_KV_ROPE_DIMENSION_COUNT), model->hparams.n_embd_head()); - gguf_set_val_f32(fctx, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS), model->hparams.f_norm_rms_eps); - gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_FREQ_BASE), model->hparams.rope_freq_base); - gguf_set_val_f32(fctx, kv(LLM_KV_ROPE_SCALE_LINEAR), model->hparams.rope_freq_scale); - - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, lora->hparams.n_rank_tok_embeddings); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, lora->hparams.n_rank_norm); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_OUTPUT, lora->hparams.n_rank_output); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, lora->hparams.n_rank_attention_norm); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_Q, lora->hparams.n_rank_wq); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_K, lora->hparams.n_rank_wk); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up); - - gguf_add_tensor(fctx, lora->tok_embeddings_a); - gguf_add_tensor(fctx, lora->tok_embeddings_b); - gguf_add_tensor(fctx, lora->norm_a); - gguf_add_tensor(fctx, lora->norm_b); - gguf_add_tensor(fctx, lora->output_a); - gguf_add_tensor(fctx, lora->output_b); - - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - - gguf_add_tensor(fctx, layer.attention_norm_a); - gguf_add_tensor(fctx, layer.attention_norm_b); - gguf_add_tensor(fctx, layer.wq_a); - gguf_add_tensor(fctx, layer.wq_b); - gguf_add_tensor(fctx, layer.wk_a); - gguf_add_tensor(fctx, layer.wk_b); - gguf_add_tensor(fctx, layer.wv_a); - gguf_add_tensor(fctx, layer.wv_b); - gguf_add_tensor(fctx, layer.wo_a); - gguf_add_tensor(fctx, layer.wo_b); - gguf_add_tensor(fctx, layer.ffn_norm_a); - gguf_add_tensor(fctx, layer.ffn_norm_b); - gguf_add_tensor(fctx, layer.ffn_gate_a); - gguf_add_tensor(fctx, layer.ffn_gate_b); - gguf_add_tensor(fctx, layer.ffn_down_a); - gguf_add_tensor(fctx, layer.ffn_down_b); - gguf_add_tensor(fctx, layer.ffn_up_a); - gguf_add_tensor(fctx, layer.ffn_up_b); - } -} - -static void load_checkpoint_lora_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { - std::string train_type = LLM_KV_TRAINING_TYPE_FINETUNE_LORA; - GGUF_GET_KEY(fctx, train_type, gguf_get_val_str, GGUF_TYPE_STRING, false, LLM_KV_TRAINING_TYPE); - GGML_ASSERT(train_type == LLM_KV_TRAINING_TYPE_FINETUNE_LORA); - - load_train_state_gguf(fctx, f_ggml_ctx, train); - load_llama_lora_gguf(fctx, f_ggml_ctx, model, lora); -} - -static void save_checkpoint_lora_gguf(struct gguf_context * fctx, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { - gguf_set_val_str(fctx, LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA); - save_llama_lora_gguf(fctx, model, lora); - save_train_state_gguf(fctx, train); -} - -static bool load_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { - struct ggml_context * f_ggml_ctx; - struct gguf_init_params params; - params.no_alloc = false; - params.ctx = &f_ggml_ctx; - struct gguf_context * fctx = gguf_init_from_file(filename, params); - if (fctx == NULL) { - return false; - } - - load_checkpoint_lora_gguf(fctx, f_ggml_ctx, model, lora, train); - - gguf_free(fctx); - return true; -} - -static void save_checkpoint_lora_file(const char * filename, struct my_llama_model * model, struct my_llama_lora * lora, struct train_state * train) { - printf("%s: saving to %s\n", __func__, filename); - struct gguf_context * fctx = gguf_init_empty(); - - save_checkpoint_lora_gguf(fctx, model, lora, train); - - // write file - const bool only_meta = false; - gguf_write_to_file(fctx, filename, only_meta); - gguf_free(fctx); -} - -struct llama_file { - // use FILE * so we don't have to re-open the file to mmap - FILE * fp; - size_t size; - - llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); - if (fp == NULL) { - size = 0; - } else { - seek(0, SEEK_END); - size = tell(); - seek(0, SEEK_SET); - } - } - - size_t tell() const { -#ifdef _WIN32 - __int64 ret = _ftelli64(fp); -#else - long ret = std::ftell(fp); -#endif - GGML_ASSERT(ret != -1); // this really shouldn't fail - return (size_t) ret; - } - - void seek(size_t offset, int whence) { -#ifdef _WIN32 - int ret = _fseeki64(fp, (__int64) offset, whence); -#else - int ret = std::fseek(fp, (long) offset, whence); -#endif - GGML_ASSERT(ret == 0); // same - } - - void read_raw(void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); - if (ferror(fp)) { - die_fmt("read error: %s", strerror(errno)); - } - if (ret != 1) { - die("unexpectedly reached end of file"); - } - } - - std::uint32_t read_u32() { - std::uint32_t ret; - read_raw(&ret, sizeof(ret)); - return ret; - } - - std::string read_string(std::uint32_t len) { - std::vector chars(len); - read_raw(chars.data(), len); - return std::string(chars.data(), len); - } - - void write_raw(const void * ptr, size_t size) { - if (size == 0) { - return; - } - errno = 0; - size_t ret = std::fwrite(ptr, size, 1, fp); - if (ret != 1) { - die_fmt("write error: %s", strerror(errno)); - } - } - - void write_u32(std::uint32_t val) { - write_raw(&val, sizeof(val)); - } - - ~llama_file() { - if (fp) { - std::fclose(fp); - } - } -}; - -static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, const char * name) { - if (tensor == NULL) { - file->write_u32(0); - file->write_u32(0); - file->write_u32(GGML_TYPE_F32); - file->seek((0-file->tell()) & 31, SEEK_CUR); - return; - } - if (name == NULL) { - name = ggml_get_name(tensor); - } - uint32_t name_len = strlen(name); - uint32_t nd = ggml_n_dims(tensor); - uint32_t ne[4] = { (uint32_t)tensor->ne[0], - (uint32_t)tensor->ne[1], - (uint32_t)tensor->ne[2], - (uint32_t)tensor->ne[3] }; - file->write_u32(nd); - file->write_u32(name_len); - file->write_u32(tensor->type); - file->write_raw(ne, sizeof(ne[0]) * nd); - file->write_raw(name, name_len); - file->seek((0-file->tell()) & 31, SEEK_CUR); - file->write_raw(tensor->data, ggml_nbytes(tensor)); -} - -static void save_as_llama_lora(const char * filename, struct my_llama_lora * lora) { - printf("%s: saving to %s\n", __func__, filename); - struct llama_file file(filename, "wb"); - if (file.fp == NULL) { - return; - } - - std::vector tn_buf; - tn_buf.resize(GGML_MAX_NAME); - - auto tn = [&tn_buf](const char * key, const char * suffix) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), "%s%s", key, suffix); - return tn_buf.data(); - }; - - auto tni = [&tn_buf](const char * key, int bid, const char * suffix) -> const char * { - snprintf(tn_buf.data(), tn_buf.size(), key, bid); - std::string s = tn_buf.data(); - snprintf(tn_buf.data(), tn_buf.size(), "%s%s", s.c_str(), suffix); - return tn_buf.data(); - }; - - // write_magic - file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic - file.write_u32(1); // version - // write_hparams - file.write_u32(lora->hparams.lora_r); - file.write_u32(lora->hparams.lora_alpha); - // write tensors - write_tensor(&file, lora->tok_embeddings_a, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraA")); - write_tensor(&file, lora->tok_embeddings_b, tn(LLM_TENSOR_TOKEN_EMBD, ".weight.loraB")); - write_tensor(&file, lora->norm_a, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraA")); - write_tensor(&file, lora->norm_b, tn(LLM_TENSOR_OUTPUT_NORM, ".weight.loraB")); - write_tensor(&file, lora->output_a, tn(LLM_TENSOR_OUTPUT, ".weight.loraA")); - write_tensor(&file, lora->output_b, tn(LLM_TENSOR_OUTPUT, ".weight.loraB")); - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - write_tensor(&file, layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraA")); - write_tensor(&file, layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, i, ".weight.loraB")); - write_tensor(&file, layer.wq_a, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraA")); - write_tensor(&file, layer.wq_b, tni(LLM_TENSOR_ATTN_Q, i, ".weight.loraB")); - write_tensor(&file, layer.wk_a, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraA")); - write_tensor(&file, layer.wk_b, tni(LLM_TENSOR_ATTN_K, i, ".weight.loraB")); - write_tensor(&file, layer.wv_a, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraA")); - write_tensor(&file, layer.wv_b, tni(LLM_TENSOR_ATTN_V, i, ".weight.loraB")); - write_tensor(&file, layer.wo_a, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraA")); - write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB")); - write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA")); - write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB")); - write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); - write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); - write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); - write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); - write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); - write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); - } -} - -struct train_params { - struct train_params_common common; - - const char * fn_model_base; - const char * fn_lora_out; - - bool only_write_lora; - - float f_norm_rms_eps; - float rope_freq_base; - float rope_freq_scale; - - bool custom_f_norm_rms_eps; - bool custom_rope_freq_base; - bool custom_rope_freq_scale; - - int32_t lora_r; - int32_t lora_alpha; - bool custom_lora_alpha; - - uint32_t n_rank_attention_norm; - uint32_t n_rank_wq; - uint32_t n_rank_wk; - uint32_t n_rank_wv; - uint32_t n_rank_wo; - uint32_t n_rank_ffn_norm; - uint32_t n_rank_ffn_gate; - uint32_t n_rank_ffn_down; - uint32_t n_rank_ffn_up; - uint32_t n_rank_tok_embeddings; - uint32_t n_rank_norm; - uint32_t n_rank_output; - - bool custom_n_rank_attention_norm; - bool custom_n_rank_wq; - bool custom_n_rank_wk; - bool custom_n_rank_wv; - bool custom_n_rank_wo; - bool custom_n_rank_ffn_norm; - bool custom_n_rank_ffn_gate; - bool custom_n_rank_ffn_down; - bool custom_n_rank_ffn_up; - bool custom_n_rank_tok_embeddings; - bool custom_n_rank_norm; - bool custom_n_rank_output; -}; - -static struct train_params get_default_train_params() { - struct train_params params; - params.common = get_default_train_params_common(); - params.fn_model_base = ""; - params.fn_lora_out = "ggml-lora-ITERATION-f32.gguf"; - - params.only_write_lora = false; - - params.f_norm_rms_eps = 1e-5f; - params.rope_freq_base = 10000.0f; - params.rope_freq_scale = 1.0f; - - params.custom_f_norm_rms_eps = false; - params.custom_rope_freq_base = false; - params.custom_rope_freq_scale = false; - - params.lora_r = 4; - params.lora_alpha = 4; - params.custom_lora_alpha = false; - - params.n_rank_attention_norm = 1; - params.n_rank_wq = 4; - params.n_rank_wk = 4; - params.n_rank_wv = 4; - params.n_rank_wo = 4; - params.n_rank_ffn_norm = 1; - params.n_rank_ffn_gate = 4; - params.n_rank_ffn_down = 4; - params.n_rank_ffn_up = 4; - params.n_rank_tok_embeddings = 4; - params.n_rank_norm = 1; - params.n_rank_output = 4; - - params.custom_n_rank_attention_norm = false; - params.custom_n_rank_wq = false; - params.custom_n_rank_wk = false; - params.custom_n_rank_wv = false; - params.custom_n_rank_wo = false; - params.custom_n_rank_ffn_norm = false; - params.custom_n_rank_ffn_gate = false; - params.custom_n_rank_ffn_down = false; - params.custom_n_rank_ffn_up = false; - params.custom_n_rank_tok_embeddings = false; - params.custom_n_rank_norm = false; - params.custom_n_rank_output = false; - - return params; -} - -static void train_print_usage(int argc, char ** argv, const struct train_params * params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - - fprintf(stderr, " --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base); - fprintf(stderr, " --lora-out FNAME path to save llama lora (default '%s')\n", params->fn_lora_out); - fprintf(stderr, " --only-write-lora only save llama lora, don't do any training. use this if you only want to convert a checkpoint to a lora adapter.\n"); - fprintf(stderr, " --norm-rms-eps F RMS-Norm epsilon value (default %f)\n", params->f_norm_rms_eps); - fprintf(stderr, " --rope-freq-base F Frequency base for ROPE (default %f)\n", params->rope_freq_base); - fprintf(stderr, " --rope-freq-scale F Frequency scale for ROPE (default %f)\n", params->rope_freq_scale); - fprintf(stderr, " --lora-alpha N LORA alpha : resulting LORA scaling is alpha/r. (default %d)\n", params->lora_alpha); - fprintf(stderr, " --lora-r N LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default %d)\n", params->lora_r); - fprintf(stderr, " --rank-att-norm N LORA rank for attention norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); - fprintf(stderr, " --rank-ffn-norm N LORA rank for feed-forward norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); - fprintf(stderr, " --rank-out-norm N LORA rank for output norm tensor, overrides default rank. Norm tensors should generally have rank 1.\n"); - fprintf(stderr, " --rank-tok-embd N LORA rank for token embeddings tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-out N LORA rank for output tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-wq N LORA rank for wq tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n"); - - print_common_train_usage(argc, argv, ¶ms->common); -} - -static bool train_params_parse(int argc, char ** argv, struct train_params * params) { - bool invalid_param = false; - std::string arg; - struct train_params default_params = get_default_train_params(); - const std::string arg_prefix = "--"; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - - if (consume_common_train_arg(argc, argv, &i, ¶ms->common, &invalid_param)) { - if (invalid_param) { - break; - } else if (params->common.print_usage) { - train_print_usage(argc, argv, &default_params); - exit(0); - } - } else if (arg == "--model-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_model_base = argv[i]; - } else if (arg == "--lora-out") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->fn_lora_out = argv[i]; - } else if (arg == "--only-write-lora") { - params->only_write_lora = true; - } else if (arg == "--norm-rms-eps") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->f_norm_rms_eps = std::stof(argv[i]); - params->custom_f_norm_rms_eps = true; - } else if (arg == "--rope-freq-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->rope_freq_base = std::stof(argv[i]); - params->custom_rope_freq_base = true; - } else if (arg == "--rope-freq-scale") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->rope_freq_scale = std::stof(argv[i]); - params->custom_rope_freq_scale = true; - } else if (arg == "--lora-alpha") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->lora_alpha = std::stoi(argv[i]); - params->custom_lora_alpha = true; - } else if (arg == "--lora-r") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->lora_r = std::stoi(argv[i]); - } else if (arg == "--rank-att-norm") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_attention_norm = std::stoi(argv[i]); - params->custom_n_rank_attention_norm = true; - } else if (arg == "--rank-ffn-norm") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_ffn_norm = std::stoi(argv[i]); - params->custom_n_rank_ffn_norm = true; - } else if (arg == "--rank-out-norm") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_norm = std::stoi(argv[i]); - params->custom_n_rank_norm = true; - } else if (arg == "--rank-tok-embd") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_tok_embeddings = std::stoi(argv[i]); - params->custom_n_rank_tok_embeddings = true; - } else if (arg == "--rank-out") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_output = std::stoi(argv[i]); - params->custom_n_rank_output = true; - } else if (arg == "--rank-wq") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_wq = std::stoi(argv[i]); - params->custom_n_rank_wq = true; - } else if (arg == "--rank-wk") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_wk = std::stoi(argv[i]); - params->custom_n_rank_wk = true; - } else if (arg == "--rank-wv") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_wv = std::stoi(argv[i]); - params->custom_n_rank_wv = true; - } else if (arg == "--rank-wo") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_wo = std::stoi(argv[i]); - params->custom_n_rank_wo = true; - } else if (arg == "--rank-ffn_gate") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_ffn_gate = std::stoi(argv[i]); - params->custom_n_rank_ffn_gate = true; - } else if (arg == "--rank-ffn_down") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_ffn_down = std::stoi(argv[i]); - params->custom_n_rank_ffn_down = true; - } else if (arg == "--rank-ffn_up") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_rank_ffn_up = std::stoi(argv[i]); - params->custom_n_rank_ffn_up = true; - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - train_print_usage(argc, argv, &default_params); - exit(1); - } - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - train_print_usage(argc, argv, &default_params); - exit(1); - } - finish_processing_train_args(¶ms->common); - return true; -} - -struct save_train_files_data { - const char * fn_checkpoint_out; - const char * fn_lora_out; - const char * pattern_fn_it; - const char * fn_latest; - struct my_llama_model * model; - struct my_llama_lora * lora; -}; - -static void save_train_files(void * vdata, struct train_state * train) { - struct save_train_files_data * data = (struct save_train_files_data *) vdata; - - int64_t iter = train->opt->iter; - - if (strlen(data->fn_checkpoint_out) > 0) { - save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->model, data->lora, train); - save_checkpoint_lora_file(get_train_filename(data->fn_checkpoint_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->model, data->lora, train); - } - if (strlen(data->fn_lora_out) > 0) { - save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, iter).c_str(), data->lora); - save_as_llama_lora(get_train_filename(data->fn_lora_out, data->pattern_fn_it, data->fn_latest, -1 ).c_str(), data->lora); - } -} - -static int64_t get_parameter_count(struct my_llama_lora* lora) { - int64_t nx = 0; - nx += ggml_nelements(lora->tok_embeddings_a); - nx += ggml_nelements(lora->tok_embeddings_b); - nx += ggml_nelements(lora->norm_a); - nx += ggml_nelements(lora->norm_b); - nx += ggml_nelements(lora->output_a); - nx += ggml_nelements(lora->output_b); - - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - nx += ggml_nelements(layer.attention_norm_a); - nx += ggml_nelements(layer.attention_norm_b); - nx += ggml_nelements(layer.wq_a); - nx += ggml_nelements(layer.wq_b); - nx += ggml_nelements(layer.wk_a); - nx += ggml_nelements(layer.wk_b); - nx += ggml_nelements(layer.wv_a); - nx += ggml_nelements(layer.wv_b); - nx += ggml_nelements(layer.wo_a); - nx += ggml_nelements(layer.wo_b); - nx += ggml_nelements(layer.ffn_norm_a); - nx += ggml_nelements(layer.ffn_norm_b); - nx += ggml_nelements(layer.ffn_gate_a); - nx += ggml_nelements(layer.ffn_gate_b); - nx += ggml_nelements(layer.ffn_down_a); - nx += ggml_nelements(layer.ffn_down_b); - nx += ggml_nelements(layer.ffn_up_a); - nx += ggml_nelements(layer.ffn_up_b); - } - return nx; -} - -int main(int argc, char ** argv) { - struct train_params params = get_default_train_params(); - - if (!train_params_parse(argc, argv, ¶ms)) { - return 1; - } - - if (params.common.seed == LLAMA_DEFAULT_SEED) { - params.common.seed = time(NULL); - } - printf("%s: seed: %u\n", __func__, params.common.seed); - srand(params.common.seed); - - struct llama_model_params llama_mparams = llama_model_default_params(); - llama_mparams.n_gpu_layers = params.common.n_gpu_layers; - llama_mparams.vocab_only = false; - - printf("%s: model base = '%s'\n", __func__, params.fn_model_base); - struct llama_model * lmodel = llama_load_model_from_file(params.fn_model_base, llama_mparams); - - struct llama_context_params llama_cparams = llama_context_default_params(); - struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_cparams); - - struct my_llama_model model; - init_model(lmodel, &model, params.fn_model_base, params.common.n_ctx); - - struct my_llama_lora lora; - - struct train_state * train = init_train_state(); - struct ggml_opt_context * opt = train->opt; - - // set params from command line - if (params.custom_f_norm_rms_eps) { - model.hparams.f_norm_rms_eps = params.f_norm_rms_eps; - } - if (params.custom_rope_freq_base) { - model.hparams.rope_freq_base = params.rope_freq_base; - } - if (params.custom_rope_freq_scale) { - model.hparams.rope_freq_scale = params.rope_freq_scale; - } - lora.hparams.lora_r = params.lora_r; - lora.hparams.lora_alpha = params.custom_lora_alpha ? params.lora_alpha : params.lora_r; - uint32_t n_rank_attention_norm = params.custom_n_rank_attention_norm ? params.n_rank_attention_norm : 1; - uint32_t n_rank_wq = params.custom_n_rank_wq ? params.n_rank_wq : params.lora_r; - uint32_t n_rank_wk = params.custom_n_rank_wk ? params.n_rank_wk : params.lora_r; - uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; - uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; - uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; - uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r; - uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r; - uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r; - uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; - uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; - uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; - lora.hparams.n_rank_attention_norm = n_rank_attention_norm; - lora.hparams.n_rank_wq = n_rank_wq; - lora.hparams.n_rank_wk = n_rank_wk; - lora.hparams.n_rank_wv = n_rank_wv; - lora.hparams.n_rank_wo = n_rank_wo; - lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm; - lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate; - lora.hparams.n_rank_ffn_down = n_rank_ffn_down; - lora.hparams.n_rank_ffn_up = n_rank_ffn_up; - lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings; - lora.hparams.n_rank_norm = n_rank_norm; - lora.hparams.n_rank_output = n_rank_output; - - // set opt params from command line - opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); - opt->params.print_forward_graph = false; - opt->params.print_backward_graph = false; - opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; - opt->params.n_threads = params.common.n_threads; - opt->params.past = params.common.opt_past; - opt->params.delta = params.common.opt_delta; - opt->params.max_no_improvement = params.common.opt_max_no_improvement; - opt->params.n_gradient_accumulation = params.common.n_gradient_accumulation; - opt->params.adam.n_iter = params.common.adam_n_iter; - opt->params.adam.sched = 1.0f; - opt->params.adam.alpha = params.common.adam_alpha; - opt->params.adam.decay = params.common.adam_decay; - opt->params.adam.decay_min_ndim = params.common.adam_decay_min_ndim; - opt->params.adam.beta1 = params.common.adam_beta1; - opt->params.adam.beta2 = params.common.adam_beta2; - opt->params.adam.gclip = params.common.adam_gclip; - opt->params.adam.eps_f = params.common.adam_eps_f; - - printf("%s: init model\n", __func__); - bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train); - - if (existed) { - // overwrite last n_ctx with user provided n_ctx - if (params.common.custom_n_ctx) { - model.hparams.n_ctx = params.common.n_ctx; - } - - const bool opt_param_count_changed = ( - (lora.hparams.n_rank_attention_norm != n_rank_attention_norm) - || (lora.hparams.n_rank_wq != n_rank_wq) - || (lora.hparams.n_rank_wk != n_rank_wk) - || (lora.hparams.n_rank_wv != n_rank_wv) - || (lora.hparams.n_rank_wo != n_rank_wo) - || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm) - || (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate) - || (lora.hparams.n_rank_ffn_down != n_rank_ffn_down) - || (lora.hparams.n_rank_ffn_up != n_rank_ffn_up) - || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings) - || (lora.hparams.n_rank_norm != n_rank_norm) - || (lora.hparams.n_rank_output != n_rank_output) - ); - - const bool opt_past_changed = opt->params.past != params.common.opt_past; - - if (opt_param_count_changed) { - print_lora_params(&lora.hparams); - die("Provided rank differs from checkpoint file. To use different rank start finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting."); - // need to discard previous optimizer gradient statistics and opt_init with new shapes - // TODO - } - if (opt_past_changed) { - die("Optimizer parameter '--opt-past N' differs from checkpoint file. To use different value finetune from scratch with empty input checkpoint, e.g --checkpoint-in ''. Aborting"); - // need to discard previous optimizer past function value statistics and opt_init with new shapes - // TODO - } - } else { // existed == false - init_lora(&model, &lora); - randomize_lora(&lora, params.common.seed, 0.0f, 1.0f, -1.0f, +1.0f); - if (!params.only_write_lora) { - ggml_opt_init(opt->ctx, opt, opt->params, get_parameter_count(&lora)); - } - } - opt->iter = train->train_its; - - print_params(&model.hparams); - print_lora_params(&lora.hparams); - printf("%s: total train_iterations %llu\n", __func__, (long long unsigned) train->train_its); - printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); - printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); - printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); - printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f)); - - if (params.only_write_lora) { - save_train_files_data save_data; - save_data.fn_checkpoint_out = ""; - save_data.fn_lora_out = params.fn_lora_out; - save_data.pattern_fn_it = params.common.pattern_fn_it; - save_data.fn_latest = params.common.fn_latest; - save_data.model = &model; - save_data.lora = &lora; - - save_train_files(&save_data, train); - - free_train_state(train); - ggml_free(lora.ctx); - llama_free(lctx); - llama_free_model(lmodel); - return 0; - } - - printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f)); - printf("%s: opt iter %d\n", __func__, opt->iter); - - int n_tokens = model.hparams.n_ctx; - int n_vocab = model.hparams.n_vocab; - int n_batch = params.common.n_batch; - - // context for input tensors without their data - struct ggml_init_params ctx_input_params = { - ggml_tensor_overhead() * 2, // mem_size - NULL, // mem_buffer - true, // no_alloc - }; - struct ggml_context * ctx_input = ggml_init(ctx_input_params); - - // the input tensors - struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); - struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - - // allocate input tensors - // measure required memory for input tensors - ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type()); - size_t max_input_size = ggml_backend_buffer_get_size(input_data); - printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); - - // context for compute tensors without their data - const size_t estimated_compute_size_wo_data = ( - 2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() + - (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true)) - ); - struct ggml_init_params ctx_compute_params = { - estimated_compute_size_wo_data, // mem_size - NULL, // mem_buffer - true, // no_alloc - }; - struct ggml_context * ctx_compute = NULL; - - struct ggml_tensor * loss = NULL; - struct ggml_tensor * logits = NULL; - - struct ggml_cgraph * gf = NULL; - struct ggml_cgraph * gb = NULL; - struct ggml_cgraph * gb_tmp = NULL; - - // measure required memory for compute tensors - size_t best_compute_size = SIZE_MAX; - enum ggml_cgraph_eval_order best_order = GGML_CGRAPH_EVAL_ORDER_COUNT; - // find best evaluation order - for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { - ctx_compute = ggml_init(ctx_compute_params); - ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); - gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); - gf->order = (enum ggml_cgraph_eval_order) order; - gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); - gb_tmp = params.common.use_checkpointing - ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) - : NULL; - loss = llama_build_lora_finetune_graphs( - &model, &lora, alloc, ctx_compute, - gf, gb, gb_tmp, - &logits, tokens_input, target_probs, - n_tokens, n_batch, - params.common.use_flash, - params.common.use_checkpointing, - true - ); - size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer - if (max_compute_size < best_compute_size) { - best_compute_size = max_compute_size; - best_order = gf->order; - } - ggml_gallocr_free(alloc); - ggml_free(ctx_compute); - } - size_t max_compute_size = best_compute_size; - printf("%s: compute_size = %zu bytes (%.1f MB)\n", __func__, max_compute_size, (float) max_compute_size / (1024.0f*1024.0f)); - printf("%s: evaluation order = %s\n", __func__, - (best_order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? "LEFT_TO_RIGHT" : - (best_order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? "RIGHT_TO_LEFT" : - "invalid"); - - // allocate compute tensors - ctx_compute = ggml_init(ctx_compute_params); - ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); - gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); - gf->order = best_order; - gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); - gb_tmp = params.common.use_checkpointing - ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true) - : NULL; - loss = llama_build_lora_finetune_graphs( - &model, &lora, alloc, ctx_compute, - gf, gb, gb_tmp, - &logits, tokens_input, target_probs, - n_tokens, n_batch, - params.common.use_flash, - params.common.use_checkpointing, - false - ); - - // tokenize data - std::vector train_tokens; - std::vector train_samples_begin; - std::vector train_samples_size; - printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data); - printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str()); - printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false"); - tokenize_file(lctx, - params.common.fn_train_data, - params.common.sample_start, - params.common.include_sample_start, - params.common.overlapping_samples, - n_tokens, - train_tokens, - train_samples_begin, - train_samples_size); - GGML_ASSERT(train_samples_begin.size() == train_samples_size.size()); - - printf("%s: number of training tokens: %zu\n", __func__, train_tokens.size()); - - std::vector token_noccurs; - token_noccurs.resize(model.hparams.n_vocab, 0); - for (unsigned int i = 0; i < train_tokens.size(); ++i) { - ++token_noccurs[train_tokens[i]]; - } - int n_unique_tokens = 0; - for (unsigned int i = 0; i < token_noccurs.size(); ++i) { - if (token_noccurs[i] == 0) continue; - ++n_unique_tokens; - } - printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); - - size_t shuffle_samples_hash = compute_samples_hash(params.common.fn_train_data, train_samples_begin.data(), train_samples_size.data(), train_samples_size.size()); - const bool changed_train_data = (shuffle_samples_hash != train->shuffle_samples_hash) || (train->shuffle_sample_count != train_samples_size.size()); - if (changed_train_data) { - printf("%s: train data seems to have changed. restarting shuffled epoch.\n", __func__); - } - if (params.common.force_reshuffle) { - printf("%s: forced reshuffling of data. restarting with newly shuffled epoch.\n", __func__); - } - if ((train->shuffle_rng_state_current == "") || changed_train_data || params.common.force_reshuffle) { - train->shuffle_rng_state_current = mt19937_seed_to_state(params.common.seed); - train->shuffle_sample_count = train_samples_size.size(); - train->shuffle_next_sample = 0; - train->shuffle_samples_hash = shuffle_samples_hash; - } - std::vector train_shuffled_samples_offs; - std::vector train_shuffled_samples_begin; - std::vector train_shuffled_samples_size; - train_shuffled_samples_offs.resize(train_samples_begin.size()); - train_shuffled_samples_begin.resize(train_samples_begin.size()); - train_shuffled_samples_size.resize(train_samples_size.size()); - train->shuffle_rng_state_next = shuffle_samples( - train->shuffle_rng_state_current, - train_shuffled_samples_offs.data(), - train_shuffled_samples_begin.data(), - train_shuffled_samples_size.data(), - train_samples_begin.data(), - train_samples_size.data(), - train_samples_size.size()); - - printf("%s: begin training\n", __func__); - - save_train_files_data save_data; - save_data.fn_checkpoint_out = params.common.fn_checkpoint_out; - save_data.fn_lora_out = params.fn_lora_out; - save_data.pattern_fn_it = params.common.pattern_fn_it; - save_data.fn_latest = params.common.fn_latest; - save_data.model = &model; - save_data.lora = &lora; - - struct train_opt_callback_data opt_cb_data; - opt_cb_data.params = ¶ms.common; - opt_cb_data.train = train; - opt_cb_data.save_cb = &save_train_files; - opt_cb_data.save_data = &save_data; - opt_cb_data.lctx = lctx; - opt_cb_data.last_save_iter = opt->iter; - opt_cb_data.tokens_data = train_tokens.data(); - opt_cb_data.tokens_size = train_tokens.size(); - opt_cb_data.samples_begin = train_samples_begin.data(); - opt_cb_data.samples_size = train_samples_size.data(); - opt_cb_data.shuffled_samples_offs = train_shuffled_samples_offs.data(); - opt_cb_data.shuffled_samples_begin = train_shuffled_samples_begin.data(); - opt_cb_data.shuffled_samples_size = train_shuffled_samples_size.data(); - opt_cb_data.samples_count = train_samples_size.size(); - opt_cb_data.tokens_input = tokens_input; - opt_cb_data.target_probs = target_probs; - opt_cb_data.first_iter = opt->iter; - opt_cb_data.first_epoch = train->train_epochs; - opt_cb_data.iter_at_last_epoch = -1; - opt_cb_data.last_time = ggml_time_ms(); - opt_cb_data.millis_per_iter = 0.0; - - // measure required memory for work buffer - size_t max_work_size = ggml_graph_plan(gb, params.common.n_threads).work_size + GGML_OBJECT_SIZE; - printf("%s: work_size = %zu bytes (%.1f MB)\n", __func__, max_work_size, (float) max_work_size / (1024.0f*1024.0f)); - - // context for work buffer - struct ggml_init_params ctx_work_params = { - max_work_size, // mem_size - NULL, // mem_buffer - false, // no_alloc - }; - struct ggml_context * ctx_work = ggml_init(ctx_work_params); - - int64_t t0 = ggml_time_ms(); - - ggml_opt_resume_g(ctx_work, opt, loss, gf, gb, &train_opt_callback, (void *) &opt_cb_data); - - ggml_free(ctx_work); - ggml_free(ctx_compute); - ggml_free(ctx_input); - ggml_gallocr_free(alloc); - - - int64_t t1 = ggml_time_ms(); - printf("%s: total training time: ", __func__); - print_duration((double) (t1 - t0)); - printf("\n"); - - int new_iters = opt->iter - opt_cb_data.last_save_iter; - if (new_iters > 0) { - train->train_its += new_iters; - train->train_tokens += new_iters * opt->params.n_gradient_accumulation * n_batch * n_tokens; - - save_train_files(&save_data, train); - opt_cb_data.last_save_iter = opt->iter; - } - - ggml_free(opt->ctx); - free_train_state(train); - ggml_free(lora.ctx); - llama_free(lctx); - llama_free_model(lmodel); - return 0; -} diff --git a/examples/finetune/finetune.sh b/examples/finetune/finetune.sh deleted file mode 100644 index e3cc7f271..000000000 --- a/examples/finetune/finetune.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -cd `dirname $0` -cd ../.. - -EXE="./llama-finetune" - -if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi -if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi - -# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses. -MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing. - -while getopts "dg" opt; do - case $opt in - d) - DEBUGGER="gdb --args" - ;; - g) - EXE="./build/bin/Release/finetune" - GPUARG="--gpu-layers 25" - ;; - esac -done - -$DEBUGGER $EXE \ - --model-base $MODEL \ - $GPUARG \ - --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \ - --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \ - --lora-out lora-ol3b-shakespeare-ITERATION.bin \ - --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \ - --save-every 10 \ - --threads 10 --adam-iter 30 --batch 4 --ctx 64 \ - --use-checkpointing diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index dd53ba9b1..48a705e15 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -16,20 +16,25 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st auto decoded = decode_utf8(input_str, {}); const auto & code_points = decoded.first; + const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); + llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar); + size_t pos = 0; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { - auto prev_stacks = grammar->stacks; - llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks); - if (grammar->stacks.empty()) { + const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy + + llama_grammar_accept(rules, prev_stacks, *it, cur_stacks); + + if (cur_stacks.empty()) { error_pos = pos; error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'"; - grammar->stacks = prev_stacks; + cur_stacks = prev_stacks; return false; } ++pos; } - for (const auto & stack : grammar->stacks) { + for (const auto & stack : cur_stacks) { if (stack.empty()) { return true; } diff --git a/examples/gguf-hash/README.md b/examples/gguf-hash/README.md index 2320ffd81..9871651e3 100644 --- a/examples/gguf-hash/README.md +++ b/examples/gguf-hash/README.md @@ -201,6 +201,6 @@ Verification results for test.gguf.manifest - Success These micro c libraries dependencies was installed via the [clib c package manager](https://github.com/clibs) -- https://github.com/mofosyne/xxHash (From: https://github.com/Cyan4973/xxHash) +- https://github.com/Cyan4973/xxHash - https://github.com/clibs/sha1/ - https://github.com/jb55/sha256.c diff --git a/examples/gguf-hash/deps/xxhash/clib.json b/examples/gguf-hash/deps/xxhash/clib.json index 0048c74dc..242343c5d 100644 --- a/examples/gguf-hash/deps/xxhash/clib.json +++ b/examples/gguf-hash/deps/xxhash/clib.json @@ -1,7 +1,7 @@ { "name": "xxhash", "version": "0.8.2", - "repo": "mofosyne/xxhash", + "repo": "Cyan4973/xxhash", "description": "Extremely fast non-cryptographic hash algorithm", "keywords": ["xxhash", "hashing"], "license": "BSD-2-Clause", diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 575143771..7498f85ef 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -92,6 +92,11 @@ static bool gguf_ex_read_0(const std::string & fname) { struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params); + if (!ctx) { + fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname.c_str()); + return false; + } + printf("%s: version: %d\n", __func__, gguf_get_version(ctx)); printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx)); diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 29602881a..bb5faec94 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -1,6 +1,6 @@ # llama.cpp/examples/imatrix -Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models. +Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models. More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861 ## Usage diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 574f5ed9c..58814b96e 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } else if (e.values.size() != (size_t)src1->ne[0]*n_as) { fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); - exit(1); //GGML_ASSERT(false); + exit(1); //GGML_ABORT("fatal error"); } if (m_params.verbosity > 1) { printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); @@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } else if (e.values.size() != (size_t)src1->ne[0]) { fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); - exit(1); //GGML_ASSERT(false); + exit(1); //GGML_ABORT("fatal error"); } ++e.ncall; if (m_params.verbosity > 1) { @@ -611,10 +611,10 @@ int main(int argc, char ** argv) { params.warmup = false; // init - llama_model * model; - llama_context * ctx; + llama_init_result llama_init = llama_init_from_gpt_params(params); - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; if (model == nullptr || ctx == nullptr) { fprintf(stderr, "%s : failed to init\n", __func__); return 1; diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index dc93d2301..92d630b15 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -179,7 +179,10 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + model = llama_init.model; + ctx = llama_init.context; if (model == NULL) { LOG_TEE("%s: error: unable to load model\n", __func__); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d641a9f12..42918bfc7 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -23,6 +23,18 @@ #include "ggml-cuda.h" #include "ggml-sycl.h" +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#endif + // utils static uint64_t get_time_ns() { using clock = std::chrono::high_resolution_clock; @@ -92,6 +104,27 @@ static std::string get_cpu_info() { } fclose(f); } +#elif defined(_WIN32) + HKEY hKey; + if (RegOpenKeyEx(HKEY_LOCAL_MACHINE, + TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"), + 0, + KEY_READ, + &hKey) != ERROR_SUCCESS) { + // fail to open registry key + return ""; + } + char cpu_brand[256]; + DWORD cpu_brand_size = sizeof(cpu_brand); + if (RegQueryValueExA(hKey, + TEXT("ProcessorNameString"), + NULL, + NULL, + (LPBYTE)cpu_brand, + &cpu_brand_size) == ERROR_SUCCESS) { + id.assign(cpu_brand, cpu_brand_size); + } + RegCloseKey(hKey); #endif // TODO: other platforms return id; @@ -120,6 +153,17 @@ static std::string get_gpu_info() { id += "/"; } } +#endif +#ifdef GGML_USE_CANN + uint32_t count = ggml_backend_cann_get_device_count(); + for (uint32_t i = 0; i < count; i++) { + char buf[128]; + ggml_backend_cann_get_device_description(i, buf, sizeof(buf)); + id += buf; + if (i < count - 1) { + id += "/"; + } + } #endif // TODO: other backends return id; @@ -135,7 +179,7 @@ static const char * output_format_str(output_formats format) { case JSON: return "json"; case MARKDOWN: return "md"; case SQL: return "sql"; - default: GGML_ASSERT(!"invalid output format"); + default: GGML_ABORT("invalid output format"); } } @@ -161,7 +205,7 @@ static const char * split_mode_str(llama_split_mode mode) { case LLAMA_SPLIT_MODE_NONE: return "none"; case LLAMA_SPLIT_MODE_LAYER: return "layer"; case LLAMA_SPLIT_MODE_ROW: return "row"; - default: GGML_ASSERT(!"invalid split mode"); + default: GGML_ABORT("invalid split mode"); } } @@ -1311,7 +1355,7 @@ static std::unique_ptr create_printer(output_formats format) { case SQL: return std::unique_ptr(new sql_printer()); } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } int main(int argc, char ** argv) { diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 92a6b16b1..2aafe2316 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -409,7 +409,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { - return env->NewStringUTF(""); + return nullptr; } auto new_token_chars = llama_token_to_piece(context, new_token_id); diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 2a3f9f758..58c32ca53 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -26,11 +26,12 @@ actor LlamaContext { private var context: OpaquePointer private var batch: llama_batch private var tokens_list: [llama_token] + var is_done: Bool = false /// This variable is used to store temporarily invalid cchars private var temporary_invalid_cchars: [CChar] - var n_len: Int32 = 64 + var n_len: Int32 = 1024 var n_cur: Int32 = 0 var n_decode: Int32 = 0 @@ -160,6 +161,7 @@ actor LlamaContext { if llama_token_is_eog(model, new_token_id) || n_cur == n_len { print("\n") + is_done = true let new_token_str = String(cString: temporary_invalid_cchars + [0]) temporary_invalid_cchars.removeAll() return new_token_str diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift index 2c1e3f61b..b8f6a31d5 100644 --- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -132,7 +132,7 @@ class LlamaState: ObservableObject { messageLog += "\(text)" Task.detached { - while await llamaContext.n_cur < llamaContext.n_len { + while await !llamaContext.is_done { let result = await llamaContext.completion_loop() await MainActor.run { self.messageLog += "\(result)" diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index e9fa73acb..bbf5fec58 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -36,3 +36,10 @@ set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) + +set(TARGET llama-minicpmv-cli) +add_executable(${TARGET} minicpmv-cli.cpp) +set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/llava/README-minicpmv2.5.md b/examples/llava/README-minicpmv2.5.md new file mode 100644 index 000000000..4affc1d0f --- /dev/null +++ b/examples/llava/README-minicpmv2.5.md @@ -0,0 +1,99 @@ +## MiniCPM-Llama3-V 2.5 + +### Prepare models and code + +Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder. + +Clone llama.cpp: +```bash +git clone https://github.com/ggerganov/llama.cpp +cd llama.cpp +``` + +### Usage + +Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us) + +```bash +python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5 +python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 +python ./convert-hf-to-gguf.py ../MiniCPM-Llama3-V-2_5/model + +# quantize int4 version +./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M +``` + +Build for Linux or Mac + +```bash +make +make llama-minicpmv-cli +``` + +Inference on Linux or Mac +``` +# run f16 version +./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" + +# run quantized int4 version +./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" + +# or run in interactive mode +./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i +``` + +### Android + +#### Build on Android device using Termux +We found that build on Android device would bring better runtime performance, so we recommend to build on device. + +[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required). + +Install tools in Termux: +``` +apt update && apt upgrade -y +apt install git make cmake +``` + +It's recommended to move your model inside the `~/` directory for best performance: +``` +cd storage/downloads +mv model.gguf ~/ +``` + +#### Building the Project using Android NDK +Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake. + +Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux: + +```bash +mkdir build-android +cd build-android +export NDK=/your_ndk_path +cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. +make +``` + +Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice). + +Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: + +(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) +``` +$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ +$cd /data/data/com.termux/files/home/bin +$chmod +x ./* +``` + +Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` + +``` +$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/ +$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/ +``` + +Now, you can start chatting: +``` +$cd /data/data/com.termux/files/home/bin +$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +``` diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index d6882eec3..54aa822c9 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -16,6 +16,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_CANN +#include "ggml-cann.h" +#endif + #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" @@ -70,26 +74,27 @@ static std::string format(const char * fmt, ...) { // key constants // -#define KEY_FTYPE "general.file_type" -#define KEY_NAME "general.name" -#define KEY_DESCRIPTION "general.description" -#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" -#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" -#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" -#define KEY_USE_GELU "clip.use_gelu" -#define KEY_N_EMBD "clip.%s.embedding_length" -#define KEY_N_FF "clip.%s.feed_forward_length" -#define KEY_N_BLOCK "clip.%s.block_count" -#define KEY_N_HEAD "clip.%s.attention.head_count" -#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" -#define KEY_PROJ_DIM "clip.%s.projection_dim" -#define KEY_TOKENS "tokenizer.ggml.tokens" -#define KEY_N_POSITIONS "clip.text.context_length" -#define KEY_IMAGE_SIZE "clip.vision.image_size" -#define KEY_PATCH_SIZE "clip.vision.patch_size" -#define KEY_IMAGE_MEAN "clip.vision.image_mean" -#define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_FTYPE "general.file_type" +#define KEY_NAME "general.name" +#define KEY_DESCRIPTION "general.description" +#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" +#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" +#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" +#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" +#define KEY_USE_GELU "clip.use_gelu" +#define KEY_N_EMBD "clip.%s.embedding_length" +#define KEY_N_FF "clip.%s.feed_forward_length" +#define KEY_N_BLOCK "clip.%s.block_count" +#define KEY_N_HEAD "clip.%s.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" +#define KEY_PROJ_DIM "clip.%s.projection_dim" +#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_POSITIONS "clip.text.context_length" +#define KEY_IMAGE_SIZE "clip.vision.image_size" +#define KEY_PATCH_SIZE "clip.vision.patch_size" +#define KEY_IMAGE_MEAN "clip.vision.image_mean" +#define KEY_IMAGE_STD "clip.vision.image_std" +#define KEY_PROJ_TYPE "clip.projector_type" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -123,12 +128,20 @@ static std::string format(const char * fmt, ...) { #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" #define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" +#define TN_MINICPMV_QUERY "resampler.query" +#define TN_MINICPMV_PROJ "resampler.proj.weight" +#define TN_MINICPMV_KV_PROJ "resampler.kv.weight" +#define TN_MINICPMV_ATTN "resampler.attn.%s.%s" +#define TN_MINICPMV_LN "resampler.ln_%s.%s" + enum projector_type { PROJECTOR_TYPE_MLP, PROJECTOR_TYPE_MLP_NORM, PROJECTOR_TYPE_LDP, PROJECTOR_TYPE_LDPV2, + PROJECTOR_TYPE_RESAMPLER, PROJECTOR_TYPE_UNKNOWN, }; @@ -136,6 +149,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_MLP, "mlp" }, { PROJECTOR_TYPE_LDP, "ldp" }, { PROJECTOR_TYPE_LDPV2, "ldpv2"}, + { PROJECTOR_TYPE_RESAMPLER, "resampler"}, }; @@ -196,17 +210,14 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int } static void replace_all(std::string & s, const std::string & search, const std::string & replace) { - std::string result; - for (size_t pos = 0; ; pos += search.length()) { - auto new_pos = s.find(search, pos); - if (new_pos == std::string::npos) { - result += s.substr(pos, s.size() - pos); - break; - } - result += s.substr(pos, new_pos - pos) + replace; - pos = new_pos; + if (search.empty()) { + return; // Avoid infinite loop if 'search' is an empty string + } + size_t pos = 0; + while ((pos = s.find(search, pos)) != std::string::npos) { + s.replace(pos, search.length(), replace); + pos += replace.length(); } - s = std::move(result); } static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { @@ -488,12 +499,33 @@ struct clip_vision_model { struct ggml_tensor * mm_model_mlp_2_b; struct ggml_tensor * mm_model_peg_0_w; struct ggml_tensor * mm_model_peg_0_b; + + // MINICPMV projection + struct ggml_tensor * mm_model_pos_embed_k; + struct ggml_tensor * mm_model_query; + struct ggml_tensor * mm_model_proj; + struct ggml_tensor * mm_model_kv_proj; + struct ggml_tensor * mm_model_attn_q_w; + struct ggml_tensor * mm_model_attn_q_b; + struct ggml_tensor * mm_model_attn_k_w; + struct ggml_tensor * mm_model_attn_k_b; + struct ggml_tensor * mm_model_attn_v_w; + struct ggml_tensor * mm_model_attn_v_b; + struct ggml_tensor * mm_model_attn_o_w; + struct ggml_tensor * mm_model_attn_o_b; + struct ggml_tensor * mm_model_ln_q_w; + struct ggml_tensor * mm_model_ln_q_b; + struct ggml_tensor * mm_model_ln_kv_w; + struct ggml_tensor * mm_model_ln_kv_b; + struct ggml_tensor * mm_model_ln_post_w; + struct ggml_tensor * mm_model_ln_post_b; }; struct clip_ctx { bool has_text_encoder = false; bool has_vision_encoder = false; bool has_llava_projector = false; + bool has_minicpmv_projector = false; struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -518,9 +550,11 @@ struct clip_ctx { ggml_backend_t backend = NULL; ggml_gallocr_t compute_alloc = NULL; + + struct clip_image_size * load_image_size; }; -static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { if (!ctx->has_vision_encoder) { LOG_TEE("This gguf file seems to have no vision encoder\n"); return nullptr; @@ -529,20 +563,33 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; + const int image_size = hparams.image_size; + int image_size_width = image_size; + int image_size_height = image_size; + if (ctx->has_minicpmv_projector) { + if (load_image_size == nullptr) { + load_image_size = clip_image_size_init(); + } + LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); + image_size_width = load_image_size->width; + image_size_height = load_image_size->height; + if (is_inf) { + image_size_width = imgs->data->nx; + image_size_height = imgs->data->ny; + } + } const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); - const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; + int n_layer = hparams.n_layer; const float eps = hparams.eps; const int batch_size = imgs->size; - if (ctx->has_llava_projector) { + if (ctx->has_llava_projector || ctx->has_minicpmv_projector) { GGML_ASSERT(batch_size == 1); } @@ -555,7 +602,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 struct ggml_context * ctx0 = ggml_init(params); struct ggml_cgraph * gf = ggml_new_graph(ctx0); - struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size); + struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); ggml_set_name(inp_raw, "inp_raw"); ggml_set_input(inp_raw); @@ -568,19 +615,21 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); inp = ggml_add(ctx0, inp, model.patch_bias); } - - // concat class_embeddings and patch_embeddings struct ggml_tensor * embeddings = inp; - if (ctx->has_class_embedding) { - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } + struct ggml_tensor * pos_embed = nullptr; + if (ctx->has_llava_projector) { + // concat class_embeddings and patch_embeddings + if (ctx->has_class_embedding) { + embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + ggml_set_name(embeddings, "embeddings"); + ggml_set_input(embeddings); + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); + } + } struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); ggml_set_name(positions, "positions"); @@ -589,6 +638,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); + if (ctx->has_minicpmv_projector) { + int pos_w = image_size_width/patch_size; + int pos_h = image_size_height/patch_size; + pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); + ggml_set_name(pos_embed, "pos_embed"); + ggml_set_input(pos_embed); + } + // pre-layernorm if (ctx->has_pre_norm) { embeddings = ggml_norm(ctx0, embeddings, eps); @@ -598,6 +655,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // loop over layers + if (ctx->has_minicpmv_projector) { + n_layer += 1; + } for (int il = 0; il < n_layer - 1; il++) { struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states @@ -687,7 +747,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // llava projector - { + if (ctx->has_llava_projector) { embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); @@ -864,6 +924,65 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); embeddings = peg_0; } + else { + GGML_ABORT("fatal error"); + } + } + // minicpmv projector + else if (ctx->has_minicpmv_projector) + { + if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + struct ggml_tensor * q = model.mm_model_query; + { // layernorm + q = ggml_norm(ctx0, q, eps); + q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + } + struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + { // layernorm + v = ggml_norm(ctx0, v, eps); + v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); + } + struct ggml_tensor * k; + { // position + // q = ggml_add(ctx0, q, model.mm_model_pos_embed); + k = ggml_add(ctx0, v, pos_embed); + } + + { // attention + const int hidden_size = 4096; + const int d_head = 128; + const int n_head = hidden_size/d_head; + const int num_query = 96; + + struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); + struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); + struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); + // permute + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_inplace(ctx0, KQ); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); + + embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); + } + { // layernorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); + } + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + } else { GGML_ASSERT(false); } @@ -1001,6 +1120,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_TEE("%s: CLIP using Metal backend\n", __func__); #endif +#ifdef GGML_USE_CANN + new_clip->backend = ggml_backend_cann_init(0); + LOG_TEE("%s: CLIP using CANN backend\n", __func__); +#endif + if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); @@ -1020,7 +1144,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx); } - GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search + idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ); + if (idx != -1) { + new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx); + } + + // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search + GGML_ASSERT(new_clip->has_vision_encoder); GGML_ASSERT(!new_clip->has_text_encoder); @@ -1031,6 +1161,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); + LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); } @@ -1272,6 +1403,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight")); vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias")); } + else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) { + // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); + vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K); + vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY); + vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ); + vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ); + vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight")); + vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight")); + vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight")); + vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias")); + vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias")); + vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias")); + vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight")); + vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias")); + vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight")); + vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias")); + vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight")); + vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias")); + vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight")); + vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias")); + } else { std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); @@ -1310,7 +1462,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); clip_image_f32_batch batch; batch.size = 1; - ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch); + ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); ggml_gallocr_reserve(new_clip->compute_alloc, gf); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); @@ -1319,6 +1471,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { return new_clip; } +void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) { + ctx_clip->load_image_size = load_image_size; +} + +struct clip_image_size * clip_image_size_init() { + struct clip_image_size * load_image_size = new struct clip_image_size(); + load_image_size->width = 448; + load_image_size->height = 448; + return load_image_size; +} + struct clip_image_u8 * clip_image_u8_init() { return new clip_image_u8(); } @@ -1589,9 +1752,184 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im return patches; } +static int ensure_divide(int length, int patch_size) { + return std::max(static_cast(std::round(static_cast(length) / patch_size) * patch_size), patch_size); +} + +static std::pair uhd_find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.first; + int height = original_size.second; + if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { + float r = static_cast(width) / height; + height = static_cast(scale_resolution / std::sqrt(r)); + width = static_cast(height * r); + } + int best_width = ensure_divide(width, patch_size); + int best_height = ensure_divide(height, patch_size); + return std::make_pair(best_width, best_height); +} + +static std::pair uhd_get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width, height; + std::tie(width, height) = original_size; + int grid_x, grid_y; + std::tie(grid_x, grid_y) = grid; + + int refine_width = ensure_divide(width, grid_x); + int refine_height = ensure_divide(height, grid_y); + + int grid_width = refine_width / grid_x; + int grid_height = refine_height / grid_y; + + // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line) + auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair + int best_grid_width, best_grid_height; + std::tie(best_grid_width, best_grid_height) = best_grid_size; + + // std::pair refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line) + std::pair refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line) + return refine_size; +} + +inline int clip(int x, int lower, int upper) { + return std::max(lower, std::min(x, upper)); +} + +static std::pair uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { + std::vector candidate_split_grids_nums; + for (int i : {multiple - 1, multiple, multiple + 1}) { + if (i == 1 || i > max_slice_nums) { + continue; + } + candidate_split_grids_nums.push_back(i); + } + + std::vector> candidate_grids; + for (int split_grids_nums : candidate_split_grids_nums) { + int m = 1; + while (m <= split_grids_nums) { + if (split_grids_nums % m == 0) { + candidate_grids.emplace_back(m, split_grids_nums / m); + } + ++m; + } + } + + std::pair best_grid{1, 1}; + float min_error = std::numeric_limits::infinity(); + for (const auto& grid : candidate_grids) { + float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second)); + if (error < min_error) { + best_grid = grid; + min_error = error; + } + } + return best_grid; +} + +// inspired from LLaVA-UHD: +// -> https://arxiv.org/pdf/2403.11703 +// -> https://github.com/thunlp/LLaVA-UHD +// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 +static std::vector> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) { + const std::pair original_size={img->nx,img->ny}; + const int original_width = img->nx; + const int original_height = img->ny; + const float log_ratio = log(1.0*original_width/original_height); + const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); + const int multiple = fmin(ceil(ratio), max_slice_nums); + + std::vector> images; + LOG_TEE("%s: multiple %d\n", __func__, multiple); + images.push_back(std::vector()); + + if (multiple <= 1) { + auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true); + clip_image_u8 * source_image = clip_image_u8_init(); + bicubic_resize(*img, *source_image, best_size.first, best_size.second); + // source_image = image.resize(best_size, Image.Resampling.BICUBIC) + images[images.size()-1].push_back(source_image); + } + else if (multiple > 1) { + auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size); + clip_image_u8 * source_image = clip_image_u8_init(); + bicubic_resize(*img, *source_image, best_size.first, best_size.second); + // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) + LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); + images[images.size()-1].push_back(source_image); + + std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); + LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); + + auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); + clip_image_u8 * refine_image = clip_image_u8_init(); + bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); + + LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); + + // split_to_patches + int width = refine_image->nx; + int height = refine_image->ny; + int grid_x = int(width / best_grid.first); + int grid_y = int(height / best_grid.second); + for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){ + images.push_back(std::vector()); + for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){ + clip_image_u8 * patch = clip_image_u8_init(); + patch->nx = grid_x; + patch->ny = grid_y; + patch->buf.resize(3 * patch->nx * patch->ny); + for (int y = patches_i; y < patches_i + grid_y; ++y) { + for (int x = patches_j; x < patches_j + grid_x; ++x) { + const int i = 3 * (y * refine_image->nx + x); + const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j)); + patch->buf[j] = refine_image->buf[i]; + patch->buf[j+1] = refine_image->buf[i+1]; + patch->buf[j+2] = refine_image->buf[i+2]; + } + } + images[images.size()-1].push_back(patch); + } + } + } + return images; +} + +int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { + const int max_slice_nums=9; + const int scale_resolution=448; + const int original_width = ctx_clip->load_image_size->width; + const int original_height = ctx_clip->load_image_size->height; + const float log_ratio = log(1.0*original_width/original_height); + const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); + const int multiple = fmin(ceil(ratio), max_slice_nums); + std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); + return best_grid.first; +} + // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { + if (clip_is_minicpmv(ctx)) { + std::vector> imgs = uhd_slice_image(img); + res_imgs->size = 0; + for (size_t i = 0; i < imgs.size(); ++i) { + res_imgs->size += imgs[i].size(); + } + res_imgs->data = new clip_image_f32[res_imgs->size]; + int idx = 0; + for (size_t i = 0; i < imgs.size(); ++i) { + for (size_t j = 0; j < imgs[i].size(); ++j) { + LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); + clip_image_f32 * res = clip_image_f32_init(); + normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std); + res_imgs->data[idx++] = *res; + clip_image_f32_free(res); + } + } + return true; + } + bool pad_to_square = true; if (!ctx->has_vision_encoder) { LOG_TEE("This gguf file seems to have no vision encoder\n"); @@ -1807,11 +2145,99 @@ int clip_n_patches(const struct clip_ctx * ctx) { if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) { n_patches /= 4; + } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + n_patches = 96; } return n_patches; } +static std::vector>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector> & pos) { + assert(embed_dim % 2 == 0); + int H = pos.size(); + int W = pos[0].size(); + + std::vector omega(embed_dim / 2); + for (int i = 0; i < embed_dim / 2; ++i) { + omega[i] = 1.0 / pow(10000.0, static_cast(i) / (embed_dim / 2)); + } + + std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int d = 0; d < embed_dim / 2; ++d) { + float out_value = pos[h][w] * omega[d]; + emb[h][w][d] = sin(out_value); + emb[h][w][d + embed_dim / 2] = cos(out_value); + } + } + } + + return emb; +} + +static std::vector>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector>> & grid) { + assert(embed_dim % 2 == 0); + std::vector>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2) + std::vector>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2) + + int H = emb_h.size(); + int W = emb_h[0].size(); + std::vector>> emb(H, std::vector>(W, std::vector(embed_dim))); + + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + for (int d = 0; d < embed_dim / 2; ++d) { + emb[h][w][d] = emb_h[h][w][d]; + emb[h][w][d + embed_dim / 2] = emb_w[h][w][d]; + } + } + } + return emb; +} + +static std::vector> get_2d_sincos_pos_embed(int embed_dim, const std::pair image_size) { + int grid_h_size = image_size.first; + int grid_w_size = image_size.second; + + std::vector grid_h(grid_h_size); + std::vector grid_w(grid_w_size); + + for (int i = 0; i < grid_h_size; ++i) { + grid_h[i] = static_cast(i); + } + for (int i = 0; i < grid_w_size; ++i) { + grid_w[i] = static_cast(i); + } + + std::vector> grid(grid_h_size, std::vector(grid_w_size)); + for (int h = 0; h < grid_h_size; ++h) { + for (int w = 0; w < grid_w_size; ++w) { + grid[h][w] = grid_w[w]; + } + } + std::vector>> grid_2d = {grid, grid}; + for (int h = 0; h < grid_h_size; ++h) { + for (int w = 0; w < grid_w_size; ++w) { + grid_2d[0][h][w] = grid_h[h]; + grid_2d[1][h][w] = grid_w[w]; + } + } + + std::vector>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d); + + int H = image_size.first; + int W = image_size.second; + std::vector> pos_embed_2d(H * W, std::vector(embed_dim)); + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + pos_embed_2d[w * H + h] = pos_embed_3d[h][w]; + } + } + + return pos_embed_2d; +} + bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { if (!ctx->has_vision_encoder) { LOG_TEE("This gguf file seems to have no vision encoder\n"); @@ -1834,18 +2260,27 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima if (ctx->has_llava_projector) { GGML_ASSERT(batch_size == 1); // TODO: support multiple images } + if (ctx->has_minicpmv_projector) { + GGML_ASSERT(batch_size == 1); + } // build the inference graph - ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); + ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true); ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); // set inputs const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; + const int image_size = hparams.image_size; + int image_size_width = image_size; + int image_size_height = image_size; + if (ctx->has_minicpmv_projector) { + image_size_width = imgs->data[0].nx; + image_size_height = imgs->data[0].ny; + } const int patch_size = hparams.patch_size; - const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); { @@ -1855,7 +2290,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima for (size_t i = 0; i < imgs->size; i++) { const int nx = imgs->data[i].nx; const int ny = imgs->data[i].ny; - GGML_ASSERT(nx == image_size && ny == image_size); + if (!ctx->has_minicpmv_projector) { + GGML_ASSERT(nx == image_size && ny == image_size); + } const int n = nx * ny; @@ -1872,37 +2309,75 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); free(data); } - - { - if (ctx->has_class_embedding) { - struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); - - void* zero_mem = malloc(ggml_nbytes(embeddings)); - memset(zero_mem, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); - free(zero_mem); + if (ctx->has_minicpmv_projector) { + { + // inspired from siglip: + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + int* positions_data = (int*)malloc(ggml_nbytes(positions)); + for (int i = 0; i < num_positions; i++) { + positions_data[i] = std::floor(70.0*i/num_positions); + } + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); } - } - { - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + { + // inspired from resampler of Qwen-VL: + // -> https://huggingface.co/Qwen/Qwen-VL/tree/main + // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 + struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); + if(ctx->load_image_size==nullptr){ + ctx->load_image_size= clip_image_size_init(); + } + int pos_w = ctx->load_image_size->width/patch_size; + int pos_h = ctx->load_image_size->height/patch_size; + int embed_dim = 4096; + auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); - int* positions_data = (int*)malloc(ggml_nbytes(positions)); - for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; + float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); + for(int i=0;ihas_class_embedding) { + struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); - { - struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); - int* patches_data = (int*)malloc(ggml_nbytes(patches)); - for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + 1; + void* zero_mem = malloc(ggml_nbytes(embeddings)); + memset(zero_mem, 0, ggml_nbytes(embeddings)); + ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); + free(zero_mem); + } + } + + { + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + + int* positions_data = (int*)malloc(ggml_nbytes(positions)); + for (int i = 0; i < num_positions; i++) { + positions_data[i] = i; + } + ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); + free(positions_data); + } + + { + struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); + int* patches_data = (int*)malloc(ggml_nbytes(patches)); + for (int i = 0; i < num_patches; i++) { + patches_data[i] = i + 1; + } + ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); + free(patches_data); } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); } if (ggml_backend_is_cpu(ctx->backend)) { @@ -2072,7 +2547,14 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { return ctx->vision_model.mm_3_b->ne[0]; } + if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + return 4096; + } std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } + +bool clip_is_minicpmv(const struct clip_ctx * ctx) { + return ctx->has_minicpmv_projector; +} diff --git a/examples/llava/clip.h b/examples/llava/clip.h index ca3631384..2ff4d3992 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -18,14 +18,17 @@ # define CLIP_API #endif -struct clip_ctx; - #ifdef __cplusplus extern "C" { #endif struct clip_ctx; +struct clip_image_size { + int width; + int height; +}; + struct clip_image_u8_batch { struct clip_image_u8 * data; size_t size; @@ -55,6 +58,10 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); +CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); +CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); + +CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_f32 * clip_image_f32_init(); @@ -78,6 +85,8 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); +CLIP_API bool clip_is_minicpmv(const struct clip_ctx * ctx); + #ifdef __cplusplus } #endif diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 63878d176..916d9dc40 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -202,6 +202,33 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector return true; } +static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) { + int width = image->nx; + int height = image->ny; + int num_patches = (height / patch_size) * (width / patch_size); + clip_image_f32 * patch = clip_image_f32_init(); + patch->nx = patch_size * num_patches; + patch->ny = patch_size; + patch->buf.resize(3 * patch->nx * patch->ny); + + int patch_index = 0; + + for (int i = 0; i < height; i += patch_size) { + for (int j = 0; j < width; j += patch_size) { + for (int pi = 0; pi < patch_size; ++pi) { + for (int pj = 0; pj < patch_size; ++pj) { + int input_index = ((i + pi) * width + (j + pj)) * 3; + int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3; + patch->buf[output_index] = image->buf[input_index]; + patch->buf[output_index+1] = image->buf[input_index+1]; + patch->buf[output_index+2] = image->buf[input_index+2]; + } + } + patch_index++; + } + } + return patch; +} static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { // std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 @@ -218,7 +245,44 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); - if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { + if (clip_is_minicpmv(ctx_clip)) { + std::vector image_embd_v; + image_embd_v.resize(img_res_v.size); + struct clip_image_size * load_image_size = clip_image_size_init(); + for (size_t i = 0; i < img_res_v.size; i++) { + const int64_t t_img_enc_step_start_us = ggml_time_us(); + image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); + int patch_size=14; + load_image_size->width = img_res_v.data[i].nx; + load_image_size->height = img_res_v.data[i].ny; + clip_add_load_image_size(ctx_clip, load_image_size); + const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); + if (!encoded) { + LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + return false; + } + const int64_t t_img_enc_steop_batch_us = ggml_time_us(); + LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); + } + const int64_t t_img_enc_batch_us = ggml_time_us(); + LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + + int n_img_pos_out = 0; + for (size_t i = 0; i < image_embd_v.size(); i++) { + std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip)); + n_img_pos_out += clip_n_patches(ctx_clip); + } + *n_img_pos = n_img_pos_out; + for (size_t i = 0; i < image_embd_v.size(); i++) { + free(image_embd_v[i]); + } + image_embd_v.clear(); + load_image_size->width = img->nx; + load_image_size->height = img->ny; + clip_add_load_image_size(ctx_clip, load_image_size); + LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); + } + else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding *n_img_pos = clip_n_patches(ctx_clip); bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 @@ -228,7 +292,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli return false; } - } else { + } + else { // spatial_unpad llava-1.6 type embedding // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working std::vector image_embd_v; @@ -297,7 +362,11 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * } bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { - float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model + int num_max_patches = 6; + if (clip_is_minicpmv(ctx_clip)) { + num_max_patches = 10; + } + float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model if (!image_embd) { LOG_TEE("Unable to allocate memory for image embeddings\n"); return false; diff --git a/examples/llava/llava.h b/examples/llava/llava.h index 19212f6e9..b6feb3027 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -17,12 +17,11 @@ # define LLAVA_API #endif -struct clip_ctx; - #ifdef __cplusplus extern "C" { #endif +struct clip_ctx; struct llava_image_embed { float * embed; int n_image_pos; @@ -37,8 +36,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); /** build an image embed from a path to an image filename */ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); -LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** free an embedding made with llava_image_embed_make_* */ +LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp new file mode 100644 index 000000000..f951b57b2 --- /dev/null +++ b/examples/llava/minicpmv-cli.cpp @@ -0,0 +1,309 @@ +#include "ggml.h" +#include "log.h" +#include "common.h" +#include "clip.h" +#include "llava.h" +#include "llama.h" + +#include +#include +#include + +struct llava_context { + struct clip_ctx * ctx_clip = NULL; + struct llama_context * ctx_llama = NULL; + struct llama_model * model = NULL; +}; + +static void show_additional_info(int /*argc*/, char ** argv) { + LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); +} + +static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + LOG_TEE("%s", text); +} + +static struct llama_model * llava_init(gpt_params * params) { + llama_backend_init(); + llama_numa_init(params->numa); + + llama_model_params model_params = llama_model_params_from_gpt_params(*params); + + llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + if (model == NULL) { + LOG_TEE("%s: error: unable to load model\n" , __func__); + return NULL; + } + return model; +} + +static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { + auto prompt = params->prompt; + if (prompt.empty()) { + prompt = "describe the image in detail."; + } + + llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); + if (params->n_ctx < 2048) { + // warn user here, "Image processing requires at least 2048 context, setting context to 2048" + LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); + ctx_params.n_ctx = 2048; + } else { + ctx_params.n_ctx = params->n_ctx; + } + + llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); + + if (ctx_llama == NULL) { + LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); + return NULL; + } + + auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); + + ctx_llava->ctx_llama = ctx_llama; + ctx_llava->model = model; + return ctx_llava; +} + +static void llava_free(struct llava_context * ctx_llava) { + if (ctx_llava->ctx_clip) { + clip_free(ctx_llava->ctx_clip); + ctx_llava->ctx_clip = NULL; + } + + llama_free(ctx_llava->ctx_llama); + llama_free_model(ctx_llava->model); + llama_backend_free(); +} + +static struct clip_ctx * clip_init_context(gpt_params * params) { + const char * clip_path = params->mmproj.c_str(); + + auto prompt = params->prompt; + if (prompt.empty()) { + prompt = "describe the image in detail."; + } + auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); + return ctx_clip; +} + +static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { + int N = (int) tokens.size(); + for (int i = 0; i < N; i += n_batch) { + int n_eval = (int) tokens.size() - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) { + LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); + return false; + } + *n_past += n_eval; + } + return true; +} + +static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { + std::vector tokens; + tokens.push_back(id); + return eval_tokens(ctx_llama, tokens, 1, n_past); +} + +static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ + std::string str2 = str; + std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); + return eval_tokens(ctx_llama, embd_inp, n_batch, n_past); +} + +static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) { + float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip)); + std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip)); + + auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); + slice_embed->embed = image_embed; + slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip); + llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past); + llava_image_embed_free(slice_embed); +} + +static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) { + std::string system_prompt; + int idx = 0; + int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip); + system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"; + LOG_TEE("%s: image token past: %d\n", __func__, n_past); + eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); + process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + if (num_image_embeds > 1) { + size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { + for (size_t j = 0; j < num_image_embeds_col; ++j) { + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + if (j == num_image_embeds_col - 1) { + eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); + } + } + } + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + } + LOG_TEE("%s: image token past: %d\n", __func__, n_past); +} + +static const char * sample(struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_llama, + int * n_past) { + const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); + llama_sampling_accept(ctx_sampling, ctx_llama, id, true); + static std::string ret; + if (llama_token_is_eog(llama_get_model(ctx_llama), id)) { + ret = ""; + } else { + ret = llama_token_to_piece(ctx_llama, id); + } + eval_id(ctx_llama, id, n_past); + return ret.c_str(); +} + +static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ + auto ctx_clip = clip_init_context(params); + auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str()); + if (!embeds) { + std::cerr << "error: failed to load image " << fname << ". Terminating\n\n"; + return NULL; + } + + // process the prompt + if (params->prompt.empty() && params->interactive == false) { + LOG_TEE("prompt should be given or interactive mode should be on"); + return NULL; + } + + auto model = llava_init(params); + if (model == NULL) { + fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__); + return NULL; + } + const int64_t t_llava_init_start_us = ggml_time_us(); + auto ctx_llava = llava_init_context(params, model); + ctx_llava->ctx_clip = ctx_clip; + const int64_t t_llava_init_end_us = ggml_time_us(); + float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0; + LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); + + const int64_t t_process_image_start_us = ggml_time_us(); + process_image(ctx_llava, embeds, params, n_past); + const int64_t t_process_image_end_us = ggml_time_us(); + float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; + LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); + + llava_image_embed_free(embeds); + return ctx_llava; +} + +static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){ + std::string user_prompt = prompt; + if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt; + + eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); + eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false); + // generate the response + + LOG_TEE("\n"); + + struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); + return ctx_sampling; +} + +static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){ + + const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); + return tmp; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) { + show_additional_info(argc, argv); + return 1; + } + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("llava", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); + llama_log_set(llama_log_callback_logTee, nullptr); +#endif // LOG_DISABLE_LOGS + + if (params.mmproj.empty() || (params.image.empty())) { + gpt_params_print_usage(argc, argv, params); + show_additional_info(argc, argv); + return 1; + } + + for (auto & image : params.image) { + int n_past = 0; + auto ctx_llava = minicpmv_init(¶ms, image, n_past); + + if (!params.prompt.empty()) { + LOG_TEE("%s\n", params.prompt.c_str()); + LOG_TEE(""); + auto ctx_sampling = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true); + const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; + std::string response = ""; + bool have_tmp = false; + for (int i = 0; i < max_tgt_len; i++) { + auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); + response += tmp; + if (strcmp(tmp, "") == 0){ + if(!have_tmp)continue; + else break; + } + if (strstr(tmp, "###")) break; // Yi-VL behavior + have_tmp = true; + printf("%s", tmp); + if (strstr(response.c_str(), "")) break; // minicpm-v + + fflush(stdout); + } + llama_sampling_free(ctx_sampling); + }else { + while (true) { + LOG_TEE(""); + std::string prompt; + std::getline(std::cin, prompt); + LOG_TEE(""); + auto ctx_sampling = llama_init(ctx_llava, ¶ms, prompt, n_past, true); + const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; + std::string response = ""; + for (int i = 0; i < max_tgt_len; i++) { + auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past); + response += tmp; + if (strcmp(tmp, "") == 0) break; + if (strstr(tmp, "###")) break; // Yi-VL behavior + printf("%s", tmp);// mistral llava-1.6 + if (strstr(response.c_str(), "")) break; // minicpm-v + fflush(stdout); + } + llama_sampling_free(ctx_sampling); + } + } + printf("\n"); + llama_print_timings(ctx_llava->ctx_llama); + + ctx_llava->model = NULL; + llava_free(ctx_llava); + } + + return 0; +} diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py new file mode 100644 index 000000000..12cdd1281 --- /dev/null +++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py @@ -0,0 +1,382 @@ +import argparse +import os +import json +import re + +import torch +import numpy as np +from gguf import * +from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig + +TEXT = "clip.text" +VISION = "clip.vision" + + +def add_key_str(raw_key: str, arch: str) -> str: + return raw_key.format(arch=arch) + + +def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool: + if name in ( + "logit_scale", + "text_model.embeddings.position_ids", + "vision_model.embeddings.position_ids", + ): + return True + + if has_minicpmv and name in ["visual_projection.weight"]: + return True + + if name.startswith("v") and not has_vision: + return True + + if name.startswith("t") and not has_text: + return True + + return False + + +def get_tensor_name(name: str) -> str: + if "projection" in name: + return name + if "mm_projector" in name: + name = name.replace("model.mm_projector", "mm") + name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1) + name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1) + return name + + return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a significant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) +ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument("--text-only", action="store_true", required=False, + help="Save a text-only model. It can't be used to encode images") +ap.add_argument("--vision-only", action="store_true", required=False, + help="Save a vision-only model. It can't be used to encode texts") +ap.add_argument("--clip-model-is-vision", action="store_true", required=False, + help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") +ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.") +ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") +ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) +# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 +default_image_mean = [0.48145466, 0.4578275, 0.40821073] +default_image_std = [0.26862954, 0.26130258, 0.27577711] +ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) +ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None) + +# with proper +args = ap.parse_args() + + +if args.text_only and args.vision_only: + print("--text-only and --image-only arguments cannot be specified at the same time.") + exit(1) + +if args.use_f32: + print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.") + +# output in the same directory as the model if output_dir is None +dir_model = args.model_dir + +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: + vocab = None + tokens = None +else: + with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: + vocab = json.load(f) + tokens = [key for key in vocab] + +# possible data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 +# +# map from ftype to string +ftype_str = ["f32", "f16"] + +ftype = 1 +if args.use_f32: + ftype = 0 + +# if args.clip_model_is_vision or args.clip_model_is_openclip: +# model = CLIPVisionModel.from_pretrained(dir_model) +# processor = None +# else: +# model = CLIPModel.from_pretrained(dir_model) +# processor = CLIPProcessor.from_pretrained(dir_model) + +default_vision_config = { + "hidden_size": 1152, + "image_size": 980, + "intermediate_size": 4304, + "model_type": "idefics2", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "patch_size": 14, + } +vision_config = Idefics2VisionConfig(**default_vision_config) +model = Idefics2VisionTransformer(vision_config) + +processor = None +# if model.attn_pool is not None: +# model.attn_pool = torch.nn.Identity() + +# model.blocks = model.blocks[:-1] +model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip"))) + +fname_middle = None +has_text_encoder = True +has_vision_encoder = True +has_minicpmv_projector = False +if args.text_only: + fname_middle = "text-" + has_vision_encoder = False +elif args.minicpmv_projector is not None: + fname_middle = "mmproj-" + has_text_encoder = False + has_minicpmv_projector = True +elif args.vision_only: + fname_middle = "vision-" + has_text_encoder = False +else: + fname_middle = "" + +output_dir = args.output_dir if args.output_dir is not None else dir_model +os.makedirs(output_dir, exist_ok=True) +output_prefix = os.path.basename(output_dir).replace("ggml_", "") +fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") +fout = GGUFWriter(path=fname_out, arch="clip") + +fout.add_bool("clip.has_text_encoder", has_text_encoder) +fout.add_bool("clip.has_vision_encoder", has_vision_encoder) +fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector) +fout.add_file_type(ftype) +if args.text_only: + fout.add_description("text-only CLIP model") +elif args.vision_only and not has_minicpmv_projector: + fout.add_description("vision-only CLIP model") +elif has_minicpmv_projector: + fout.add_description("image encoder for MiniCPM-V") + # add projector type + fout.add_string("clip.projector_type", "resampler") +else: + fout.add_description("two-tower CLIP model") + +if has_vision_encoder: + # vision_model hparams + fout.add_uint32("clip.vision.image_size", 448) + fout.add_uint32("clip.vision.patch_size", 14) + fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152) + fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304) + fout.add_uint32("clip.vision.projection_dim", 0) + fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16) + fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) + block_count = 26 + fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count) + + if processor is not None: + image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean + image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std + else: + image_mean = args.image_mean if args.image_mean is not None else default_image_mean + image_std = args.image_std if args.image_std is not None else default_image_std + fout.add_array("clip.vision.image_mean", image_mean) + fout.add_array("clip.vision.image_std", image_std) + +use_gelu = True +fout.add_bool("clip.use_gelu", use_gelu) + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. / 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + +def _replace_name_resampler(s, v): + if re.match("resampler.pos_embed", s): + return { + s: v, + re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))), + } + if re.match("resampler.proj", s): + return { + re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))), + re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(), + } + if re.match("resampler.attn.in_proj_.*", s): + return { + re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0], + re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1], + re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2], + } + return {s: v} + +if has_minicpmv_projector: + projector = torch.load(args.minicpmv_projector) + new_state_dict = {} + for k, v in projector.items(): + kvs = _replace_name_resampler(k, v) + for nk, nv in kvs.items(): + new_state_dict[nk] = nv + projector = new_state_dict + ftype_cur = 0 + for name, data in projector.items(): + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + if ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + fout.add_tensor(name, data) + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + + print("Projector tensors added\n") + +def _replace_name(s, v): + s = "vision_model." + s + if re.match("vision_model.embeddings.position_embedding", s): + v = v.unsqueeze(0) + return {s: v} + + return {s: v} + +state_dict = model.state_dict() +new_state_dict = {} +for k, v in state_dict.items(): + kvs = _replace_name(k, v) + for nk, nv in kvs.items(): + new_state_dict[nk] = nv +state_dict = new_state_dict +for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) + + +fout.write_header_to_file() +fout.write_kv_data_to_file() +fout.write_tensors_to_file() +fout.close() + +print("Done. Output file: " + fname_out) diff --git a/examples/llava/minicpmv-surgery.py b/examples/llava/minicpmv-surgery.py new file mode 100644 index 000000000..2b6bce7cf --- /dev/null +++ b/examples/llava/minicpmv-surgery.py @@ -0,0 +1,47 @@ +import argparse +import os +import torch +from transformers import AutoModel, AutoTokenizer + +ap = argparse.ArgumentParser() +ap.add_argument("-m", "--model", help="Path to MiniCPM-V-2.5 model") +args = ap.parse_args() + +# find the model part that includes the the multimodal projector weights +model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True) +checkpoint = model.state_dict() + +# get a list of mm tensor names +mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")] + +# store these tensors in a new dictionary and torch.save them +projector = {name: checkpoint[name].float() for name in mm_tensors} +torch.save(projector, f"{args.model}/minicpmv.projector") + +clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")] +if len(clip_tensors) > 0: + clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors} + torch.save(clip, f"{args.model}/minicpmv.clip") + + # added tokens should be removed to be able to convert Mistral models + if os.path.exists(f"{args.model}/added_tokens.json"): + with open(f"{args.model}/added_tokens.json", "w") as f: + f.write("{}\n") + +config = model.llm.config +config._name_or_path = "openbmb/MiniCPM-Llama3-V-2.5" +config.auto_map = { + "AutoConfig": "configuration_minicpm.MiniCPMConfig", + "AutoModel": "modeling_minicpm.MiniCPMModel", + "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM", + "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM", + "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification" +} +model.llm.save_pretrained(f"{args.model}/model") +tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) +tok.save_pretrained(f"{args.model}/model") +# os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM_l3/modeling_minicpm.py") + +print("Done!") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") +print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.") diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt index fbfd0cdd7..dfe5fbe62 100644 --- a/examples/llava/requirements.txt +++ b/examples/llava/requirements.txt @@ -2,3 +2,4 @@ --extra-index-url https://download.pytorch.org/whl/cpu pillow~=10.2.0 torch~=2.2.1 +torchvision==0.17.1 diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index fb20ad93f..81cf1629c 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -58,11 +58,11 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model * model = NULL; - llama_context * ctx = NULL; - // load the target model - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; // Tokenize the prompt std::vector inp; diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index d713f6f21..5f04709f5 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -22,11 +22,11 @@ int main(int argc, char ** argv){ llama_backend_init(); llama_numa_init(params.numa); - llama_model * model = NULL; - llama_context * ctx = NULL; - // load the model - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; GGML_ASSERT(model != nullptr); // tokenize the prompt diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 0b171c872..400f3e0b0 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -26,12 +26,11 @@ int main(int argc, char ** argv){ llama_backend_init(); llama_numa_init(params.numa); - llama_model * model = NULL; - llama_context * ctx = NULL; - // load the model - std::tie(model, ctx) = llama_init_from_gpt_params(params); - GGML_ASSERT(llama_n_vocab(model) < (1 << 16)); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; // tokenize the prompt std::vector inp; @@ -65,7 +64,7 @@ int main(int argc, char ** argv){ } const int n_input = inp.size(); - const int n_ctx = params.n_ctx; + const int n_ctx = llama_n_ctx(ctx); int n_drafted = 0; int n_accept = 0; diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 80ecd925d..d53a9828c 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -34,12 +34,11 @@ int main(int argc, char ** argv){ llama_backend_init(); llama_numa_init(params.numa); - llama_model * model = NULL; - llama_context * ctx = NULL; - // load the model - std::tie(model, ctx) = llama_init_from_gpt_params(params); - GGML_ASSERT(llama_n_vocab(model) < (1 << 16)); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; // tokenize the prompt std::vector inp; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index a0d817b1a..6e0635a66 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -124,6 +124,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vectorchat_template, chat_msgs, new_msg, role == "user"); chat_msgs.push_back({role, content}); + LOG("formatted: %s\n", formatted.c_str()); return formatted; } @@ -206,7 +207,10 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + model = llama_init.model; + ctx = llama_init.context; if (sparams.cfg_scale > 1.f) { struct llama_context_params lparams = llama_context_params_from_gpt_params(params); ctx_guidance = llama_new_context_with_model(model, lparams); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7faeaec97..621a1c959 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -129,11 +129,11 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model * model = NULL; - llama_context * ctx = NULL; - // load the target model - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; // load the prompts from an external file if there are any if (params.prompt.empty()) { diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index dbe445391..372684f09 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2018,11 +2018,11 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model * model; - llama_context * ctx; - // load the model and apply lora adapter, if any - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; diff --git a/examples/pydantic_models_to_grammar_examples.py b/examples/pydantic_models_to_grammar_examples.py old mode 100644 new mode 100755 index 504ed98df..eb000d5cc --- a/examples/pydantic_models_to_grammar_examples.py +++ b/examples/pydantic_models_to_grammar_examples.py @@ -1,8 +1,15 @@ -# Function calling example using pydantic models. +#!/usr/bin/env python3 + +"""Function calling example using pydantic models.""" + from __future__ import annotations +import argparse import datetime import json +import logging +import textwrap +import sys from enum import Enum from typing import Optional, Union @@ -12,30 +19,54 @@ from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation) -# Function to get completion on the llama.cpp server with grammar. -def create_completion(prompt, grammar): +def create_completion(host, prompt, gbnf_grammar): + """Calls the /completion API on llama-server. + + See + https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints + """ + print(f" Request:\n Grammar:\n{textwrap.indent(gbnf_grammar, ' ')}\n Prompt:\n{textwrap.indent(prompt.rstrip(), ' ')}") headers = {"Content-Type": "application/json"} - data = {"prompt": prompt, "grammar": grammar} - - response = requests.post("http://127.0.0.1:8080/completion", headers=headers, json=data) - data = response.json() - + data = {"prompt": prompt, "grammar": gbnf_grammar} + result = requests.post(f"http://{host}/completion", headers=headers, json=data).json() assert data.get("error") is None, data - - print(data["content"]) - return data["content"] + logging.info("Result: %s", result) + content = result["content"] + print(f" Model: {result['model']}") + print(f" Result:\n{textwrap.indent(json.dumps(json.loads(content), indent=2), ' ')}") + return content # A function for the agent to send a message to the user. class SendMessageToUser(BaseModel): - """ - Send a message to the User. - """ + """Send a message to the User.""" chain_of_thought: str = Field(..., description="Your chain of thought while sending the message.") message: str = Field(..., description="Message you want to send to the user.") def run(self): - print(self.message) + print(f"SendMessageToUser: {self.message}") + + +def example_rce(host): + """Minimal test case where the LLM call an arbitrary python function.""" + print("- example_rce") + tools = [SendMessageToUser] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=tools, outer_object_name="function", + outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") + system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation + user_message = "What is 42 * 42?" + prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + tools_map = {tool.__name__:tool for tool in tools} + # This finds "SendMessageToUser": + tool = tools_map.get(json_data["function"]) + if not tool: + print(f"Error: unknown tool {json_data['function']}") + return 1 + tool(**json_data["function_parameters"]).run() + return 0 # Enum for the calculator tool. @@ -46,11 +77,11 @@ class MathOperation(Enum): DIVIDE = "divide" -# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt. +# Simple pydantic calculator tool for the agent that can add, subtract, +# multiply, and divide. Docstring and description of fields will be used in +# system prompt. class Calculator(BaseModel): - """ - Perform a math operation on two numbers. - """ + """Perform a math operation on two numbers.""" number_one: Union[int, float] = Field(..., description="First number.") operation: MathOperation = Field(..., description="Math operation to perform.") number_two: Union[int, float] = Field(..., description="Second number.") @@ -68,55 +99,61 @@ class Calculator(BaseModel): raise ValueError("Unknown operation.") -# Here the grammar gets generated by passing the available function models to generate_gbnf_grammar_and_documentation function. This also generates a documentation usable by the LLM. -# pydantic_model_list is the list of pydanitc models -# outer_object_name is an optional name for an outer object around the actual model object. Like a "function" object with "function_parameters" which contains the actual model object. If None, no outer object will be generated -# outer_object_content is the name of outer object content. -# model_prefix is the optional prefix for models in the documentation. (Default="Output Model") -# fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields") -gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( - pydantic_model_list=[SendMessageToUser, Calculator], outer_object_name="function", - outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") +def example_calculator(host): + """Have the LLM ask to get a calculation done. -print(gbnf_grammar) -print(documentation) + Here the grammar gets generated by passing the available function models to + generate_gbnf_grammar_and_documentation function. This also generates a + documentation usable by the LLM. -system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation + pydantic_model_list is the list of pydantic models outer_object_name is an + optional name for an outer object around the actual model object. Like a + "function" object with "function_parameters" which contains the actual model + object. If None, no outer object will be generated outer_object_content is + the name of outer object content. -user_message = "What is 42 * 42?" -prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant" - -text = create_completion(prompt=prompt, grammar=gbnf_grammar) -# This should output something like this: -# { -# "function": "calculator", -# "function_parameters": { -# "number_one": 42, -# "operation": "multiply", -# "number_two": 42 -# } -# } -function_dictionary = json.loads(text) -if function_dictionary["function"] == "calculator": - function_parameters = {**function_dictionary["function_parameters"]} - - print(Calculator(**function_parameters).run()) - # This should output: 1764 + model_prefix is the optional prefix for models in the documentation. (Default="Output Model") + fields_prefix is the prefix for the model fields in the documentation. (Default="Output Fields") + """ + print("- example_calculator") + tools = [SendMessageToUser, Calculator] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=tools, outer_object_name="function", + outer_object_content="function_parameters", model_prefix="Function", fields_prefix="Parameters") + system_message = "You are an advanced AI, tasked to assist the user by calling functions in JSON format. The following are the available functions and their parameters and types:\n\n" + documentation + user_message1 = "What is 42 * 42?" + prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message1}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + expected = { + "function": "Calculator", + "function_parameters": { + "number_one": 42, + "operation": "multiply", + "number_two": 42 + } + } + if json_data != expected: + print(" Result is not as expected!") + tools_map = {tool.__name__:tool for tool in tools} + # This finds "Calculator": + tool = tools_map.get(json_data["function"]) + if not tool: + print(f"Error: unknown tool {json_data['function']}") + return 1 + result = tool(**json_data["function_parameters"]).run() + print(f" Call {json_data['function']} gave result {result}") + return 0 -# A example structured output based on pydantic models. The LLM will create an entry for a Book database out of an unstructured text. class Category(Enum): - """ - The category of the book. - """ + """The category of the book.""" Fiction = "Fiction" NonFiction = "Non-Fiction" class Book(BaseModel): - """ - Represents an entry about a book. - """ + """Represents an entry about a book.""" title: str = Field(..., description="Title of the book.") author: str = Field(..., description="Author of the book.") published_year: Optional[int] = Field(..., description="Publishing year of the book.") @@ -125,33 +162,42 @@ class Book(BaseModel): summary: str = Field(..., description="Summary of the book.") -# We need no additional parameters other than our list of pydantic models. -gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation([Book]) +def example_struct(host): + """A example structured output based on pydantic models. -system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation + The LLM will create an entry for a Book database out of an unstructured + text. We need no additional parameters other than our list of pydantic + models. + """ + print("- example_struct") + tools = [Book] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(pydantic_model_list=tools) + system_message = "You are an advanced AI, tasked to create a dataset entry in JSON for a Book. The following is the expected output model:\n\n" + documentation + text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands.""" + prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + # In this case, there's no function nor function_parameters. + # Here the result will vary based on the LLM used. + keys = sorted(["title", "author", "published_year", "keywords", "category", "summary"]) + if keys != sorted(json_data.keys()): + print(f"Unexpected result: {sorted(json_data.keys())}") + return 1 + book = Book(**json_data) + print(f" As a Book object: %s" % book) + return 0 -text = """The Feynman Lectures on Physics is a physics textbook based on some lectures by Richard Feynman, a Nobel laureate who has sometimes been called "The Great Explainer". The lectures were presented before undergraduate students at the California Institute of Technology (Caltech), during 1961–1963. The book's co-authors are Feynman, Robert B. Leighton, and Matthew Sands.""" -prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" - -text = create_completion(prompt=prompt, grammar=gbnf_grammar) - -json_data = json.loads(text) - -print(Book(**json_data)) -# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition. def get_current_datetime(output_format: Optional[str] = None): - """ - Get the current date and time in the given format. + """Get the current date and time in the given format. + Args: output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S' """ - if output_format is None: - output_format = '%Y-%m-%d %H:%M:%S' - return datetime.datetime.now().strftime(output_format) + return datetime.datetime.now().strftime(output_format or "%Y-%m-%d %H:%M:%S") -# Example function to get the weather +# Example function to get the weather. def get_current_weather(location, unit): """Get the current weather in a given location""" if "London" in location: @@ -160,68 +206,107 @@ def get_current_weather(location, unit): return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value}) elif "North Pole" in location: return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value}) - else: - return json.dumps({"location": location, "temperature": "unknown"}) + return json.dumps({"location": location, "temperature": "unknown"}) -# Here is a function definition in OpenAI style -current_weather_tool = { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. San Francisco, CA", +def example_concurrent(host): + """An example for parallel function calling with a Python function, a pydantic + function model and an OpenAI like function definition. + """ + print("- example_concurrent") + # Function definition in OpenAI style. + current_weather_tool = { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, }, - "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + "required": ["location"], }, - "required": ["location"], }, - }, -} + } + # Convert OpenAI function definition into pydantic model. + current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool) + # Add the actual function to a pydantic model. + current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather) -# Convert OpenAI function definition into pydantic model -current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool) -# Add the actual function to a pydantic model -current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather) + # Convert normal Python function to a pydantic model. + current_datetime_model = create_dynamic_model_from_function(get_current_datetime) -# Convert normal Python function to a pydantic model -current_datetime_model = create_dynamic_model_from_function(get_current_datetime) - -tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model] + tools = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model] + gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( + pydantic_model_list=tools, outer_object_name="function", + outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True) + system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation + text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42""" + prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" + text = create_completion(host, prompt, gbnf_grammar) + json_data = json.loads(text) + expected = [ + { + "function": "get_current_datetime", + "params": { + "output_format": "%Y-%m-%d %H:%M:%S" + } + }, + { + "function": "get_current_weather", + "params": { + "location": "London", + "unit": "celsius" + } + }, + { + "function": "Calculator", + "params": { + "number_one": 42, + "operation": "multiply", + "number_two": 42 + } + } + ] + res = 0 + if json_data != expected: + print(" Result is not as expected!") + print(" This can happen on highly quantized models") + res = 1 + tools_map = {tool.__name__:tool for tool in tools} + for call in json_data: + tool = tools_map.get(call["function"]) + if not tool: + print(f"Error: unknown tool {call['function']}") + return 1 + result = tool(**call["params"]).run() + print(f" Call {call['function']} returned {result}") + # Should output something like this: + # Call get_current_datetime returned 2024-07-15 09:50:38 + # Call get_current_weather returned {"location": "London", "temperature": "42", "unit": "celsius"} + # Call Calculator returned 1764 + return res -gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation( - pydantic_model_list=tool_list, outer_object_name="function", - outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True) - -system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation +def main(): + parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__) + parser.add_argument("--host", default="localhost:8080", help="llama.cpp server") + parser.add_argument("-v", "--verbose", action="store_true", help="enables logging") + args = parser.parse_args() + logging.basicConfig(level=logging.INFO if args.verbose else logging.ERROR) + ret = 0 + # Comment out below to only run the example you want. + ret = ret or example_rce(args.host) + ret = ret or example_calculator(args.host) + ret = ret or example_struct(args.host) + ret = ret or example_concurrent(args.host) + return ret -text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42""" -prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant" - -text = create_completion(prompt=prompt, grammar=gbnf_grammar) - -json_data = json.loads(text) - -print(json_data) -# Should output something like this: -# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}] - - -for call in json_data: - if call["function"] == "Calculator": - print(Calculator(**call["params"]).run()) - elif call["function"] == "get_current_datetime": - print(current_datetime_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue] - elif call["function"] == "get_current_weather": - print(current_weather_tool_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue] -# Should output something like this: -# 2024-01-14 13:36:06 -# {"location": "London", "temperature": "42", "unit": "celsius"} -# 1764 +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 1578c4afb..7312309ae 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -16,44 +16,44 @@ struct quant_option { }; static const std::vector QUANT_OPTIONS = { - { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, - { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", }, - { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", }, - { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", }, - { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", }, - { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", }, - { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", }, - { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, - { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, - { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, - { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, - { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, - { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", }, - { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, - { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, - { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, - { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", }, - { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", }, - { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", }, - { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", }, - { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, - { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, - { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, - { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", }, - { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", }, - { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, - { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", }, - { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, - { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, - { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, - { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, - { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, - { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, - { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, - { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, - { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, + { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, + { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", }, + { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", }, + { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", }, + { "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", }, + { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", }, + { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", }, + { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, + { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, + { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, + { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, + { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, + { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, + { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, + { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, + { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, + { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", }, + { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", }, + { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", }, + { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", }, + { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, + { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, + { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, + { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", }, + { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", }, + { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, + { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", }, + { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, + { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, + { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, + { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, + { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, + { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, + { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, + { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, + { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. - { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, + { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file"; @@ -91,7 +91,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp } // usage: -// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] +// ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // [[noreturn]] static void usage(const char * executable) { diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index eb89d16da..65b19ce71 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -148,11 +148,12 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - llama_model * model; - llama_context * ctx; - // load the model - std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; diff --git a/examples/rpc/README.md b/examples/rpc/README.md index e1da801f2..adedc8909 100644 --- a/examples/rpc/README.md +++ b/examples/rpc/README.md @@ -1,5 +1,9 @@ ## Overview +> [!IMPORTANT] +> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and +> insecure. **Never run the RPC server on an open network or in a sensitive environment!** + The `rpc-server` allows running `ggml` backend on a remote host. The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them. This can be used for distributed LLM inference with `llama.cpp` in the following way: diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp index 7c15d2aa4..6342e6488 100644 --- a/examples/rpc/rpc-server.cpp +++ b/examples/rpc/rpc-server.cpp @@ -16,7 +16,7 @@ #include struct rpc_server_params { - std::string host = "0.0.0.0"; + std::string host = "127.0.0.1"; int port = 50052; size_t backend_mem = 0; }; @@ -114,6 +114,17 @@ int main(int argc, char * argv[]) { fprintf(stderr, "Invalid parameters\n"); return 1; } + + if (params.host != "127.0.0.1") { + fprintf(stderr, "\n"); + fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str()); + fprintf(stderr, " Never expose the RPC server to an open network!\n"); + fprintf(stderr, " This is an experimental feature and is not secure!\n"); + fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + fprintf(stderr, "\n"); + } + ggml_backend_t backend = create_backend(); if (!backend) { fprintf(stderr, "Failed to create backend\n"); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 00c2277ac..3ea7c790d 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -28,10 +28,11 @@ int main(int argc, char ** argv) { std::string result2; // init - llama_model * model; - llama_context * ctx; + llama_init_result llama_init = llama_init_from_gpt_params(params); + + llama_model * model = llama_init.model; + llama_context * ctx = llama_init.context; - std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == nullptr || ctx == nullptr) { fprintf(stderr, "%s : failed to init\n", __func__); return 1; @@ -47,7 +48,7 @@ int main(int argc, char ** argv) { // save state (rng, logits, embedding and kv_cache) to file { std::vector state_mem(llama_state_get_size(ctx)); - const size_t written = llama_state_get_data(ctx, state_mem.data()); + const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size()); FILE *fp_write = fopen("dump_state.bin", "wb"); fwrite(state_mem.data(), 1, written, fp_write); @@ -99,13 +100,16 @@ int main(int argc, char ** argv) { // load state (rng, logits, embedding and kv_cache) from file { - std::vector state_mem(llama_state_get_size(ctx2)); + std::vector state_mem; FILE * fp_read = fopen("dump_state.bin", "rb"); + fseek(fp_read, 0, SEEK_END); + state_mem.resize(ftell(fp_read)); + fseek(fp_read, 0, SEEK_SET); const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); fclose(fp_read); - if (read != llama_state_set_data(ctx2, state_mem.data())) { + if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) { fprintf(stderr, "\n%s : failed to read state\n", __func__); llama_free(ctx2); llama_free_model(model); @@ -159,13 +163,16 @@ int main(int argc, char ** argv) { // load state (rng, logits, embedding and kv_cache) from file { - std::vector state_mem(llama_state_get_size(ctx3)); + std::vector state_mem; FILE * fp_read = fopen("dump_state.bin", "rb"); + fseek(fp_read, 0, SEEK_END); + state_mem.resize(ftell(fp_read)); + fseek(fp_read, 0, SEEK_SET); const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); fclose(fp_read); - if (read != llama_state_set_data(ctx3, state_mem.data())) { + if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) { fprintf(stderr, "\n%s : failed to read state\n", __func__); llama_free(ctx3); llama_free_model(model); @@ -182,7 +189,7 @@ int main(int argc, char ** argv) { { // save kv of seq 0 std::vector seq_store(llama_state_seq_get_size(ctx3, 0)); - const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0); + const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0); if (ncopy != seq_store.size()) { fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); llama_free(ctx3); @@ -196,7 +203,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s : kv cache cleared\n", __func__); // restore kv into seq 1 - const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1); + const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1); if (nset != seq_store.size()) { fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); llama_free(ctx3); diff --git a/examples/server/README.md b/examples/server/README.md index cb45ee06d..e17595fe8 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -5,7 +5,7 @@ Fast, lightweight, pure C/C++ HTTP server based on [httplib](https://github.com/ Set of LLM REST APIs and a simple web front end to interact with llama.cpp. **Features:** - * LLM inference of F16 and quantum models on GPU and CPU + * LLM inference of F16 and quantized models on GPU and CPU * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes * Parallel decoding with multi-user support * Continuous batching @@ -15,69 +15,238 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp. The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216). -**Command line options:** +## Usage -- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response. -- `-t N`, `--threads N`: Set the number of threads to use by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores). -- `-tb N, --threads-batch N`: Set the number of threads to use by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`. -- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)` -- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). -- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused -- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused -- `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused -- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. -- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of `4096`. -- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used. -- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance. -- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048` -- `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512` -- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. -- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. -- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems -- `--numa distribute`: Spread execution evenly over all nodes -- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on -- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437 -- `--numa`: Attempt optimizations that may help on some NUMA systems. -- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. -- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. -- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600` -- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1` -- `--port`: Set the port to listen. Default: `8080` -- `--path`: Path from which to serve static files. Default: disabled -- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. -- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s. -- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled -- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error. -- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled -- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime) -- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. -- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled. -- `--grp-attn-w`: Set the group attention width to extend context size through self-extend. Used together with group attention factor `--grp-attn-n`. Default: `512` -- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1` -- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. -- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled -- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled. -- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) -- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled -- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json` -- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn` -- `--rope-freq-base N` : RoPE frequency base (default: loaded from model) -- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25) -- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation) -- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0) -- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0) -- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0) -- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls` -- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled) -- `-fa`, `--flash-attn` : enable flash attention (default: disabled). -- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`) -- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options) -- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. +``` +usage: ./llama-server [options] + +general: + + -h, --help, --usage print usage and exit + --version show version and build info + -v, --verbose print verbose information + --verbosity N set specific verbosity level (default: 0) + --verbose-prompt print a verbose prompt before generation (default: false) + --no-display-prompt don't print prompt at generation (default: false) + -co, --color colorise output to distinguish prompt and user input from generations (default: false) + -s, --seed SEED RNG seed (default: -1, use random seed for < 0) + -t, --threads N number of threads to use during generation (default: 8) + -tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads) + -td, --threads-draft N number of threads to use during generation (default: same as --threads) + -tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft) + --draft N number of tokens to draft for speculative decoding (default: 5) + -ps, --p-split N speculative decoding split probability (default: 0.1) + -lcs, --lookup-cache-static FNAME + path to static lookup cache to use for lookup decoding (not updated by generation) + -lcd, --lookup-cache-dynamic FNAME + path to dynamic lookup cache to use for lookup decoding (updated by generation) + -c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model) + -n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled) + -b, --batch-size N logical maximum batch size (default: 2048) + -ub, --ubatch-size N physical maximum batch size (default: 512) + --keep N number of tokens to keep from the initial prompt (default: 0, -1 = all) + --chunks N max number of chunks to process (default: -1, -1 = all) + -fa, --flash-attn enable Flash Attention (default: disabled) + -p, --prompt PROMPT prompt to start generation with + in conversation mode, this will be used as system prompt + (default: '') + -f, --file FNAME a file containing the prompt (default: none) + --in-file FNAME an input file (repeat to specify multiple files) + -bf, --binary-file FNAME binary file containing the prompt (default: none) + -e, --escape process escapes sequences (\n, \r, \t, \', \", \\) (default: true) + --no-escape do not process escape sequences + -ptc, --print-token-count N print token count every N tokens (default: -1) + --prompt-cache FNAME file to cache prompt state for faster startup (default: none) + --prompt-cache-all if specified, saves user input and generations to cache as well + not supported with --interactive or other interactive options + --prompt-cache-ro if specified, uses the prompt cache but does not update it + -r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode + can be specified more than once for multiple prompts + -sp, --special special tokens output enabled (default: false) + -cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix + if suffix/prefix are not specified, default chat template will be used + (default: false) + -i, --interactive run in interactive mode (default: false) + -if, --interactive-first run in interactive mode and wait for input right away (default: false) + -mli, --multiline-input allows you to write or paste multiple lines without ending each in '\' + --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string + --in-prefix STRING string to prefix user inputs with (default: empty) + --in-suffix STRING string to suffix after user inputs with (default: empty) + --spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) + +sampling: + + --samplers SAMPLERS samplers that will be used for generation in the order, separated by ';' + (default: top_k;tfs_z;typical_p;top_p;min_p;temperature) + --sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt) + --ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf) + --penalize-nl penalize newline tokens (default: false) + --temp N temperature (default: 0.8) + --top-k N top-k sampling (default: 40, 0 = disabled) + --top-p N top-p sampling (default: 0.9, 1.0 = disabled) + --min-p N min-p sampling (default: 0.1, 0.0 = disabled) + --tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled) + --typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) + --repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) + --repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) + --presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled) + --frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) + --dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled) + --dynatemp-exp N dynamic temperature exponent (default: 1.0) + --mirostat N use Mirostat sampling. + Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used. + (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) + --mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1) + --mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0) + -l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion, + i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello', + or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' + --cfg-negative-prompt PROMPT + negative prompt to use for guidance (default: '') + --cfg-negative-prompt-file FNAME + negative prompt file to use for guidance + --cfg-scale N strength of guidance (default: 1.0, 1.0 = disable) + --chat-template JINJA_TEMPLATE + set custom jinja chat template (default: template taken from model's metadata) + if suffix/prefix are specified, template will be disabled + only commonly used templates are accepted: + https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template + +grammar: + + --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') + --grammar-file FNAME file to read grammar from + -j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object + For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead + +embedding: + + --pooling {none,mean,cls,last} + pooling type for embeddings, use model default if unspecified + --attention {causal,non-causal} + attention type for embeddings, use model default if unspecified + +context hacking: + + --rope-scaling {none,linear,yarn} + RoPE frequency scaling method, defaults to linear unless specified by the model + --rope-scale N RoPE context scaling factor, expands context by a factor of N + --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model) + --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N + --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size) + --yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) + --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0) + --yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0) + --yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0) + -gan, --grp-attn-n N group-attention factor (default: 1) + -gaw, --grp-attn-w N group-attention width (default: 512.0) + -dkvc, --dump-kv-cache verbose print of the KV cache + -nkvo, --no-kv-offload disable KV offload + -ctk, --cache-type-k TYPE KV cache data type for K (default: f16) + -ctv, --cache-type-v TYPE KV cache data type for V (default: f16) + +perplexity: + + --all-logits return logits for all tokens in the batch (default: false) + --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f + --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400) + --winogrande compute Winogrande score over random tasks from datafile supplied with -f + --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0) + --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f + --multiple-choice-tasks N + number of tasks to use when computing the multiple choice score (default: 0) + --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base + --ppl-stride N stride for perplexity calculation (default: 0) + --ppl-output-type {0,1} output type for perplexity calculation (default: 0) + +parallel: + + -dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled) + -np, --parallel N number of parallel sequences to decode (default: 1) + -ns, --sequences N number of sequences to decode (default: 1) + -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled) + +multi-modality: + + --mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md + --image FILE path to an image file. use with multimodal models. Specify multiple times for batching + +backend: + + --rpc SERVERS comma separated list of RPC servers + --mlock force system to keep model in RAM rather than swapping or compressing + --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock) + --numa TYPE attempt optimizations that help on some NUMA systems + - distribute: spread execution evenly over all nodes + - isolate: only spawn threads on CPUs on the node that execution started on + - numactl: use the CPU map provided by numactl + if run without this previously, it is recommended to drop the system page cache before using this + see https://github.com/ggerganov/llama.cpp/issues/1437 + +model: + + --check-tensors check model tensor data for invalid values (default: false) + --override-kv KEY=TYPE:VALUE + advanced option to override model metadata by key. may be specified multiple times. + types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false + --lora FNAME apply LoRA adapter (implies --no-mmap) + --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap) + --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter + --control-vector FNAME add a control vector + note: this argument can be repeated to add multiple control vectors + --control-vector-scaled FNAME SCALE + add a control vector with user defined scaling SCALE + note: this argument can be repeated to add multiple scaled control vectors + --control-vector-layer-range START END + layer range to apply the control vector(s) to, start and end inclusive + -m, --model FNAME model path (default: models/$filename with filename from --hf-file + or --model-url if set, otherwise models/7B/ggml-model-f16.gguf) + -md, --model-draft FNAME draft model for speculative decoding (default: unused) + -mu, --model-url MODEL_URL model download url (default: unused) + -hfr, --hf-repo REPO Hugging Face model repository (default: unused) + -hff, --hf-file FILE Hugging Face model file (default: unused) + -hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable) + +server: + + --host HOST ip address to listen (default: 127.0.0.1) + --port PORT port to listen (default: 8080) + --path PATH path to serve static files from (default: ) + --embedding(s) restrict to only support embedding use case; use only with dedicated embedding models (default: disabled) + --api-key KEY API key to use for authentication (default: none) + --api-key-file FNAME path to file containing API keys (default: none) + --ssl-key-file FNAME path to file a PEM-encoded SSL private key + --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate + --timeout N server read/write timeout in seconds (default: 600) + --threads-http N number of threads used to process HTTP requests (default: -1) + --system-prompt-file FNAME + set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications + --log-format {text,json} + log output format: json or text (default: json) + --metrics enable prometheus compatible metrics endpoint (default: disabled) + --no-slots disables slots monitoring endpoint (default: enabled) + --slot-save-path PATH path to save slot kv cache (default: disabled) + --chat-template JINJA_TEMPLATE + set custom jinja chat template (default: template taken from model's metadata) + only commonly used templates are accepted: + https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template + -sps, --slot-prompt-similarity SIMILARITY + how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled) + --lora-init-without-apply + load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) + +logging: + + --simple-io use basic IO for better compatibility in subprocesses and limited consoles + -ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset) + --log-test Run simple logging test + --log-disable Disable trace logs + --log-enable Enable trace logs + --log-file FNAME Specify a log filename (without extension) + --log-new Create a separate new log file on start. Each log file will have unique name: "..log" + --log-append Don't truncate the old log file. +``` -**If compiled with `LLAMA_SERVER_SSL=ON`** -- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key -- `--ssl-cert-file FNAME`: path to file a PEM-encoded SSL certificate ## Build @@ -199,7 +368,8 @@ node index.js ## API Endpoints -- **GET** `/health`: Returns the current state of the server: +### GET `/health`: Returns the current state of the server + - 503 -> `{"status": "loading model"}` if the model is still being loaded. - 500 -> `{"status": "error"}` if the model failed to load. - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below. @@ -208,7 +378,7 @@ node index.js If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set. -- **POST** `/completion`: Given a `prompt`, it returns the predicted completion. +### POST `/completion`: Given a `prompt`, it returns the predicted completion. *Options:* @@ -232,7 +402,7 @@ node index.js `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity. - `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. + `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token. By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. @@ -286,7 +456,7 @@ node index.js `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values. -### Result JSON +**Response format** - Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. @@ -325,7 +495,7 @@ Notice that each `probs` is an array of length `n_probs`. - `tokens_evaluated`: Number of tokens evaluated in total from the prompt - `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`) -- **POST** `/tokenize`: Tokenize a given text. +### POST `/tokenize`: Tokenize a given text *Options:* @@ -333,13 +503,15 @@ Notice that each `probs` is an array of length `n_probs`. `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` -- **POST** `/detokenize`: Convert tokens to text. +### POST `/detokenize`: Convert tokens to text *Options:* `tokens`: Set the tokens to detokenize. -- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. +### POST `/embedding`: Generate embedding of a given text + +The same as [the embedding example](../embedding) does. *Options:* @@ -347,7 +519,9 @@ Notice that each `probs` is an array of length `n_probs`. `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. -- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream. +### POST `/infill`: For code infilling. + +Takes a prefix and a suffix and returns the predicted completion as stream. *Options:* @@ -359,7 +533,7 @@ Notice that each `probs` is an array of length `n_probs`. - **GET** `/props`: Return current server settings. -### Result JSON +**Response format** ```json { @@ -377,7 +551,9 @@ Notice that each `probs` is an array of length `n_probs`. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) - `chat_template` - the model's original Jinja2 prompt template -- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. +### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API + +Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. *Options:* @@ -429,7 +605,7 @@ Notice that each `probs` is an array of length `n_probs`. }' ``` -- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API. +### POST `/v1/embeddings`: OpenAI-compatible embeddings API *Options:* @@ -463,9 +639,9 @@ Notice that each `probs` is an array of length `n_probs`. }' ``` -- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`. +### GET `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`. -### Result JSON +**Response format** ```json [ @@ -526,7 +702,7 @@ Notice that each `probs` is an array of length `n_probs`. ] ``` -- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled: +### GET `/metrics`: Prometheus compatible metrics exporter endpoint if `--metrics` is enabled: Available metrics: - `llamacpp:prompt_tokens_total`: Number of prompt tokens processed. @@ -538,13 +714,13 @@ Available metrics: - `llamacpp:requests_processing`: Number of requests processing. - `llamacpp:requests_deferred`: Number of requests deferred. -- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file. +### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file. *Options:* `filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter. -### Result JSON +**Response format** ```json { @@ -558,13 +734,13 @@ Available metrics: } ``` -- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file. +### POST `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file. *Options:* `filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter. -### Result JSON +**Response format** ```json { @@ -578,9 +754,9 @@ Available metrics: } ``` -- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot. +### POST `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot. -### Result JSON +**Response format** ```json { @@ -589,6 +765,42 @@ Available metrics: } ``` +### GET `/lora-adapters`: Get list of all LoRA adapters + +If an adapter is disabled, the scale will be set to 0. + +**Response format** + +```json +[ + { + "id": 0, + "path": "my_adapter_1.gguf", + "scale": 0.0 + }, + { + "id": 1, + "path": "my_adapter_2.gguf", + "scale": 0.0 + } +] +``` + +### POST `/lora-adapters`: Set list of LoRA adapters + +To disable an adapter, either remove it from the list below, or set scale to 0. + +**Request format** + +To know the `id` of the adapter, use GET `/lora-adapters` + +```json +[ + {"id": 0, "scale": 0.2}, + {"id": 1, "scale": 0.8} +] +``` + ## More examples ### Change system prompt on runtime diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js index 987b9a3b8..36818f764 100644 --- a/examples/server/public/completion.js +++ b/examples/server/public/completion.js @@ -21,7 +21,7 @@ let generation_settings = null; // export async function* llama(prompt, params = {}, config = {}) { let controller = config.controller; - const api_url = config.api_url || ""; + const api_url = config.api_url?.replace(/\/+$/, '') || ""; if (!controller) { controller = new AbortController(); @@ -196,7 +196,7 @@ export const llamaComplete = async (params, controller, callback) => { // Get the model info from the server. This is useful for getting the context window and so on. export const llamaModelInfo = async (config = {}) => { if (!generation_settings) { - const api_url = config.api_url || ""; + const api_url = config.api_url?.replace(/\/+$/, '') || ""; const props = await fetch(`${api_url}/props`).then(r => r.json()); generation_settings = props.default_generation_settings; } diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index 5513e9121..c87dd8f1e 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -14,10 +14,10 @@