diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile index b75163b94..3279c8da4 100644 --- a/.devops/llama-cli-cuda.Dockerfile +++ b/.devops/llama-cli-cuda.Dockerfile @@ -23,15 +23,16 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-cli -j$(nproc) + cmake --build build --config Release --target llama-cli -j$(nproc) && \ + mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime RUN apt-get update && \ apt-get install -y libgomp1 -COPY --from=build /app/build/ggml/src/libggml.so /libggml.so -COPY --from=build /app/build/src/libllama.so /libllama.so -COPY --from=build /app/build/bin/llama-cli /llama-cli +COPY --from=build /app/lib/ / +COPY --from=build /app/build/bin/llama-cli / ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile index b5696794f..1edf75cf2 100644 --- a/.devops/llama-cli-musa.Dockerfile +++ b/.devops/llama-cli-musa.Dockerfile @@ -16,15 +16,16 @@ WORKDIR /app COPY . . RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-cli -j$(nproc) + cmake --build build --config Release --target llama-cli -j$(nproc) && \ + mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime RUN apt-get update && \ apt-get install -y libgomp1 -COPY --from=build /app/build/ggml/src/libggml.so /libggml.so -COPY --from=build /app/build/src/libllama.so /libllama.so +COPY --from=build /app/lib/ / COPY --from=build /app/build/bin/llama-cli /llama-cli ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile index a40e24205..ea07a4d52 100644 --- a/.devops/llama-server-cuda.Dockerfile +++ b/.devops/llama-server-cuda.Dockerfile @@ -23,15 +23,16 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-server -j$(nproc) + cmake --build build --config Release --target llama-server -j$(nproc) && \ + mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl -COPY --from=build /app/build/ggml/src/libggml.so /libggml.so -COPY --from=build /app/build/src/libllama.so /libllama.so +COPY --from=build /app/lib/ / COPY --from=build /app/build/bin/llama-server /llama-server # Must be set to 0.0.0.0 so it can listen to requests from host machine diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile index 193a6d77c..259877468 100644 --- a/.devops/llama-server-musa.Dockerfile +++ b/.devops/llama-server-musa.Dockerfile @@ -16,15 +16,16 @@ WORKDIR /app COPY . . RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . 
&& \ - cmake --build build --config Release --target llama-server -j$(nproc) + cmake --build build --config Release --target llama-server -j$(nproc) && \ + mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl -COPY --from=build /app/build/ggml/src/libggml.so /libggml.so -COPY --from=build /app/build/src/libllama.so /libllama.so +COPY --from=build /app/lib/ / COPY --from=build /app/build/bin/llama-server /llama-server # Must be set to 0.0.0.0 so it can listen to requests from host machine diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 5d7d7ea5a..b88e6ca80 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -126,9 +126,9 @@ effectiveStdenv.mkDerivation (finalAttrs: { }; postPatch = '' - substituteInPlace ./ggml/src/ggml-metal.m \ + substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \ --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" - substituteInPlace ./ggml/src/ggml-metal.m \ + substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \ --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";" ''; @@ -173,7 +173,7 @@ effectiveStdenv.mkDerivation (finalAttrs: { (cmakeBool "GGML_NATIVE" false) (cmakeBool "GGML_BLAS" useBlas) (cmakeBool "GGML_CUDA" useCuda) - (cmakeBool "GGML_HIPBLAS" useRocm) + (cmakeBool "GGML_HIP" useRocm) (cmakeBool "GGML_METAL" useMetalKit) (cmakeBool "GGML_VULKAN" useVulkan) (cmakeBool "GGML_STATIC" enableStatic) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 02dcee963..d6a7b66a5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -55,7 +55,13 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF .. + cmake .. \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=ON \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DGGML_RPC=ON \ + -DBUILD_SHARED_LIBS=OFF cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -113,7 +119,12 @@ jobs: sysctl -a # Metal is disabled due to intermittent failures with Github runners not having a GPU: # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 - cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF + cmake -B build \ + -DLLAMA_FATAL_WARNINGS=ON \ + -DLLAMA_CURL=ON \ + -DGGML_METAL=OFF \ + -DGGML_RPC=ON \ + -DBUILD_SHARED_LIBS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -394,13 +405,13 @@ jobs: - name: Build with native CMake HIP support id: cmake_build run: | - cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON + cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON cmake --build build --config Release -j $(nproc) - name: Build with legacy HIP support id: cmake_build_legacy_hip run: | - cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON + cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON cmake --build build2 --config Release -j $(nproc) ubuntu-22-cmake-sycl: @@ -569,6 +580,7 @@ jobs: mkdir build cd build cmake -G Xcode .. 
\ + -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -599,6 +611,7 @@ jobs: mkdir build cd build cmake -G Xcode .. \ + -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -734,7 +747,7 @@ jobs: id: clone_kompute if: ${{ matrix.build == 'kompute-x64' }} run: | - git submodule update --init ggml/src/kompute + git submodule update --init ggml/src/ggml-kompute/kompute - name: Download OpenBLAS id: get_openblas @@ -1001,7 +1014,7 @@ jobs: run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON + cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} windows-latest-cmake-hip-release: @@ -1037,7 +1050,7 @@ jobs: run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON + cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} md "build\bin\rocblas\library\" cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" diff --git a/.gitmodules b/.gitmodules index 5861d59cb..23ce5ff05 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "kompute"] - path = ggml/src/kompute + path = ggml/src/ggml-kompute/kompute url = https://github.com/nomic-ai/kompute.git diff --git a/CMakeLists.txt b/CMakeLists.txt index ef0932a7b..93c60ef43 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -140,7 +140,6 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") - # At the moment some compile definitions are placed within the ggml/src # directory but not exported on the `ggml` target. This could be improved by # determining _precisely_ which defines are necessary for the llama-config diff --git a/Makefile b/Makefile index d59111cef..87fe795aa 100644 --- a/Makefile +++ b/Makefile @@ -523,65 +523,54 @@ ifndef GGML_NO_ACCELERATE # Mac OS - include Accelerate framework. 
# `-framework Accelerate` works both with Apple Silicon and Mac Intel ifeq ($(UNAME_S),Darwin) - MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS + MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 MK_LDFLAGS += -framework Accelerate - OBJ_GGML += ggml/src/ggml-blas.o + OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o endif endif # GGML_NO_ACCELERATE -ifdef GGML_MUSA - CC := clang - CXX := clang++ - GGML_CUDA := 1 - MK_CPPFLAGS += -DGGML_USE_MUSA -endif - ifndef GGML_NO_OPENMP MK_CPPFLAGS += -DGGML_USE_OPENMP MK_CFLAGS += -fopenmp MK_CXXFLAGS += -fopenmp - ifdef GGML_MUSA - MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp - MK_LDFLAGS += -L/usr/lib/llvm-10/lib - endif # GGML_MUSA endif # GGML_NO_OPENMP ifdef GGML_OPENBLAS MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_LDFLAGS += $(shell pkg-config --libs openblas) - OBJ_GGML += ggml/src/ggml-blas.o + OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o endif # GGML_OPENBLAS ifdef GGML_OPENBLAS64 MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) MK_LDFLAGS += $(shell pkg-config --libs openblas64) - OBJ_GGML += ggml/src/ggml-blas.o + OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o endif # GGML_OPENBLAS64 ifdef GGML_BLIS MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis MK_LDFLAGS += -lblis -L/usr/local/lib - OBJ_GGML += ggml/src/ggml-blas.o + OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o endif # GGML_BLIS ifdef GGML_NVPL MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp - OBJ_GGML += ggml/src/ggml-blas.o + OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o endif # GGML_NVPL ifndef GGML_NO_LLAMAFILE MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJ_GGML += ggml/src/llamafile/sgemm.o + OBJ_GGML += ggml/src/ggml-cpu/llamafile/sgemm.o endif ifndef GGML_NO_AMX MK_CPPFLAGS += -DGGML_USE_AMX - OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o + OBJ_GGML += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o endif ifdef GGML_RPC @@ -601,29 +590,17 @@ else endif # GGML_CUDA_FA_ALL_QUANTS ifdef GGML_CUDA - ifdef GGML_MUSA - ifneq ('', '$(wildcard /opt/musa)') - CUDA_PATH ?= /opt/musa - else - CUDA_PATH ?= /usr/local/musa - endif - - MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include - MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64 - MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22 + ifneq ('', '$(wildcard /opt/cuda)') + CUDA_PATH ?= /opt/cuda else - ifneq ('', '$(wildcard /opt/cuda)') - CUDA_PATH ?= /opt/cuda - else - CUDA_PATH ?= /usr/local/cuda - endif + CUDA_PATH ?= /usr/local/cuda + endif - MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include - MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib - MK_NVCCFLAGS += -use_fast_math - endif # GGML_MUSA + MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include + MK_LDFLAGS += -lcuda -lcublas 
-lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib + MK_NVCCFLAGS += -use_fast_math - OBJ_GGML += ggml/src/ggml-cuda.o + OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML += $(OBJ_CUDA_TMPL) @@ -631,11 +608,9 @@ ifdef LLAMA_FATAL_WARNINGS MK_NVCCFLAGS += -Werror all-warnings endif # LLAMA_FATAL_WARNINGS -ifndef GGML_MUSA ifndef JETSON_EOL_MODULE_DETECT MK_NVCCFLAGS += --forward-unknown-to-host-compiler endif # JETSON_EOL_MODULE_DETECT -endif # GGML_MUSA ifdef LLAMA_DEBUG MK_NVCCFLAGS += -lineinfo @@ -648,11 +623,7 @@ endif # GGML_CUDA_DEBUG ifdef GGML_CUDA_NVCC NVCC = $(CCACHE) $(GGML_CUDA_NVCC) else - ifdef GGML_MUSA - NVCC = $(CCACHE) mcc - else - NVCC = $(CCACHE) nvcc - endif # GGML_MUSA + NVCC = $(CCACHE) nvcc endif # GGML_CUDA_NVCC ifdef CUDA_DOCKER_ARCH @@ -724,15 +695,9 @@ define NVCC_COMPILE $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ endef # NVCC_COMPILE else - ifdef GGML_MUSA -define NVCC_COMPILE - $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@ -endef # NVCC_COMPILE - else define NVCC_COMPILE $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ endef # NVCC_COMPILE - endif # GGML_MUSA endif # JETSON_EOL_MODULE_DETECT ggml/src/ggml-cuda/%.o: \ @@ -742,8 +707,8 @@ ggml/src/ggml-cuda/%.o: \ ggml/src/ggml-cuda/common.cuh $(NVCC_COMPILE) -ggml/src/ggml-cuda.o: \ - ggml/src/ggml-cuda.cu \ +ggml/src/ggml-cuda/ggml-cuda.o: \ + ggml/src/ggml-cuda/ggml-cuda.cu \ ggml/include/ggml-cuda.h \ ggml/include/ggml.h \ ggml/include/ggml-backend.h \ @@ -819,7 +784,7 @@ ifdef GGML_HIPBLAS GGML_CUDA_MMV_Y ?= 1 GGML_CUDA_KQUANTS_ITER ?= 2 - MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA + MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA ifdef GGML_HIP_UMA MK_CPPFLAGS += -DGGML_HIP_UMA @@ -852,12 +817,12 @@ ifdef GGML_CUDA_NO_PEER_COPY HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY endif # GGML_CUDA_NO_PEER_COPY - OBJ_GGML += ggml/src/ggml-cuda.o + OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML += $(OBJ_CUDA_TMPL) -ggml/src/ggml-cuda.o: \ - ggml/src/ggml-cuda.cu \ +ggml/src/ggml-cuda/ggml-cuda.o: \ + ggml/src/ggml-cuda/ggml-cuda.cu \ ggml/include/ggml-cuda.h \ ggml/include/ggml.h \ ggml/include/ggml-backend.h \ @@ -874,6 +839,107 @@ ggml/src/ggml-cuda/%.o: \ $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< endif # GGML_HIPBLAS +ifdef GGML_MUSA + ifeq ($(wildcard /opt/musa),) + MUSA_PATH ?= /usr/local/musa + else + MUSA_PATH ?= /opt/musa + endif + MTGPU_TARGETS ?= mp_21 mp_22 + + MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA + MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib + MK_LDFLAGS += -lmusa -lmusart -lmublas + + ifndef GGML_NO_OPENMP + # For Ubuntu Focal + MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp + MK_LDFLAGS += -L/usr/lib/llvm-10/lib + # For Ubuntu Jammy + MK_CPPFLAGS += -I/usr/lib/llvm-14/lib/clang/14.0.0/include + MK_LDFLAGS += -L/usr/lib/llvm-14/lib + endif # GGML_NO_OPENMP + + CC := $(MUSA_PATH)/bin/clang + CXX := $(MUSA_PATH)/bin/clang++ + MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc + + MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS)) + +ifdef GGML_CUDA_FORCE_DMMV + MUSAFLAGS += 
-DGGML_CUDA_FORCE_DMMV +endif # GGML_CUDA_FORCE_DMMV + +ifdef GGML_CUDA_FORCE_MMQ + MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ +endif # GGML_CUDA_FORCE_MMQ + +ifdef GGML_CUDA_FORCE_CUBLAS + MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS +endif # GGML_CUDA_FORCE_CUBLAS + +ifdef GGML_CUDA_DMMV_X + MUSAFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X) +else + MUSAFLAGS += -DGGML_CUDA_DMMV_X=32 +endif # GGML_CUDA_DMMV_X + +ifdef GGML_CUDA_MMV_Y + MUSAFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y) +else + MUSAFLAGS += -DGGML_CUDA_MMV_Y=1 +endif # GGML_CUDA_MMV_Y + +ifdef GGML_CUDA_F16 + MUSAFLAGS += -DGGML_CUDA_F16 +endif # GGML_CUDA_F16 + +ifdef GGML_CUDA_DMMV_F16 + MUSAFLAGS += -DGGML_CUDA_F16 +endif # GGML_CUDA_DMMV_F16 + +ifdef GGML_CUDA_KQUANTS_ITER + MUSAFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER) +else + MUSAFLAGS += -DK_QUANTS_PER_ITERATION=2 +endif + +ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE + MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE) +else + MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 +endif # GGML_CUDA_PEER_MAX_BATCH_SIZE + +ifdef GGML_CUDA_NO_PEER_COPY + MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY +endif # GGML_CUDA_NO_PEER_COPY + +ifdef GGML_CUDA_FA_ALL_QUANTS + MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS +endif # GGML_CUDA_FA_ALL_QUANTS + + OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o + OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) + OBJ_GGML += $(OBJ_CUDA_TMPL) + +ggml/src/ggml-cuda/ggml-cuda.o: \ + ggml/src/ggml-cuda/ggml-cuda.cu \ + ggml/include/ggml-cuda.h \ + ggml/include/ggml.h \ + ggml/include/ggml-backend.h \ + ggml/src/ggml-backend-impl.h \ + ggml/src/ggml-common.h \ + $(wildcard ggml/src/ggml-cuda/*.cuh) + $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $< + +ggml/src/ggml-cuda/%.o: \ + ggml/src/ggml-cuda/%.cu \ + ggml/include/ggml.h \ + ggml/src/ggml-common.h \ + ggml/src/ggml-cuda/common.cuh + $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $< +endif # GGML_MUSA + ifndef GGML_NO_CPU_AARCH64 MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64 endif @@ -881,7 +947,11 @@ endif ifdef GGML_METAL MK_CPPFLAGS += -DGGML_USE_METAL MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit - OBJ_GGML += ggml/src/ggml-metal.o + OBJ_GGML += ggml/src/ggml-metal/ggml-metal.o + +ifdef GGML_METAL_USE_BF16 + MK_CPPFLAGS += -DGGML_METAL_USE_BF16 +endif # GGML_METAL_USE_BF16 ifdef GGML_METAL_NDEBUG MK_CPPFLAGS += -DGGML_METAL_NDEBUG endif @@ -892,25 +962,25 @@ endif endif # GGML_METAL ifdef GGML_METAL -ggml/src/ggml-metal.o: \ - ggml/src/ggml-metal.m \ +ggml/src/ggml-metal/ggml-metal.o: \ + ggml/src/ggml-metal/ggml-metal.m \ ggml/include/ggml-metal.h \ ggml/include/ggml.h $(CC) $(CFLAGS) -c $< -o $@ ifdef GGML_METAL_EMBED_LIBRARY ggml/src/ggml-metal-embed.o: \ - ggml/src/ggml-metal.metal \ + ggml/src/ggml-metal/ggml-metal.metal \ ggml/src/ggml-common.h @echo "Embedding Metal library" - @sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal + @sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal $(eval TEMP_ASSEMBLY=$(shell mktemp -d)) - @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" 
>> $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s - @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s $(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@ @rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s @rmdir ${TEMP_ASSEMBLY} @@ -919,11 +989,16 @@ endif # GGML_METAL OBJ_GGML += \ ggml/src/ggml.o \ - ggml/src/ggml-cpu.o \ + ggml/src/ggml-aarch64.o \ ggml/src/ggml-alloc.o \ ggml/src/ggml-backend.o \ + ggml/src/ggml-backend-reg.o \ ggml/src/ggml-quants.o \ - ggml/src/ggml-aarch64.o + ggml/src/ggml-threading.o \ + ggml/src/ggml-cpu/ggml-cpu.o \ + ggml/src/ggml-cpu/ggml-cpu-cpp.o \ + ggml/src/ggml-cpu/ggml-cpu-aarch64.o \ + ggml/src/ggml-cpu/ggml-cpu-quants.o OBJ_LLAMA = \ src/llama.o \ @@ -997,7 +1072,6 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1)) ifdef GGML_CUDA $(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') -ifndef GGML_MUSA ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) ifndef CUDA_DOCKER_ARCH @@ -1007,7 +1081,6 @@ endif # CUDA_POWER_ARCH endif # CUDA_DOCKER_ARCH endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) -endif # GGML_MUSA endif # GGML_CUDA $(info ) @@ -1051,12 +1124,23 @@ ggml/src/ggml.o: \ ggml/include/ggml.h $(CC) $(CFLAGS) -c $< -o $@ -ggml/src/ggml-cpu.o: \ - ggml/src/ggml-cpu.c \ +ggml/src/ggml-threading.o: \ + ggml/src/ggml-threading.cpp \ + ggml/include/ggml.h + $(CXX) $(XXCFLAGS) -c $< -o $@ + +ggml/src/ggml-cpu/ggml-cpu.o: \ + ggml/src/ggml-cpu/ggml-cpu.c \ ggml/include/ggml.h \ ggml/src/ggml-common.h $(CC) $(CFLAGS) -c $< -o $@ +ggml/src/ggml-cpu/ggml-cpu-cpp.o: \ + ggml/src/ggml-cpu/ggml-cpu.cpp \ + ggml/include/ggml.h \ + ggml/src/ggml-common.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + ggml/src/ggml-alloc.o: \ ggml/src/ggml-alloc.c \ ggml/include/ggml.h \ @@ -1084,22 +1168,22 @@ ggml/src/ggml-aarch64.o: \ ggml/src/ggml-common.h $(CC) $(CFLAGS) -c $< -o $@ -ggml/src/ggml-blas.o: \ - ggml/src/ggml-blas.cpp \ +ggml/src/ggml-blas/ggml-blas.o: \ + ggml/src/ggml-blas/ggml-blas.cpp \ ggml/include/ggml-blas.h $(CXX) $(CXXFLAGS) -c $< -o $@ ifndef GGML_NO_LLAMAFILE -ggml/src/llamafile/sgemm.o: \ - ggml/src/llamafile/sgemm.cpp \ - ggml/src/llamafile/sgemm.h \ +ggml/src/ggml-cpu/llamafile/sgemm.o: \ + ggml/src/ggml-cpu/llamafile/sgemm.cpp \ + ggml/src/ggml-cpu/llamafile/sgemm.h \ ggml/include/ggml.h - $(CXX) $(CXXFLAGS) -c $< -o $@ + $(CXX) $(CXXFLAGS) -c $< -o $@ -I ggml/src -I ggml/src/ggml-cpu endif # GGML_NO_LLAMAFILE ifndef GGML_NO_AMX -ggml/src/ggml-amx.o: \ - ggml/src/ggml-amx.cpp \ +ggml/src/ggml-amx/ggml-amx.o: \ + ggml/src/ggml-amx/ggml-amx.cpp \ ggml/include/ggml-amx.h $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -1250,13 +1334,24 @@ clean: rm -rvf ggml/*.a rm -rvf ggml/*.dll rm -rvf ggml/*.so - rm -vrf ggml/src/*.o - rm -rvf ggml/src/llamafile/*.o + rm -rvf ggml/src/*.o rm -rvf common/build-info.cpp - rm -vrf ggml/src/ggml-metal-embed.metal + rm -rvf ggml/src/ggml-cpu/*.o + rm -rvf 
ggml/src/ggml-cpu/llamafile/*.o + rm -vrf ggml/src/ggml-amx/*.o + rm -vrf ggml/src/ggml-blas/*.o + rm -vrf ggml/src/ggml-cann/*.o + rm -vrf ggml/src/ggml-cpu/*.o rm -vrf ggml/src/ggml-cuda/*.o rm -vrf ggml/src/ggml-cuda/template-instances/*.o - rm -vrf ggml/src/ggml-amx/*.o + rm -vrf ggml/src/ggml-hip/*.o + rm -vrf ggml/src/ggml-kompute/*.o + rm -vrf ggml/src/ggml-metal/*.o + rm -vrf ggml/src/ggml-metal/ggml-metal-embed.metal + rm -vrf ggml/src/ggml-rpc/*.o + rm -vrf ggml/src/ggml-sycl/*.o + rm -vrf ggml/src/ggml-vulkan/*.o + rm -vrf ggml/src/ggml-musa/*.o rm -rvf $(BUILD_TARGETS) rm -rvf $(TEST_TARGETS) rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp diff --git a/Package.swift b/Package.swift index 0f4f19018..6b68aecde 100644 --- a/Package.swift +++ b/Package.swift @@ -10,11 +10,16 @@ var sources = [ "src/unicode.cpp", "src/unicode-data.cpp", "ggml/src/ggml.c", - "ggml/src/ggml-cpu.c", + "ggml/src/ggml-aarch64.c", "ggml/src/ggml-alloc.c", "ggml/src/ggml-backend.cpp", + "ggml/src/ggml-backend-reg.cpp", + "ggml/src/ggml-cpu/ggml-cpu.c", + "ggml/src/ggml-cpu/ggml-cpu.cpp", + "ggml/src/ggml-cpu/ggml-cpu-aarch64.c", + "ggml/src/ggml-cpu/ggml-cpu-quants.c", + "ggml/src/ggml-threading.cpp", "ggml/src/ggml-quants.c", - "ggml/src/ggml-aarch64.c", ] var resources: [Resource] = [] @@ -22,6 +27,7 @@ var linkerSettings: [LinkerSetting] = [] var cSettings: [CSetting] = [ .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), .unsafeFlags(["-fno-objc-arc"]), + .headerSearchPath("ggml/src"), // NOTE: NEW_LAPACK will required iOS version 16.4+ // We should consider add this in the future when we drop support for iOS 14 // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) @@ -30,8 +36,9 @@ var cSettings: [CSetting] = [ ] #if canImport(Darwin) -sources.append("ggml/src/ggml-metal.m") -resources.append(.process("ggml/src/ggml-metal.metal")) +sources.append("ggml/src/ggml-common.h") +sources.append("ggml/src/ggml-metal/ggml-metal.m") +resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal")) linkerSettings.append(.linkedFramework("Accelerate")) cSettings.append( contentsOf: [ diff --git a/README.md b/README.md index 0378a674e..6ab6acf12 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,7 @@ Typically finetunes of the base models below are supported as well. - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) +- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama) - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) diff --git a/ci/run.sh b/ci/run.sh index 21b62dd1e..20610e560 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -39,7 +39,7 @@ SRC=`pwd` CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON" if [ ! -z ${GG_BUILD_METAL} ]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON" + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON" fi if [ ! 
-z ${GG_BUILD_CUDA} ]; then diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in index f072b76a3..ef68417b4 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/llama-config.cmake.in @@ -6,7 +6,7 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) set(GGML_BLAS @GGML_BLAS@) set(GGML_CUDA @GGML_CUDA@) set(GGML_METAL @GGML_METAL@) -set(GGML_HIPBLAS @GGML_HIPBLAS@) +set(GGML_HIP @GGML_HIP@) set(GGML_ACCELERATE @GGML_ACCELERATE@) set(GGML_VULKAN @GGML_VULKAN@) set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@) diff --git a/common/common.cpp b/common/common.cpp index 19674af15..ebd16b600 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1967,18 +1967,13 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false"); - fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false"); - fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false"); fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); - fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false"); fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false"); fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); - fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false"); diff --git a/common/common.h b/common/common.h index 727f85baa..6289feaeb 100644 --- a/common/common.h +++ b/common/common.h @@ -178,7 +178,7 @@ struct common_params { float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length - float defrag_thold = -1.0f; // KV cache defragmentation threshold + float defrag_thold = 0.1f; // KV cache defragmentation threshold struct cpu_params cpuparams; struct cpu_params cpuparams_batch; diff --git a/docs/build.md b/docs/build.md index 4e362ebc7..95512415a 100644 --- a/docs/build.md +++ b/docs/build.md @@ -230,7 +230,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): ```bash HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ - cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + cmake -S . 
-B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ && cmake --build build --config Release -- -j 16 ``` On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`. @@ -247,7 +247,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm ```bash HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \ HIP_DEVICE_LIB_PATH= \ - cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ && cmake --build build -- -j 16 ``` @@ -259,7 +259,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): ```bash set PATH=%HIP_PATH%\bin;%PATH% - cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release + cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release cmake --build build ``` Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh index d9cab9836..9d761ebb8 100755 --- a/examples/chat-persistent.sh +++ b/examples/chat-persistent.sh @@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin" NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt" NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin" -SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+' -SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+' +SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\ +'|'\ +'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+' SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d" CTX_SIZE=2048 @@ -129,15 +130,12 @@ while read -e line; do printf ' ' - # HACK get num tokens from debug message - # TODO get both messages in one go - if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" || - ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then + if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then echo >&2 "Couldn't get number of tokens from ./llama-cli output!" 
exit 1 fi - n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg"))) + n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")") if ((n_tokens > CTX_ROTATE_POINT)); then tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE" diff --git a/examples/convert_legacy_llama.py b/examples/convert_legacy_llama.py index 9ab9ab06e..c4ec5c524 100755 --- a/examples/convert_legacy_llama.py +++ b/examples/convert_legacy_llama.py @@ -840,6 +840,8 @@ class OutputFile: self.gguf.add_base_model_version(key, base_model_entry["version"]) if "organization" in base_model_entry: self.gguf.add_base_model_organization(key, base_model_entry["organization"]) + if "description" in base_model_entry: + self.gguf.add_base_model_description(key, base_model_entry["description"]) if "url" in base_model_entry: self.gguf.add_base_model_url(key, base_model_entry["url"]) if "doi" in base_model_entry: @@ -849,12 +851,32 @@ class OutputFile: if "repo_url" in base_model_entry: self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"]) + if metadata.datasets is not None: + self.gguf.add_dataset_count(len(metadata.datasets)) + for key, dataset_entry in enumerate(metadata.datasets): + if "name" in dataset_entry: + self.gguf.add_dataset_name(key, dataset_entry["name"]) + if "author" in dataset_entry: + self.gguf.add_dataset_author(key, dataset_entry["author"]) + if "version" in dataset_entry: + self.gguf.add_dataset_version(key, dataset_entry["version"]) + if "organization" in dataset_entry: + self.gguf.add_dataset_organization(key, dataset_entry["organization"]) + if "description" in dataset_entry: + self.gguf.add_dataset_description(key, dataset_entry["description"]) + if "url" in dataset_entry: + self.gguf.add_dataset_url(key, dataset_entry["url"]) + if "doi" in dataset_entry: + self.gguf.add_dataset_doi(key, dataset_entry["doi"]) + if "uuid" in dataset_entry: + self.gguf.add_dataset_uuid(key, dataset_entry["uuid"]) + if "repo_url" in dataset_entry: + self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"]) + if metadata.tags is not None: self.gguf.add_tags(metadata.tags) if metadata.languages is not None: self.gguf.add_languages(metadata.languages) - if metadata.datasets is not None: - self.gguf.add_datasets(metadata.datasets) def add_meta_arch(self, params: Params) -> None: # Metadata About The Neural Architecture Itself diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 1eddfd0db..8f4e0e206 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -774,13 +774,6 @@ static std::vector get_cmd_params_instances(const cmd_param struct test { static const std::string build_commit; static const int build_number; - static const bool cuda; - static const bool vulkan; - static const bool kompute; - static const bool metal; - static const bool sycl; - static const bool gpu_blas; - static const bool blas; static const std::string cpu_info; static const std::string gpu_info; std::string model_filename; @@ -793,7 +786,6 @@ struct test { std::string cpu_mask; bool cpu_strict; int poll; - bool has_rpc; ggml_type type_k; ggml_type type_v; int n_gpu_layers; @@ -822,7 +814,6 @@ struct test { cpu_mask = inst.cpu_mask; cpu_strict = inst.cpu_strict; poll = inst.poll; - has_rpc = !inst.rpc_servers.empty(); type_k = inst.type_k; type_v = inst.type_v; n_gpu_layers = inst.n_gpu_layers; @@ -881,8 +872,7 @@ struct test { static const std::vector & get_fields() { static 
const std::vector fields = { "build_commit", "build_number", - "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas", - "cpu_info", "gpu_info", + "cpu_info", "gpu_info", "backends", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", @@ -908,8 +898,7 @@ struct test { field == "avg_ns" || field == "stddev_ns") { return INT; } - if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" || - field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || + if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || field == "use_mmap" || field == "embeddings") { return BOOL; @@ -938,9 +927,7 @@ struct test { } std::vector values = { build_commit, std::to_string(build_number), - std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan), - std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas), - cpu_info, gpu_info, + cpu_info, gpu_info, get_backend(), model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), std::to_string(n_batch), std::to_string(n_ubatch), std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll), @@ -967,13 +954,6 @@ struct test { const std::string test::build_commit = LLAMA_COMMIT; const int test::build_number = LLAMA_BUILD_NUMBER; -const bool test::cuda = !!ggml_cpu_has_cuda(); -const bool test::vulkan = !!ggml_cpu_has_vulkan(); -const bool test::kompute = !!ggml_cpu_has_kompute(); -const bool test::metal = !!ggml_cpu_has_metal(); -const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); -const bool test::blas = !!ggml_cpu_has_blas(); -const bool test::sycl = !!ggml_cpu_has_sycl(); const std::string test::cpu_info = get_cpu_info(); const std::string test::gpu_info = get_gpu_info(); @@ -1178,7 +1158,8 @@ struct markdown_printer : public printer { fields.emplace_back("size"); fields.emplace_back("params"); fields.emplace_back("backend"); - bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS"; + bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos || + test::get_backend().find("BLAS") != std::string::npos; if (!is_cpu_backend) { fields.emplace_back("n_gpu_layers"); } @@ -1268,9 +1249,6 @@ struct markdown_printer : public printer { value = buf; } else if (field == "backend") { value = test::get_backend(); - if (t.has_rpc) { - value += "+RPC"; - } } else if (field == "test") { if (t.n_prompt > 0 && t.n_gen == 0) { snprintf(buf, sizeof(buf), "pp%d", t.n_prompt); diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index e372856c6..912caf346 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) { } static void test_roundtrip_on_chunk( - const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference, + const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference, float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats ) { if (layer->type == GGML_TYPE_F16) { @@ -156,7 +156,7 @@ static void test_roundtrip_on_chunk( if (use_reference) { 
qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size); } else { - qfns.from_float(input_scratch, quantized_scratch, chunk_size); + qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size); } qfns.to_float(quantized_scratch, output_scratch, chunk_size); @@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk( // Run quantization function for a single layer and update error stats static void test_roundtrip_on_layer( - std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference, + std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference, const ggml_tensor * layer, std::vector & input_scratch, std::vector & quantized_scratch, std::vector & output_scratch, error_stats & total_error, int max_thread = 0 ) { @@ -187,13 +187,13 @@ static void test_roundtrip_on_layer( int num_chunks = (nelements + chunk_size - 1)/chunk_size; if (num_chunks < 2 || max_thread < 2) { - test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(), + test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(), output_scratch.data(), print_layer_stats ? layer_error : total_error); } else { auto & stats = print_layer_stats ? layer_error : total_error; std::mutex mutex; uint64_t counter = 0; - auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr, + auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr, &quantized_scratch, &output_scratch, chunk_size] () { error_stats local_stats {}; while (true) { @@ -205,7 +205,7 @@ static void test_roundtrip_on_layer( } lock.unlock(); uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset; - test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset, + test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset, quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats); } }; @@ -371,8 +371,9 @@ int main(int argc, char ** argv) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } - const auto * qfns = ggml_get_type_traits(type); - if (qfns->from_float && qfns->to_float) { + const auto * qfns = ggml_get_type_traits(type); + const auto * qfns_cpu = ggml_get_type_traits_cpu(type); + if (qfns_cpu->from_float && qfns->to_float) { if (params.verbose) { printf("testing %s ...\n", ggml_type_name(type)); } @@ -393,7 +394,7 @@ int main(int argc, char ** argv) { test_roundtrip_on_layer( layer_name, params.per_layer_stats, - *qfns, + *qfns, *qfns_cpu, params.reference, kv_tensor.second, input_scratch, diff --git a/examples/server/README.md b/examples/server/README.md index 562494077..6f72c6bb8 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -39,7 +39,7 @@ The project is under active development, and we are [looking for feedback and co | `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | | `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)
|
 | `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
-| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
+| `-c, --ctx-size N` | size of the prompt context (default: 4096, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
 | `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
 | `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
 | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
@@ -64,7 +64,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
 | `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
 | `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
-| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
+| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
 | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
@@ -99,25 +99,27 @@ The project is under active development, and we are [looking for feedback and co
 | Argument | Explanation |
 | -------- | ----------- |
-| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;typ_p;top_p;min_p;temperature) |
+| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: dry;top_k;typ_p;top_p;min_p;xtc;temperature) |
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
-| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
+| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
 | `--penalize-nl` | penalize newline tokens (default: false) |
 | `--temp N` | temperature (default: 0.8) |
 | `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
 | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
 | `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
+| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
+| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
 | `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
 | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
 | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
 | `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
 | `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
-| `--dry-multiplier N` | DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
-| `--dry-base N` | DRY sampling base value (default: 1.75) |
-| `--dry-allowed-length N` | allowed length for DRY sampling (default: 2) |
-| `--dry-penalty-last-n N` | DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
-| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers (`['\n', ':', '"', '*']`) in the process; use `"none"` to not use any sequence breakers
+| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
+| `--dry-base N` | set DRY sampling base value (default: 1.75) |
+| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
+| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
+| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers<br/> |
 | `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
 | `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
 | `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | @@ -381,6 +383,10 @@ node index.js `dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']` + `xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled. + + `xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC) + `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0. `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0` @@ -409,7 +415,7 @@ node index.js `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false` - `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values. + `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. **Response format** diff --git a/examples/server/public/index.html b/examples/server/public/index.html index bf1d1b794..55639a944 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -200,23 +200,38 @@