diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index 77a9ddc14..8cc1480d3 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -26,8 +26,8 @@ COPY . . # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -# Enable cuBLAS -ENV LLAMA_CUBLAS=1 +# Enable CUDA +ENV LLAMA_CUDA=1 RUN make diff --git a/.devops/llama-cpp-cublas.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec similarity index 81% rename from .devops/llama-cpp-cublas.srpm.spec rename to .devops/llama-cpp-cuda.srpm.spec index f847ebb1e..66bdc871e 100644 --- a/.devops/llama-cpp-cublas.srpm.spec +++ b/.devops/llama-cpp-cuda.srpm.spec @@ -12,7 +12,7 @@ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries. # It is up to the user to install the correct vendor-specific support. -Name: llama.cpp-cublas +Name: llama.cpp-cuda Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) @@ -32,16 +32,16 @@ CPU inference for Meta's Lllama2 models using default options. %setup -n llama.cpp-master %build -make -j LLAMA_CUBLAS=1 +make -j LLAMA_CUDA=1 %install mkdir -p %{buildroot}%{_bindir}/ -cp -p main %{buildroot}%{_bindir}/llamacppcublas -cp -p server %{buildroot}%{_bindir}/llamacppcublasserver -cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple +cp -p main %{buildroot}%{_bindir}/llamacppcuda +cp -p server %{buildroot}%{_bindir}/llamacppcudaserver +cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple mkdir -p %{buildroot}/usr/lib/systemd/system -%{__cat} < %{buildroot}/usr/lib/systemd/system/llamacublas.service +%{__cat} < %{buildroot}/usr/lib/systemd/system/llamacuda.service [Unit] Description=Llama.cpp server, CPU only (no GPU support in this build). After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target @@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t [Service] Type=simple EnvironmentFile=/etc/sysconfig/llama -ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS +ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS ExecReload=/bin/kill -s HUP $MAINPID Restart=never @@ -67,10 +67,10 @@ rm -rf %{buildroot} rm -rf %{_builddir}/* %files -%{_bindir}/llamacppcublas -%{_bindir}/llamacppcublasserver -%{_bindir}/llamacppcublassimple -/usr/lib/systemd/system/llamacublas.service +%{_bindir}/llamacppcuda +%{_bindir}/llamacppcudaserver +%{_bindir}/llamacppcudasimple +/usr/lib/systemd/system/llamacuda.service %config /etc/sysconfig/llama %pre diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile index 2b7faf7c1..b937a4829 100644 --- a/.devops/main-cuda.Dockerfile +++ b/.devops/main-cuda.Dockerfile @@ -20,8 +20,8 @@ COPY . . # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -# Enable cuBLAS -ENV LLAMA_CUBLAS=1 +# Enable CUDA +ENV LLAMA_CUDA=1 RUN make diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 76d96e63c..b651f9e61 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -4,6 +4,7 @@ config, stdenv, mkShell, + runCommand, cmake, ninja, pkg-config, @@ -35,7 +36,8 @@ # It's necessary to consistently use backendStdenv when building with CUDA support, # otherwise we get libstdc++ errors downstream. effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv, - enableStatic ? effectiveStdenv.hostPlatform.isStatic + enableStatic ? effectiveStdenv.hostPlatform.isStatic, + precompileMetalShaders ? false }@inputs: let @@ -87,6 +89,11 @@ let ] ); + xcrunHost = runCommand "xcrunHost" {} '' + mkdir -p $out/bin + ln -s /usr/bin/xcrun $out/bin + ''; + # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64 # separately darwinBuildInputs = @@ -150,6 +157,8 @@ effectiveStdenv.mkDerivation ( postPatch = '' substituteInPlace ./ggml-metal.m \ --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" + substituteInPlace ./ggml-metal.m \ + --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";" # TODO: Package up each Python script or service appropriately. # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`, @@ -157,6 +166,14 @@ effectiveStdenv.mkDerivation ( substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python" ''; + # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015, + # `default.metallib` may be compiled with Metal compiler from XCode + # and we need to escape sandbox on MacOS to access Metal compiler. + # `xcrun` is used find the path of the Metal compiler, which is varible + # and not on $PATH + # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion + __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders; + nativeBuildInputs = [ cmake @@ -173,6 +190,8 @@ effectiveStdenv.mkDerivation ( ] ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static + ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ + xcrunHost ]; buildInputs = @@ -192,7 +211,7 @@ effectiveStdenv.mkDerivation ( (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) (cmakeBool "LLAMA_BLAS" useBlas) (cmakeBool "LLAMA_CLBLAST" useOpenCL) - (cmakeBool "LLAMA_CUBLAS" useCuda) + (cmakeBool "LLAMA_CUDA" useCuda) (cmakeBool "LLAMA_HIPBLAS" useRocm) (cmakeBool "LLAMA_METAL" useMetalKit) (cmakeBool "LLAMA_MPI" useMpi) @@ -217,7 +236,10 @@ effectiveStdenv.mkDerivation ( # Should likely use `rocmPackages.clr.gpuTargets`. "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" ] - ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]; + ++ optionals useMetalKit [ + (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") + (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders)) + ]; # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, # if they haven't been added yet. diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile index 4f83904bc..5683a3646 100644 --- a/.devops/server-cuda.Dockerfile +++ b/.devops/server-cuda.Dockerfile @@ -20,8 +20,8 @@ COPY . . # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -# Enable cuBLAS -ENV LLAMA_CUBLAS=1 +# Enable CUDA +ENV LLAMA_CUDA=1 RUN make diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0e7643bba..9329b94ee 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -728,13 +728,13 @@ jobs: path: | llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip - windows-latest-cmake-cublas: + windows-latest-cmake-cuda: runs-on: windows-latest strategy: matrix: cuda: ['12.2.0', '11.7.1'] - build: ['cublas'] + build: ['cuda'] steps: - name: Clone @@ -755,7 +755,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON + cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} - name: Determine tag name @@ -911,7 +911,7 @@ jobs: - macOS-latest-make - macOS-latest-cmake - windows-latest-cmake - - windows-latest-cmake-cublas + - windows-latest-cmake-cuda - macOS-latest-cmake-arm64 - macOS-latest-cmake-x64 diff --git a/CMakeLists.txt b/CMakeLists.txt index b25cfd2fc..ed1e77645 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,8 +89,8 @@ endif() option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_BLAS "llama: use BLAS" OFF) set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") -option(LLAMA_CUBLAS "llama: use CUDA" OFF) -#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) +option(LLAMA_CUDA "llama: use CUDA" OFF) +option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") @@ -360,11 +360,16 @@ if (LLAMA_QKK_64) endif() if (LLAMA_CUBLAS) + message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead") + set(LLAMA_CUDA ON) +endif() + +if (LLAMA_CUDA) cmake_minimum_required(VERSION 3.17) find_package(CUDAToolkit) if (CUDAToolkit_FOUND) - message(STATUS "cuBLAS found") + message(STATUS "CUDA found") enable_language(CUDA) @@ -373,7 +378,7 @@ if (LLAMA_CUBLAS) file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu") list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu") - add_compile_definitions(GGML_USE_CUBLAS) + add_compile_definitions(GGML_USE_CUDA) if (LLAMA_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV) endif() @@ -422,7 +427,7 @@ if (LLAMA_CUBLAS) message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") else() - message(WARNING "cuBLAS not found") + message(WARNING "CUDA not found") endif() endif() @@ -525,7 +530,7 @@ if (LLAMA_HIPBLAS) file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu") list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu") - add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) + add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA) if (LLAMA_HIP_UMA) add_compile_definitions(GGML_HIP_UMA) @@ -830,7 +835,7 @@ endif() set(CUDA_CXX_FLAGS "") -if (LLAMA_CUBLAS) +if (LLAMA_CUDA) set(CUDA_FLAGS -use_fast_math) if (LLAMA_FATAL_WARNINGS) @@ -1055,7 +1060,7 @@ endif() add_compile_options("$<$:${ARCH_FLAGS}>") add_compile_options("$<$:${ARCH_FLAGS}>") -if (LLAMA_CUBLAS) +if (LLAMA_CUDA) list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS}) list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") @@ -1260,6 +1265,12 @@ if (LLAMA_METAL) GROUP_READ WORLD_READ DESTINATION ${CMAKE_INSTALL_BINDIR}) + if (NOT LLAMA_METAL_EMBED_LIBRARY) + install( + FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + endif() endif() # diff --git a/Makefile b/Makefile index 08eafb1e7..1741151eb 100644 --- a/Makefile +++ b/Makefile @@ -390,12 +390,17 @@ ifdef LLAMA_BLIS endif # LLAMA_BLIS ifdef LLAMA_CUBLAS +# LLAMA_CUBLAS is deprecated and will be removed in the future + LLAMA_CUDA := 1 +endif + +ifdef LLAMA_CUDA ifneq ('', '$(wildcard /opt/cuda)') CUDA_PATH ?= /opt/cuda else CUDA_PATH ?= /usr/local/cuda endif - MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include + MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib OBJS += ggml-cuda.o OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) @@ -462,7 +467,7 @@ endif ifdef JETSON_EOL_MODULE_DETECT define NVCC_COMPILE - $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ + $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ endef # NVCC_COMPILE else define NVCC_COMPILE @@ -476,7 +481,7 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) $(NVCC_COMPILE) -endif # LLAMA_CUBLAS +endif # LLAMA_CUDA ifdef LLAMA_CLBLAST @@ -533,7 +538,7 @@ ifdef LLAMA_HIPBLAS LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_MMV_Y ?= 1 LLAMA_CUDA_KQUANTS_ITER ?= 2 - MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS + MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA ifdef LLAMA_HIP_UMA MK_CPPFLAGS += -DGGML_HIP_UMA endif # LLAMA_HIP_UMA @@ -609,7 +614,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS) override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) # identify CUDA host compiler -ifdef LLAMA_CUBLAS +ifdef LLAMA_CUDA GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler include scripts/get-flags.mk CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic @@ -634,7 +639,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS)) $(info I LDFLAGS: $(LDFLAGS)) $(info I CC: $(shell $(CC) --version | head -n 1)) $(info I CXX: $(shell $(CXX) --version | head -n 1)) -ifdef LLAMA_CUBLAS +ifdef LLAMA_CUDA $(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) @@ -644,9 +649,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp endif # CUDA_POWER_ARCH endif # CUDA_DOCKER_ARCH endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) -endif # LLAMA_CUBLAS +endif # LLAMA_CUDA $(info ) +ifdef LLAMA_CUBLAS +$(info !!!!) +$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.) +$(info !!!!) +$(info ) +endif + # # Build library # diff --git a/README.md b/README.md index f9cf19616..a56a60049 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Recent API changes +- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122 - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017 - [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796 @@ -448,30 +449,27 @@ Building the program with BLAS support may lead to some performance improvements Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information. -- #### cuBLAS +- #### CUDA - This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). + This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling. - Using `make`: ```bash - make LLAMA_CUBLAS=1 + make LLAMA_CUDA=1 ``` - Using `CMake`: ```bash mkdir build cd build - cmake .. -DLLAMA_CUBLAS=ON + cmake .. -DLLAMA_CUDA=ON cmake --build . --config Release ``` The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance: - | Option | Legal values | Default | Description | |--------------------------------|------------------------|---------|-------------| | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | @@ -633,6 +631,15 @@ Building the program with BLAS support may lead to some performance improvements - #### Vulkan +> [!WARNING] +> +> Vulkan support has been broken in https://github.com/ggerganov/llama.cpp/pull/6122 +> due to relying on `GGML_OP_GET_ROWS` which is not yet properly supported by the Vulkan backend, +> but should be fixed relatively soon (possibly in https://github.com/ggerganov/llama.cpp/pull/6155 +> (ref: https://github.com/ggerganov/llama.cpp/pull/6122#issuecomment-2015327635)). +> +> Meanwhile, if you want to use the Vulkan backend, you should use the commit right before the breaking change, https://github.com/ggerganov/llama.cpp/commit/55c1b2a3bbd470e9e2a3a0618b92cf64a885f806 + **With docker**: You don't need to install Vulkan SDK. It will be installed inside the container. diff --git a/ci/run.sh b/ci/run.sh index 51f4c74cc..85acc46d3 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -40,7 +40,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then fi if [ ! -z ${GG_BUILD_CUDA} ]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1" + CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1" fi if [ ! -z ${GG_BUILD_SYCL} ]; then @@ -412,8 +412,8 @@ function gg_run_open_llama_7b_v2 { set -e - (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../convert.py ${path_models} diff --git a/common/common.cpp b/common/common.cpp index 9dec08430..5fd33e2a1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -48,12 +48,12 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) -#define GGML_USE_CUBLAS_SYCL +#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) +#define GGML_USE_CUDA_SYCL #endif -#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) -#define GGML_USE_CUBLAS_SYCL_VULKAN +#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) +#define GGML_USE_CUDA_SYCL_VULKAN #endif #if defined(LLAMA_USE_CURL) @@ -861,9 +861,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } params.main_gpu = std::stoi(argv[i]); -#ifndef GGML_USE_CUBLAS_SYCL - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CUBLAS_SYCL +#ifndef GGML_USE_CUDA_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL return true; } if (arg == "--split-mode" || arg == "-sm") { @@ -889,9 +889,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } -#ifndef GGML_USE_CUBLAS_SYCL - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUBLAS_SYCL +#ifndef GGML_USE_CUDA_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL return true; } if (arg == "--tensor-split" || arg == "-ts") { @@ -917,9 +917,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.tensor_split[i] = 0.0f; } } -#ifndef GGML_USE_CUBLAS_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); -#endif // GGML_USE_CUBLAS_SYCL +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN return true; } if (arg == "--no-mmap") { @@ -2387,7 +2387,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); + fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false"); fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false"); fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false"); diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 723ea18e3..c5d2d0b78 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -331,7 +331,7 @@ class Model(ABC): tokenizer = SentencePieceProcessor(str(tokenizer_path)) vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - for token_id in range(vocab_size): + for token_id in range(tokenizer.vocab_size()): piece = tokenizer.id_to_piece(token_id) text = piece.encode("utf-8") score = tokenizer.get_score(token_id) @@ -356,9 +356,13 @@ class Model(ABC): added_tokens_json = json.load(f) for key in added_tokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + key = key.encode("utf-8") + if key not in tokens: + tokens.append(key) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + assert len(tokens) == vocab_size self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_token_list(tokens) diff --git a/docs/token_generation_performance_tips.md b/docs/token_generation_performance_tips.md index d7e863dff..3c4343147 100644 --- a/docs/token_generation_performance_tips.md +++ b/docs/token_generation_performance_tips.md @@ -1,7 +1,7 @@ # Token generation performance troubleshooting -## Verifying that the model is running on the GPU with cuBLAS -Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: +## Verifying that the model is running on the GPU with CUDA +Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: ```shell ./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " ``` diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index cbf9aa2b5..9aede7fad 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -61,6 +61,8 @@ int main(int argc, char ** argv) { } params.embedding = true; + // For non-causal models, batch size must be equal to ubatch size + params.n_ubatch = params.n_batch; print_build_info(); @@ -114,7 +116,9 @@ int main(int argc, char ** argv) { for (const auto & prompt : prompts) { auto inp = ::llama_tokenize(ctx, prompt, true, false); if (inp.size() > n_batch) { - inp.resize(n_batch); + fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + __func__, (long long int) inp.size(), (long long int) n_batch); + return 1; } inputs.push_back(inp); } diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 578e8fc27..458c01b87 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -22,7 +22,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument ## Example ```bash -LLAMA_CUBLAS=1 make -j +LLAMA_CUDA=1 make -j # generate importance matrix (imatrix.dat) ./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 264e73f4e..12d34462b 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -424,6 +424,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } + // TODO: use batch.logits to save computations instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 82413b79d..27e113203 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -113,7 +113,7 @@ static std::string get_cpu_info() { static std::string get_gpu_info() { std::string id; -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA int count = ggml_backend_cuda_get_device_count(); for (int i = 0; i < count; i++) { char buf[128]; @@ -808,7 +808,7 @@ struct test { const std::string test::build_commit = LLAMA_COMMIT; const int test::build_number = LLAMA_BUILD_NUMBER; -const bool test::cuda = !!ggml_cpu_has_cublas(); +const bool test::cuda = !!ggml_cpu_has_cuda(); const bool test::opencl = !!ggml_cpu_has_clblast(); const bool test::vulkan = !!ggml_cpu_has_vulkan(); const bool test::kompute = !!ggml_cpu_has_kompute(); diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 4d5fef020..b3b66331f 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -124,7 +124,7 @@ llama_print_timings: total time = 34570.79 ms ## Orin compile and run ### compile ```sh -make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 +make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 ``` ### run on Orin diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 48caafa87..40c976261 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -7,7 +7,7 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA #include "ggml-cuda.h" #endif @@ -968,7 +968,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } } -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA new_clip->backend = ggml_backend_cuda_init(0); printf("%s: CLIP using CUDA backend\n", __func__); #endif diff --git a/examples/main-cmake-pkg/README.md b/examples/main-cmake-pkg/README.md index 6d665f28f..f599fbaec 100644 --- a/examples/main-cmake-pkg/README.md +++ b/examples/main-cmake-pkg/README.md @@ -8,7 +8,7 @@ Because this example is "outside of the source tree", it is important to first b ### Considerations -When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_. +When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_. ### Build llama.cpp and install to C:\LlamaCPP directory diff --git a/examples/main/README.md b/examples/main/README.md index 6a8d1e1c5..9c83fd3bf 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -316,8 +316,8 @@ These options provide extra functionality and customization when running the LLa - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated. - `--verbose-prompt`: Print the prompt before generating text. -- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. -- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. +- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index a2ef0fb03..f66c91013 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -132,7 +132,6 @@ int main(int argc, char ** argv) { llama_context * ctx = NULL; // load the target model - params.logits_all = true; std::tie(model, ctx) = llama_init_from_gpt_params(params); // load the prompts from an external file if there are any diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index d766aef6a..c70385c62 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -380,6 +380,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const int batch_size = std::min(end - batch_start, n_batch); //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); + // TODO: use llama_batch.logits instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { //fprintf(stderr, "%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; @@ -552,6 +553,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); + int n_outputs = 0; + batch.n_tokens = 0; for (int seq = 0; seq < n_seq_batch; seq++) { int seq_start = batch_start + seq*n_ctx; @@ -566,11 +569,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par for (int k = 0; k < batch_size; ++k) { const int idx = seq*n_ctx + k; - batch.token[idx] = tokens[seq_start + k]; - batch.pos[idx] = j*n_batch + k; - batch.n_seq_id[idx] = 1; - batch.seq_id[idx][0] = seq; - batch.logits[idx] = batch.pos[idx] >= first ? 1 : 0; + batch.token [idx] = tokens[seq_start + k]; + batch.pos [idx] = j*n_batch + k; + batch.n_seq_id[idx] = 1; + batch.seq_id [idx][0] = seq; + batch.logits [idx] = batch.pos[idx] >= first ? 1 : 0; + + n_outputs += batch.logits[idx] != 0; } batch.n_tokens += batch_size; @@ -583,9 +588,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par return {tokens, -1, logit_history, prob_history}; } - if (num_batches > 1) { + if (num_batches > 1 && n_outputs > 0) { const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab); } } @@ -604,14 +609,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } for (int seq = 0; seq < n_seq_batch; seq++) { - const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx); + const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first); + llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first; if (!params.logits_file.empty()) { - process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, + process_logits(logits_stream, n_vocab, all_logits, tokens_data, n_ctx - 1 - first, workers, log_probs, nll, nll2); } else { - process_logits(n_vocab, all_logits + first*n_vocab, + process_logits(n_vocab, all_logits, tokens_data, n_ctx - 1 - first, workers, nll, nll2, logit_history.data() + start + seq*n_ctx + first, @@ -652,6 +658,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector & batch_logits, int32_t n_batch, int32_t n_vocab) { + int prev_outputs = 0; for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); @@ -672,7 +679,14 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< return false; } - memcpy(batch_logits.data() + i*n_vocab, llama_get_logits(ctx), n_tokens*n_vocab*sizeof(float)); + int n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + n_outputs += batch_view.logits[i] != 0; + } + + memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float)); + + prev_outputs += n_outputs; } return true; @@ -779,7 +793,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { size_t ending_logprob_count[4]; double ending_logprob[4]; - size_t i_batch; // starting index in the llama_batch + size_t i_logits; // starting index of logits in the llama_batch size_t common_prefix; // max number of initial tokens that are the same in all sentences size_t required_tokens; // needed number of tokens to evaluate all 4 endings std::vector seq_tokens[4]; @@ -844,9 +858,10 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); - llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); + llama_batch batch = llama_batch_init(n_ctx, 0, 4); std::vector tok_logits(n_vocab); + // TODO: this could be made smaller; it's currently the worst-case size std::vector batch_logits(n_vocab*n_ctx); std::vector> eval_pairs; @@ -857,16 +872,17 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { int n_cur = 0; size_t i1 = i0; - size_t i_batch = 0; // this tells us where in `llama_batch` we are currently + size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch llama_batch_clear(batch); // batch as much tasks as possible into the available context - // each task has 4 unique seuqnce ids - one for each ending + // each task has 4 unique sequence ids - one for each ending // the common prefix is shared among the 4 sequences to save tokens // we extract logits only from the last common token and from all ending tokens of each sequence while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) { auto & hs_cur = hs_data[i1]; + int n_logits = 0; const int s0 = 4*(i1 - i0); if (s0 + 4 > max_seq) { @@ -874,18 +890,23 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } for (size_t i = 0; i < hs_cur.common_prefix; ++i) { - llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false); + llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false); } batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix + n_logits += 1; for (int s = 0; s < 4; ++s) { - for (size_t i = hs_cur.common_prefix; i < hs_cur.seq_tokens[s].size(); ++i) { - llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, true); + const size_t seq_tokens_size = hs_cur.seq_tokens[s].size(); + // TODO: don't evaluate the last token of each sequence + for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) { + const bool needs_logits = i < seq_tokens_size - 1; + llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits); + n_logits += needs_logits; } } - hs_cur.i_batch = i_batch; - i_batch += hs_cur.required_tokens; + hs_cur.i_logits = i_logits; + i_logits += n_logits; n_cur += hs_data[i1].required_tokens; if (++i1 == hs_task_count) { @@ -911,12 +932,11 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { eval_pairs.clear(); for (size_t i = i0; i < i1; ++i) { auto & hs_cur = hs_data[i]; - size_t li = hs_cur.common_prefix; + size_t li = 1; // skip the last logit of the common prefix (computed separately below) for (int s = 0; s < 4; ++s) { for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) { - eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]); + eval_pairs.emplace_back(hs_cur.i_logits + li++, hs_cur.seq_tokens[s][j + 1]); } - ++li; } } // Then we do the actual calculation @@ -928,7 +948,8 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { for (size_t i = i0; i < i1; ++i) { auto & hs_cur = hs_data[i]; - std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(hs_cur.i_batch + hs_cur.common_prefix - 1), n_vocab*sizeof(float)); + // get the logits of the last token of the common prefix + std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float)); const auto first_probs = softmax(tok_logits); @@ -978,7 +999,7 @@ struct winogrande_entry { std::array choices; int answer; - size_t i_batch; + size_t i_logits; size_t common_prefix; size_t required_tokens; size_t n_base1; // number of tokens for context + choice 1 @@ -1104,6 +1125,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { task.common_prefix++; } + // TODO: the last token of each of the sequences don't need to be evaluated task.required_tokens = task.common_prefix + task.seq_tokens[0].size() - task.common_prefix + task.seq_tokens[1].size() - task.common_prefix; @@ -1121,9 +1143,10 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { const int max_tasks_per_batch = 128; const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); - llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); + llama_batch batch = llama_batch_init(n_ctx, 0, 2); std::vector tok_logits(n_vocab); + // TODO: this could be made smaller; it's currently the worst-case size std::vector batch_logits(n_vocab*n_ctx); std::vector> eval_pairs; @@ -1137,29 +1160,33 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { int n_cur = 0; size_t i1 = i0; - size_t i_batch = 0; + size_t i_logits = 0; llama_batch_clear(batch); while (n_cur + (int) data[i1].required_tokens <= n_ctx) { + int n_logits = 0; const int s0 = 2*(i1 - i0); if (s0 + 2 > max_seq) { break; } for (size_t i = 0; i < data[i1].common_prefix; ++i) { - llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1}, false); + llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false); } batch.logits[batch.n_tokens - 1] = true; + n_logits += 1; for (int s = 0; s < 2; ++s) { + // TODO: end before the last token, no need to predict past the end of the sequences for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) { llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true); + n_logits += 1; } } - data[i1].i_batch = i_batch; - i_batch += data[i1].required_tokens; + data[i1].i_logits = i_logits; + i_logits += n_logits; n_cur += data[i1].required_tokens; if (++i1 == data.size()) { @@ -1190,15 +1217,16 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix; const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0; - size_t li = n_base1 - 1; + size_t li = n_base1 - task.common_prefix; for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) { - eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]); + eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[0][j+1]); } const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix; const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0; - li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1; + // FIXME: this uses the wrong first logits when not skipping the choice word + li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - task.common_prefix; for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) { - eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]); + eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[1][j+1]); } } compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results); @@ -1287,7 +1315,7 @@ struct multiple_choice_task { } // For evaluation - size_t i_batch; // starting index in the llama_batch + size_t i_logits; // starting index of logits in the llama_batch size_t common_prefix; // max number of initial tokens that are the same in all sentences size_t required_tokens; // needed number of tokens to evaluate all answers std::vector> seq_tokens; @@ -1366,7 +1394,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params std::vector task_pos(n_task); strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t)); if (strstream.fail()) { - printf("%s: failed to raad task positions from prompt\n", __func__); + printf("%s: failed to read task positions from prompt\n", __func__); return; } @@ -1447,7 +1475,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params return; } } else { - int n_dot = n_task/100; + int n_dot = std::max((int) n_task/100, 1); int i_task = 0; for (auto& task : tasks) { ++i_task; @@ -1491,17 +1519,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params int n_cur = 0; size_t i1 = i0; - size_t i_batch = 0; // this tells us where in `llama_batch` we are currently + size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch llama_batch_clear(batch); // batch as much tasks as possible into the available context - // each task has 4 unique seuqnce ids - one for each ending + // each task has 4 unique sequence ids - one for each ending // the common prefix is shared among the 4 sequences to save tokens // we extract logits only from the last common token and from all ending tokens of each sequence int s0 = 0; while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) { auto& cur_task = tasks[i1]; + int n_logits = 0; int num_answers = cur_task.seq_tokens.size(); if (s0 + num_answers > max_seq) { @@ -1518,17 +1547,22 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false); } batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix + n_logits += 1; for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) { - for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) { - llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true); + const size_t seq_tokens_size = cur_task.seq_tokens[s].size(); + // TODO: don't evaluate the last token of each sequence + for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) { + const bool needs_logits = i < seq_tokens_size - 1; + llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits); + n_logits += needs_logits; } } s0 += num_answers; - cur_task.i_batch = i_batch; - i_batch += cur_task.required_tokens; + cur_task.i_logits = i_logits; + i_logits += n_logits; n_cur += cur_task.required_tokens; if (++i1 == tasks.size()) { @@ -1554,12 +1588,11 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params eval_pairs.clear(); for (size_t i = i0; i < i1; ++i) { auto& cur_task = tasks[i]; - size_t li = cur_task.common_prefix; + size_t li = 1; // skip the last logit of the common prefix (computed separately below) for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) { for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { - eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]); + eval_pairs.emplace_back(cur_task.i_logits + li++, cur_task.seq_tokens[s][j + 1]); } - ++li; } } // Then we do the actual calculation @@ -1578,7 +1611,8 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params //} //printf("\n common_prefix: %zu\n", cur_task.common_prefix); - std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float)); + // get the logits of the last token of the common prefix + std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float)); const auto first_probs = softmax(tok_logits); @@ -1730,6 +1764,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } + // TODO: use llama_batch.logits instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { fprintf(stderr, "%s : failed to eval\n", __func__); return; diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index ec9848d8b..fb6ff3eb7 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -27,6 +27,7 @@ static const std::vector QUANT_OPTIONS = { { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "IQ1_XS", LLAMA_FTYPE_MOSTLY_IQ1_XS, " 1.6-1.7 bpw quantization mix", }, + { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", }, @@ -88,13 +89,17 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n"); printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); + printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); + printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); + printf(" --override-kv KEY=TYPE:VALUE\n"); + printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n"); printf("\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { @@ -108,14 +113,14 @@ static void usage(const char * executable) { exit(1); } -static void load_imatrix(const std::string& imatrix_file, std::unordered_map>& imatrix_data) { +static void load_imatrix(const std::string & imatrix_file, std::unordered_map> & imatrix_data) { std::ifstream in(imatrix_file.c_str(), std::ios::binary); if (!in) { - printf("%s: failed to open %s\n",__func__,imatrix_file.c_str()); + printf("%s: failed to open %s\n",__func__, imatrix_file.c_str()); return; } int n_entries; - in.read((char*)&n_entries, sizeof(n_entries)); + in.read((char *)&n_entries, sizeof(n_entries)); if (in.fail() || n_entries < 1) { printf("%s: no data in file %s\n", __func__, imatrix_file.c_str()); return; @@ -125,25 +130,25 @@ static void load_imatrix(const std::string& imatrix_file, std::unordered_map name_as_vec(len+1); in.read((char *)name_as_vec.data(), len); if (in.fail()) { - printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str()); + printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str()); return; } name_as_vec[len] = 0; std::string name{name_as_vec.data()}; - auto& e = imatrix_data[std::move(name)]; + auto & e = imatrix_data[std::move(name)]; int ncall; - in.read((char*)&ncall, sizeof(ncall)); + in.read((char *)&ncall, sizeof(ncall)); int nval; in.read((char *)&nval, sizeof(nval)); if (in.fail() || nval < 1) { - printf("%s: failed reading number of values for entry %d\n",__func__,i); + printf("%s: failed reading number of values for entry %d\n", __func__, i); imatrix_data = {}; return; } e.resize(nval); - in.read((char*)e.data(), nval*sizeof(float)); + in.read((char *)e.data(), nval*sizeof(float)); if (in.fail()) { - printf("%s: failed reading data for entry %d\n",__func__,i); + printf("%s: failed reading data for entry %d\n", __func__, i); imatrix_data = {}; return; } @@ -151,13 +156,13 @@ static void load_imatrix(const std::string& imatrix_file, std::unordered_map& included_weights, - const std::vector& excluded_weights, - std::unordered_map>& imatrix_data) { +static void prepare_imatrix(const std::string & imatrix_file, + const std::vector & included_weights, + const std::vector & excluded_weights, + std::unordered_map> & imatrix_data) { if (!imatrix_file.empty()) { load_imatrix(imatrix_file, imatrix_data); } @@ -202,6 +207,43 @@ static ggml_type parse_ggml_type(const char * arg) { return result; } +static bool parse_kv_override(const char * data, std::vector & overrides) { + const char* sep = strchr(data, '='); + if (sep == nullptr || sep - data >= 128) { + fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); + return false; + } + llama_model_kv_override kvo; + std::strncpy(kvo.key, data, sep - data); + kvo.key[sep - data] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.int_value = std::atol(sep); + } else if (strncmp(sep, "float:", 6) == 0) { + sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.float_value = std::atof(sep); + } else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.bool_value = true; + } else if (std::strcmp(sep, "false") == 0) { + kvo.bool_value = false; + } else { + fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); + return false; + } + } else { + fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); + return false; + } + overrides.emplace_back(std::move(kvo)); + return true; +} + int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -212,6 +254,7 @@ int main(int argc, char ** argv) { int arg_idx = 1; std::string imatrix_file; std::vector included_weights, excluded_weights; + std::vector kv_overrides; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -228,6 +271,10 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--override-kv") == 0) { + if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { params.allow_requantize = true; } else if (strcmp(argv[arg_idx], "--pure") == 0) { @@ -268,6 +315,11 @@ int main(int argc, char ** argv) { if (!imatrix_data.empty()) { params.imatrix = &imatrix_data; } + if (!kv_overrides.empty()) { + kv_overrides.emplace_back(); + kv_overrides.back().key[0] = 0; + params.kv_overrides = &kv_overrides; + } llama_backend_init(); @@ -289,8 +341,7 @@ int main(int argc, char ** argv) { if (ftype_str == "COPY") { params.only_copy = true; } - } - else { + } else { fname_out = argv[arg_idx]; arg_idx++; @@ -321,10 +372,12 @@ int main(int argc, char ** argv) { if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || - params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) { - fprintf(stderr, "\n===============================================================================================\n"); - fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); - fprintf(stderr, "===============================================================================================\n\n\n"); + params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || + params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) { + fprintf(stderr, "\n==========================================================================================================\n"); + fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); + fprintf(stderr, "==========================================================================================================\n\n\n"); return 1; } diff --git a/examples/server/README.md b/examples/server/README.md index 49121a460..aadc73b4b 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -25,9 +25,9 @@ The project is under active development, and we are [looking for feedback and co - `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused). - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. -- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. -- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. +- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`. - `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`. - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 338e60f28..53ad9239e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -99,6 +99,7 @@ struct slot_params { uint32_t seed = -1; // RNG seed int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half int32_t n_predict = -1; // new tokens to predict std::vector antiprompt; @@ -746,7 +747,8 @@ struct server_context { { const int32_t n_batch = llama_n_batch(ctx); - batch = llama_batch_init(n_batch, 0, params.n_parallel); + // only a single seq_id per token is needed + batch = llama_batch_init(n_batch, 0, 1); } metrics.init(); @@ -846,6 +848,7 @@ struct server_context { slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep); + slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard); slot.params.seed = json_value(data, "seed", default_params.seed); slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); @@ -1253,6 +1256,7 @@ struct server_context { {"stop", slot.params.antiprompt}, {"n_predict", slot.params.n_predict}, // TODO: fix duplicate key n_predict {"n_keep", slot.params.n_keep}, + {"n_discard", slot.params.n_discard}, {"ignore_eos", ignore_eos}, {"stream", slot.params.stream}, {"logit_bias", slot.sparams.logit_bias}, @@ -1696,7 +1700,7 @@ struct server_context { // Shift context const int n_keep = slot.params.n_keep + add_bos_token; const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; - const int n_discard = n_left / 2; + const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2); LOG_INFO("slot context shift", { {"id_slot", slot.id}, @@ -2510,15 +2514,15 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, invalid_param = true; break; } -#ifndef GGML_USE_CUBLAS - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUBLAS +#ifndef GGML_USE_CUDA + fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUDA } else if (arg == "--tensor-split" || arg == "-ts") { if (++i >= argc) { invalid_param = true; break; } -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL) +#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL) std::string arg_next = argv[i]; // split string by , and / @@ -2535,17 +2539,17 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, } } #else - LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {}); -#endif // GGML_USE_CUBLAS + LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {}); +#endif // GGML_USE_CUDA } else if (arg == "--main-gpu" || arg == "-mg") { if (++i >= argc) { invalid_param = true; break; } -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL) +#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL) params.main_gpu = std::stoi(argv[i]); #else - LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {}); + LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {}); #endif } else if (arg == "--lora") { if (++i >= argc) { diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 8b31b678a..6e0815b36 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -65,7 +65,6 @@ int main(int argc, char ** argv) { llama_context * ctx_dft = NULL; // load the target model - params.logits_all = true; std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); // load the draft model diff --git a/ggml-backend.c b/ggml-backend.c index 6026570ae..402d86ef3 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) { ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL); // add forward decls here to avoid including the backend headers -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA extern GGML_CALL void ggml_backend_cuda_reg_devices(void); ggml_backend_cuda_reg_devices(); #endif diff --git a/ggml-common.h b/ggml-common.h index 0257c928c..517c9bb43 100644 --- a/ggml-common.h +++ b/ggml-common.h @@ -377,6 +377,20 @@ typedef struct { } block_iq1_s; static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); +// 1.8125 bpw +typedef struct { + uint8_t qs[QK_K/8]; // grid index, low 8 bits + uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8) + uint8_t scales[QK_K/32]; // 4-bit block scales +} block_iq1_m; +static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding"); + +// Used by IQ1_M quants +typedef union { + ggml_half f16; + uint16_t u16; +} iq1m_scale_t; + // Non-linear quants #define QK4_NL 32 typedef struct { @@ -1050,6 +1064,7 @@ GGML_TABLE_END() #define NGRID_IQ1S 2048 #define IQ1S_DELTA 0.125f +#define IQ1M_DELTA 0.125f #if defined(GGML_COMMON_IMPL_C) GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S) 0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff, diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 4f50c9f9f..be8e33a56 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -615,6 +615,7 @@ static int64_t get_row_rounding(ggml_type type, const std::arrayn_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } @@ -2560,7 +2562,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons ggml_type a_type = a->type; if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S || - a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { + a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { if (b->ne[1] == 1 && ggml_nrows(b) > 1) { return false; } diff --git a/ggml-cuda/convert.cu b/ggml-cuda/convert.cu index 2516ecddd..18a31edc3 100644 --- a/ggml-cuda/convert.cu +++ b/ggml-cuda/convert.cu @@ -373,7 +373,7 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); #else - assert(false); + NO_DEVICE_CODE; #endif } @@ -395,7 +395,7 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); #else - assert(false); + NO_DEVICE_CODE; #endif } @@ -416,7 +416,7 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_ const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); #else - assert(false); + NO_DEVICE_CODE; #endif } @@ -444,7 +444,7 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); } #else - assert(false); + NO_DEVICE_CODE; #endif } @@ -470,7 +470,7 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_ y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); } #else - assert(false); + NO_DEVICE_CODE; #endif } @@ -496,11 +496,42 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_ y[j] = d * (q[j] + delta); } #else - assert(false); + NO_DEVICE_CODE; #endif } +template +static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_iq1_m * x = (const block_iq1_m *) vx; + + const int tid = threadIdx.x; +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * sc = (const uint16_t *)x[i].scales; + iq1m_scale_t scale; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4); + const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1); + const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA; + uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)]; + grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; + grid32[0] &= 0x0f0f0f0f; + for (int j = 0; j < 8; ++j) { + y[j] = d * (q[j] + delta); + } +#else + NO_DEVICE_CODE; +#endif + +} + + template static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) { @@ -658,6 +689,12 @@ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, dequantize_block_iq4_nl<<>>(vx, y); } +template +static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq1_m<<>>(vx, y); +} + template static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = (k + QK_K - 1) / QK_K; @@ -724,6 +761,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_iq3_xxs_cuda; case GGML_TYPE_IQ1_S: return dequantize_row_iq1_s_cuda; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_cuda; case GGML_TYPE_IQ4_NL: return dequantize_row_iq4_nl_cuda; case GGML_TYPE_IQ4_XS: @@ -769,6 +808,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq3_xxs_cuda; case GGML_TYPE_IQ1_S: return dequantize_row_iq1_s_cuda; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_cuda; case GGML_TYPE_IQ4_NL: return dequantize_row_iq4_nl_cuda; case GGML_TYPE_IQ4_XS: diff --git a/ggml-cuda/mmvq.cu b/ggml-cuda/mmvq.cu index 8b2d7a7ff..396559001 100644 --- a/ggml-cuda/mmvq.cu +++ b/ggml-cuda/mmvq.cu @@ -282,6 +282,14 @@ static void mul_mat_vec_iq1_s_q8_1_cuda( (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } +static void mul_mat_vec_iq1_m_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + static void mul_mat_vec_iq4_nl_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { @@ -373,6 +381,9 @@ void ggml_cuda_op_mul_mat_vec_q( case GGML_TYPE_IQ1_S: mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; + case GGML_TYPE_IQ1_M: + mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; case GGML_TYPE_IQ4_NL: mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; diff --git a/ggml-cuda/vecdotq.cuh b/ggml-cuda/vecdotq.cuh index d911d851d..86b87fa93 100644 --- a/ggml-cuda/vecdotq.cuh +++ b/ggml-cuda/vecdotq.cuh @@ -961,8 +961,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( return d * (sumi1 + sumi2); #endif #else - assert(false); - return 0.f; + NO_DEVICE_CODE; #endif } @@ -1001,13 +1000,11 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); #else GGML_UNUSED(ksigns64); - assert(false); - return 0.f; + NO_DEVICE_CODE; #endif #else GGML_UNUSED(ksigns64); - assert(false); - return 0.f; + NO_DEVICE_CODE; #endif } @@ -1049,13 +1046,11 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); #else GGML_UNUSED(ksigns64); - assert(false); - return 0.f; + NO_DEVICE_CODE; #endif #else GGML_UNUSED(ksigns64); - assert(false); - return 0.f; + NO_DEVICE_CODE; #endif } @@ -1085,12 +1080,10 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f; return d * sumi; #else - assert(false); - return 0.f; + NO_DEVICE_CODE; #endif #else - assert(false); - return 0.f; + NO_DEVICE_CODE; #endif } @@ -1119,12 +1112,10 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds); return d * sumi; #else - assert(false); - return 0.f; + NO_DEVICE_CODE; #endif #else - assert(false); - return 0.f; + NO_DEVICE_CODE; #endif } @@ -1159,8 +1150,50 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( const float m = d1q * __high2float(bq8_1[ib32].ds); return d * sumi + m * delta; #else - assert(false); - return 0.f; + NO_DEVICE_CODE; +#endif +} + +static __device__ __forceinline__ float vec_dot_iq1_m_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if QK_K == 256 + const block_iq1_m * bq1 = (const block_iq1_m *) vbq; + + const int ib32 = iqs; + int sumi[2] = {0, 0}; + float sumf[2] = {0.f, 0.f}; +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const int * q8 = (const int *)bq8_1[ib32].qs; + for (int l = 0; l < 4; ++l) { + const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8))); + int grid0 = grid[0] & 0x0f0f0f0f; + int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; + sumi[l/2] = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi[l/2])); + const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA; + const int sumy = __dp4a(q8[2*l+1], 0x01010101, __dp4a(q8[2*l+0], 0x01010101, 0)); + sumf[l/2] += delta*sumy; + } +#else + const int8_t * q8 = bq8_1[ib32].qs; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); + int sumy = 0; + for (int j = 0; j < 4; ++j) { + sumi[l/2] += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4); + sumy += q8[j] + q8[j+4]; + } + const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA; + sumf[l/2] += delta*sumy; + q8 += 8; + } +#endif + iq1m_scale_t scale; + const uint16_t * sc = (const uint16_t *)bq1->scales; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds); + return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1)); +#else + NO_DEVICE_CODE; #endif } @@ -1223,27 +1256,6 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; const uint8_t * values = (const uint8_t *)kvalues_iq4nl; - //// iqs is 0...7 - //const int ib64 = iqs/2; - //const int il = iqs%2; - //const int32_t * q8_1 = (const int *)bq8_1[2*ib64+0].qs + 2*il; - //const int32_t * q8_2 = (const int *)bq8_1[2*ib64+1].qs + 2*il; - //const uint32_t * q4_1 = (const uint32_t *)bq4->qs + 8*ib64 + 2*il; - //const uint32_t * q4_2 = q4_1 + 4; - //const int8_t ls1 = (bq4->scales_l[ib64] & 0xf) | (((bq4->scales_h >> (4*ib64+0)) & 3) << 4); - //const int8_t ls2 = (bq4->scales_l[ib64] >> 4) | (((bq4->scales_h >> (4*ib64+2)) & 3) << 4); - //const float d1 = (float)bq4->d * (ls1 - 32) * __low2float(bq8_1[2*ib64+0].ds); - //const float d2 = (float)bq4->d * (ls2 - 32) * __low2float(bq8_1[2*ib64+1].ds); - //int v1, v2; - //int sumi1 = 0, sumi2 = 0; - //for (int j = 0; j < 2; ++j) { - // get_int_from_table_16(q4_1[j], values, v1, v2); - // sumi1 = __dp4a(v2, q8_1[j+4], __dp4a(v1, q8_1[j+0], sumi1)); - // get_int_from_table_16(q4_2[j], values, v1, v2); - // sumi2 = __dp4a(v2, q8_2[j+4], __dp4a(v1, q8_2[j+0], sumi2)); - //} - //return d1 * sumi1 + d2 * sumi2; - // iqs is 0...7 const int ib32 = iqs; const int32_t * q8 = (const int *)bq8_1[ib32].qs; @@ -1259,24 +1271,8 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( } return d * (sumi1 + sumi2); - //// iqs is 0...15 - //const int ib32 = iqs/2; - //const int il = iqs%2; - //const int32_t * q8 = (const int *)bq8_1[ib32].qs + 2*il; - //const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32 + 2*il; - //const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4); - //const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds); - //int v1, v2; - //int sumi1 = 0, sumi2 = 0; - //for (int j = 0; j < 2; ++j) { - // get_int_from_table_16(q4[j], values, v1, v2); - // sumi1 = __dp4a(v1, q8[j+0], sumi1); - // sumi2 = __dp4a(v2, q8[j+4], sumi2); - //} - //return d * (sumi1 + sumi2); #else - assert(false); - return 0.f; + NO_DEVICE_CODE; #endif #else return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs); diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 81dd50678..407062e6f 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1430,6 +1430,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml struct ggml_tensor * dst = gf->nodes[i]; GGML_ASSERT(dst->data != nullptr); + if (ggml_is_empty(dst)) { + continue; + } + switch (dst->op) { case GGML_OP_NONE: case GGML_OP_RESHAPE: diff --git a/ggml-metal.m b/ggml-metal.m index 416b24532..a08abbc29 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -64,6 +64,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, + GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, @@ -91,6 +92,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, @@ -114,6 +116,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, @@ -134,6 +137,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, @@ -154,6 +158,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_ROPE_F32, @@ -490,6 +495,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S, get_rows_iq2_s, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M, get_rows_iq1_m, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, get_rows_iq4_xs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); @@ -517,6 +523,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, mul_mv_iq1_m_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, mul_mv_iq4_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); @@ -540,6 +547,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, mul_mv_id_iq1_m_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); @@ -560,6 +568,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); @@ -580,6 +589,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); @@ -837,6 +847,10 @@ static enum ggml_status ggml_metal_graph_compute( struct ggml_tensor * src2 = gf->nodes[i]->src[2]; struct ggml_tensor * dst = gf->nodes[i]; + if (ggml_is_empty(dst)) { + continue; + } + switch (dst->op) { case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -1421,6 +1435,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break; case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); @@ -1575,6 +1590,12 @@ static enum ggml_status ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline; } break; + case GGML_TYPE_IQ1_M: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline; + } break; case GGML_TYPE_IQ4_NL: { nth0 = 4; @@ -1619,9 +1640,9 @@ static enum ggml_status ggml_metal_graph_compute( [encoder setBytes:&r2 length:sizeof(r2) atIndex:17]; [encoder setBytes:&r3 length:sizeof(r3) atIndex:18]; - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || - src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || - src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ2_S) { + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || + src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || + src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { @@ -1743,6 +1764,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break; case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); @@ -1900,6 +1922,12 @@ static enum ggml_status ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline; } break; + case GGML_TYPE_IQ1_M: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32].pipeline; + } break; case GGML_TYPE_IQ4_NL: { nth0 = 4; @@ -1960,9 +1988,9 @@ static enum ggml_status ggml_metal_graph_compute( [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j]; } - if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 || - src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 || - src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ2_S) { + if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 || src2t == GGML_TYPE_Q5_0 || + src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 || src2t == GGML_TYPE_Q2_K || + src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ1_M || src2t == GGML_TYPE_IQ2_S) { [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) { @@ -2024,6 +2052,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break; case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S ].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 748f0acef..e8083734c 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -4456,6 +4456,104 @@ void kernel_mul_mv_iq1_s_f32_impl( } } +void kernel_mul_mv_iq1_m_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_iq1_m * x = (device const block_iq1_m *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float yl[32]; + float sumf[N_DST]={0.f}, all_sum; + + const int nb32 = nb * (QK_K / 32); + + const int ix = tiisg; + + device const float * y4 = y + 32 * ix; + + iq1m_scale_t scale; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + + float4 sumy = {0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; + yl[i+ 8] = y4[i+ 8]; sumy[1] += yl[i+ 8]; + yl[i+16] = y4[i+16]; sumy[2] += yl[i+16]; + yl[i+24] = y4[i+24]; sumy[3] += yl[i+24]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq1_m * xr = x + ibl; + device const uint8_t * qs = xr->qs + 4 * ib; + device const uint8_t * qh = xr->qh + 2 * ib; + device const uint16_t * sc = (device const uint16_t *)xr->scales; + + for (int row = 0; row < N_DST; row++) { + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); + constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[1] << 8) & 0x700))); + constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[1] << 4) & 0x700))); + + float2 sum = {0.f}; + for (int j = 0; j < 4; ++j) { + sum[0] += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4) + + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4); + sum[1] += yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4) + + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4); + } + const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) + + (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1)); + + sc += nb*sizeof(block_iq1_m)/2; + qs += nb*sizeof(block_iq1_m); + qh += nb*sizeof(block_iq1_m); + } + + y4 += 32 * 32; + } + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; + } + } +} + void kernel_mul_mv_iq4_nl_f32_impl( device const void * src0, device const float * src1, @@ -4673,6 +4771,34 @@ kernel void kernel_mul_mv_iq1_s_f32( kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); } +[[host_name("kernel_mul_mv_iq1_m_f32")]] +kernel void kernel_mul_mv_iq1_m_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq1_m_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); +} + [[host_name("kernel_mul_mv_iq4_nl_f32")]] kernel void kernel_mul_mv_iq4_nl_f32( device const void * src0, @@ -5146,6 +5272,30 @@ void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & } } +template +void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + il = il%2; + iq1m_scale_t scale; + device const uint16_t * sc = (device const uint16_t *)xb->scales; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const float d = scale.f16; + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; + device const uint8_t * qh = xb->qh + 2*ib32 + il; + const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1); + const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * (grid1[i] & 0xf) + ml1; + reg[1][i] = dl * (grid1[i] >> 4) + ml1; + reg[2][i] = dl * (grid2[i] & 0xf) + ml2; + reg[3][i] = dl * (grid2[i] >> 4) + ml2; + } +} + template void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) { device const uint16_t * q4 = (device const uint16_t *)xb->qs; @@ -5730,6 +5880,7 @@ template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_r template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows; #if QK_K == 64 template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows; @@ -5778,6 +5929,7 @@ template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_m template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; #if QK_K == 64 template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm; @@ -5838,6 +5990,7 @@ template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; #if QK_K == 64 template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; @@ -7005,6 +7158,69 @@ kernel void kernel_mul_mv_id_iq1_s_f32( sgitg); } +[[host_name("kernel_mul_mv_id_iq1_m_f32")]] +kernel void kernel_mul_mv_id_iq1_m_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq1_m_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + tgpig, + tiisg, + sgitg); +} + [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel void kernel_mul_mv_id_iq4_nl_f32( device const char * ids, diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index aa73d67df..b3f8b7eaf 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -2234,6 +2234,11 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { for (int i = 0; i < graph->n_nodes; ++i) { ggml_tensor * node = graph->nodes[i]; + + if (ggml_is_empty(node)) { + continue; + } + switch (node->op) { case GGML_OP_MUL_MAT: ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0); diff --git a/ggml-quants.c b/ggml-quants.c index f26798acc..f717e616e 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3474,6 +3474,54 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in } } +void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + float delta[4]; + uint16_t idx[4]; + + iq1m_scale_t scale; + + for (int i = 0; i < nb; i++) { + + const uint16_t * sc = (const uint16_t *)x[i].scales; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const float d = GGML_FP16_TO_FP32(scale.f16); + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + + for (int ib = 0; ib < QK_K/32; ++ib) { + const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1); + const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1); + idx[0] = qs[0] | ((qh[0] << 8) & 0x700); + idx[1] = qs[1] | ((qh[0] << 4) & 0x700); + idx[2] = qs[2] | ((qh[1] << 8) & 0x700); + idx[3] = qs[3] | ((qh[1] << 4) & 0x700); + delta[0] = qh[0] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA; + delta[1] = qh[0] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA; + delta[2] = qh[1] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA; + delta[3] = qh[1] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA; + for (int l = 0; l < 2; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]); + for (int j = 0; j < 8; ++j) { + y[j] = dl1 * (grid[j] + delta[l]); + } + y += 8; + } + for (int l = 2; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]); + for (int j = 0; j < 8; ++j) { + y[j] = dl2 * (grid[j] + delta[l]); + } + y += 8; + } + qs += 4; + qh += 2; + } + } +} + static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) { @@ -9695,6 +9743,206 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void #endif } +void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_m * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + + iq1m_scale_t scale; + +#if defined __ARM_NEON + + const int32x4_t mask = vdupq_n_s32(0x7); + const int32x4_t mone = vdupq_n_s32(1); + const int32x4_t mzero = vdupq_n_s32(0); + + ggml_int8x16x4_t deltas; + deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1)); + deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1)); + deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1)); + deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1)); + + ggml_int8x16x4_t q1b; + ggml_int8x16x4_t q8b; + + uint32_t aux32; + const uint8_t * aux8 = (const uint8_t *)&aux32; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + int32x4_t sumi1 = mzero; + int32x4_t sumi2 = mzero; + + for (int ib = 0; ib < QK_K/32; ib += 2) { + + q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700))))); + q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700))))); + q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700))))); + q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700))))); + + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + + const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1])); + const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3])); + const int32x4_t p12 = vpaddq_s32(p1, p2); + + const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that + aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202); + + const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1])); + const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3])); + const int32x4_t p34 = vpaddq_s32(p3, p4); + + int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9); + scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone); + + sumi1 = vmlaq_s32(sumi1, scales_4, p12); + sumi2 = vmlaq_s32(sumi2, scales_4, p34); + + qs += 8; qh += 4; + + } + + sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2)); + } + + *s = sumf; + +#elif defined __AVX2__ + + const __m256i mask = _mm256_set1_epi16(0x7); + const __m256i mone = _mm256_set1_epi16(1); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m256i q1b_1 = _mm256_set_epi64x( + iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)], + iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)] + ); + const __m256i q1b_2 = _mm256_set_epi64x( + iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)], + iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)] + ); + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); + const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); + + const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, + qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, + qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + + const __m256i dot3 = mul_add_epi8(delta1, q8b_1); + const __m256i dot4 = mul_add_epi8(delta2, q8b_2); + __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0)); + __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6)); + scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone); + scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone); + const __m256i p1 = _mm256_madd_epi16(dot1, scale1); + const __m256i p2 = _mm256_madd_epi16(dot2, scale2); + const __m256i p3 = _mm256_madd_epi16(dot3, scale1); + const __m256i p4 = _mm256_madd_epi16(dot4, scale2); + + sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4)); + + qs += 8; qh += 4; + } + + const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16)); + accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1); + accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2); + + } + + *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); + +#else + + int sum1[2], sum2[2], delta[4]; + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + + int sumi1 = 0, sumi2 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + delta[0] = qh[0] & 0x08 ? -1 : 1; + delta[1] = qh[0] & 0x80 ? -1 : 1; + delta[2] = qh[1] & 0x08 ? -1 : 1; + delta[3] = qh[1] & 0x80 ? -1 : 1; + sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700))); + int lsum1 = 0, lsum2 = 0; + for (int j = 0; j < 8; ++j) { + lsum1 += q8[j] * grid[j]; + lsum2 += q8[j]; + } + q8 += 8; + sum1[l/2] += lsum1; + sum2[l/2] += lsum2*delta[l]; + } + const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1; + const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1; + sumi1 += sum1[0] * ls1 + sum1[1] * ls2; + sumi2 += sum2[0] * ls1 + sum2[1] * ls2; + qs += 4; + qh += 2; + } + + sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); + } + + *s = sumf; + +#endif +} + void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -9938,17 +10186,17 @@ static iq2_entry_t iq2_data[4] = { }; static inline int iq2_data_index(enum ggml_type type) { - GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S); + GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S); return type == GGML_TYPE_IQ2_XXS ? 0 : type == GGML_TYPE_IQ2_XS ? 1 : - type == GGML_TYPE_IQ1_S ? 2 : 3; + type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 2 : 3; } static inline int iq2_grid_size(enum ggml_type type) { - GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S); + GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S); return type == GGML_TYPE_IQ2_XXS ? 256 : type == GGML_TYPE_IQ2_XS ? 512 : - type == GGML_TYPE_IQ1_S ? NGRID_IQ1S : 1024; + type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? NGRID_IQ1S : 1024; } static int iq2_compare_func(const void * left, const void * right) { @@ -10214,10 +10462,10 @@ void iq2xs_init_impl(enum ggml_type type) { const int kmap_size = 43692; //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2; - const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2; + const int nwant = type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2; const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 : type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 : - type == GGML_TYPE_IQ1_S ? kgrid_1bit_2048 : kgrid_2bit_1024; + type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? kgrid_1bit_2048 : kgrid_2bit_1024; uint64_t * kgrid_q2xs; int * kmap_q2xs; uint16_t * kneighbors_q2xs; @@ -10314,7 +10562,7 @@ void iq2xs_init_impl(enum ggml_type type) { } void iq2xs_free_impl(enum ggml_type type) { - GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S); + GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S); const int gindex = iq2_data_index(type); if (iq2_data[gindex].grid) { free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL; @@ -11520,7 +11768,16 @@ static int iq1_sort_helper(const void * left, const void * right) { } #define IQ1S_BLOCK_SIZE 32 -static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { +#define IQ1M_BLOCK_SIZE 16 +static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights, + float * scales, + float * weight, + float * sumx, + float * sumw, + float * pairs, + int8_t * L, + uint16_t * index, + int8_t * shifts) { const int gindex = iq2_data_index(GGML_TYPE_IQ1_S); @@ -11534,22 +11791,17 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?"); GGML_ASSERT(n%QK_K == 0); + block_iq1_s * y = vy; + const int nbl = n/QK_K; - block_iq1_s * y = vy; + const int block_size = IQ1S_BLOCK_SIZE; const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA}; const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA}; - float scales[QK_K/IQ1S_BLOCK_SIZE]; - float weight[IQ1S_BLOCK_SIZE]; - int8_t L[IQ1S_BLOCK_SIZE]; - float sumx[IQ1S_BLOCK_SIZE+1]; - float sumw[IQ1S_BLOCK_SIZE+1]; - float pairs[2*IQ1S_BLOCK_SIZE]; + int * idx = (int *)(pairs + 1); - uint16_t index[IQ1S_BLOCK_SIZE/8]; - int8_t shifts[QK_K/IQ1S_BLOCK_SIZE]; for (int ibl = 0; ibl < nbl; ++ibl) { @@ -11564,15 +11816,15 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; float sigma2 = 2*sumx2/QK_K; - for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) { - const float * xb = xbl + IQ1S_BLOCK_SIZE*ib; - const float * qw = quant_weights + QK_K*ibl + IQ1S_BLOCK_SIZE*ib; - for (int i = 0; i < IQ1S_BLOCK_SIZE; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + for (int ib = 0; ib < QK_K/block_size; ++ib) { + const float * xb = xbl + block_size*ib; + const float * qw = quant_weights + QK_K*ibl + block_size*ib; + for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); float max = fabsf(xb[0]); - for (int i = 1; i < IQ1S_BLOCK_SIZE; ++i) max = MAX(max, fabsf(xb[i])); + for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); if (!max) { scales[ib] = 0; - memset(L, 1, IQ1S_BLOCK_SIZE); + memset(L, 1, block_size); continue; } // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. @@ -11581,14 +11833,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale // for each possible and score for each split. - for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) { + for (int j = 0; j < block_size; ++j) { pairs[2*j] = xb[j]; idx[2*j] = j; } - qsort(pairs, IQ1S_BLOCK_SIZE, 2*sizeof(float), iq1_sort_helper); + qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); { sumx[0] = sumw[0] = 0; - for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) { + for (int j = 0; j < block_size; ++j) { int i = idx[2*j]; sumx[j+1] = sumx[j] + weight[i]*xb[i]; sumw[j+1] = sumw[j] + weight[i]; @@ -11596,16 +11848,16 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } float best_score = 0, scale = max; int besti1 = -1, besti2 = -1, best_shift = 0; - for (int i1 = 0; i1 <= IQ1S_BLOCK_SIZE; ++i1) { - for (int i2 = i1; i2 <= IQ1S_BLOCK_SIZE; ++i2) { - float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_p[2]; - float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_p[2]*x_p[2]; + for (int i1 = 0; i1 <= block_size; ++i1) { + for (int i2 = i1; i2 <= block_size; ++i2) { + float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2]; + float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2]; if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { scale = sumqx/sumq2; best_score = scale*sumqx; besti1 = i1; besti2 = i2; best_shift = 1; } - sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_m[2]; - sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_m[2]*x_m[2]; + sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2]; + sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2]; if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { scale = sumqx/sumq2; best_score = scale*sumqx; besti1 = i1; besti2 = i2; best_shift = -1; @@ -11615,14 +11867,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0); for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; - for (int j = besti2; j < IQ1S_BLOCK_SIZE; ++j) L[idx[2*j]] = 2; + for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2; if (scale < 0) { - for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) L[j] = 2 - L[j]; + for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j]; scale = -scale; best_shift = -best_shift; } bool all_on_grid = true; const float * xx = best_shift == 1 ? x_p : x_m; - for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) { + for (int k = 0; k < block_size/8; ++k) { uint16_t u = 0; for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); int grid_index = kmap_q2xs[u]; @@ -11636,7 +11888,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } if (!all_on_grid) { float sumqx = 0, sumq2 = 0; - for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) { + for (int k = 0; k < block_size/8; ++k) { const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]); for (int j = 0; j < 8; ++j) { float w = weight[8*k + j]; @@ -11648,8 +11900,8 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2; } uint16_t h = 0; - for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) { - y[ibl].qs[(IQ1S_BLOCK_SIZE/8)*ib + k] = index[k] & 255; + for (int k = 0; k < block_size/8; ++k) { + y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255; h |= (index[k] >> 8) << 3*k; } y[ibl].qh[ib] = h; @@ -11660,14 +11912,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } if (!max_scale) { - memset(y[ibl].qs, 0, QK_K/8); continue; } float d = max_scale/15; - y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.085f is another fudge factor. Don't ask me why it is needed. + y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed. float id = 1/d; - for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) { + for (int ib = 0; ib < QK_K/block_size; ++ib) { int l = nearest_int(0.5f*(id*scales[ib]-1)); l = MAX(0, MIN(7, l)); if (shifts[ib] == -1) l |= 8; @@ -11678,16 +11929,292 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); + float scales[QK_K/IQ1S_BLOCK_SIZE]; + float weight[IQ1S_BLOCK_SIZE]; + int8_t L[IQ1S_BLOCK_SIZE]; + float sumx[IQ1S_BLOCK_SIZE+1]; + float sumw[IQ1S_BLOCK_SIZE+1]; + float pairs[2*IQ1S_BLOCK_SIZE]; + uint16_t index[IQ1S_BLOCK_SIZE/8]; + int8_t shifts[QK_K/IQ1S_BLOCK_SIZE]; int nblock = n_per_row/QK_K; char * qrow = (char *)dst; for (int row = 0; row < nrow; ++row) { - quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights); + quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts); src += n_per_row; qrow += nblock*sizeof(block_iq1_s); } return nrow * nblock * sizeof(block_iq1_s); } +static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights, + float * scales, + float * weight, + float * pairs, + int8_t * L, + uint16_t * index, + int8_t * shifts) { + + const int gindex = iq2_data_index(GGML_TYPE_IQ1_M); + + const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; + const int * kmap_q2xs = iq2_data[gindex].map; + const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours; + + //GGML_ASSERT(quant_weights && "missing quantization weights"); + GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(n%QK_K == 0); + + block_iq1_m * y = vy; + + const int nbl = n/QK_K; + + const int block_size = IQ1M_BLOCK_SIZE; + + const float x_p[3] = {-1 + IQ1M_DELTA, IQ1M_DELTA, 1 + IQ1M_DELTA}; + const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA}; + const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88}; + + int * idx = (int *)(pairs + 1); + + float sumqx[4], sumq2[4]; + + iq1m_scale_t s; + const float * xx; + + for (int ibl = 0; ibl < nbl; ++ibl) { + + //y[ibl].d = GGML_FP32_TO_FP16(0.f); + memset(y[ibl].qs, 0, QK_K/8); + memset(y[ibl].qh, 0, QK_K/16); + memset(y[ibl].scales, 0, QK_K/32); + + float max_scale = 0; + + const float * xbl = x + QK_K*ibl; + float sumx2 = 0; + for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; + float sigma2 = 2*sumx2/QK_K; + + for (int ib = 0; ib < QK_K/block_size; ++ib) { + const float * xb = xbl + block_size*ib; + if (quant_weights) { + const float * qw = quant_weights + QK_K*ibl + block_size*ib; + for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + } else { + for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i]; + } + float max = fabsf(xb[0]); + for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); + if (!max) { + scales[ib] = 0; + memset(L, 1, block_size); + continue; + } + // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. + // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two + // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights + // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and + // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale + // for each possible and score for each split. + for (int j = 0; j < block_size; ++j) { + pairs[2*j] = xb[j]; + idx[2*j] = j; + } + qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); + float best_score = 0, scale = max; + int besti1 = -1, besti2 = -1, best_k = -1; + // 0: +, + + // 1: +, - + // 2: -, + + // 3: -, - + for (int i1 = 0; i1 <= block_size; ++i1) { + for (int i2 = i1; i2 <= block_size; ++i2) { + memset(sumqx, 0, 4*sizeof(float)); + memset(sumq2, 0, 4*sizeof(float)); + for (int j = 0; j < i1; ++j) { + int i = idx[2*j]; + if (i < block_size/2) { + sumqx[0] += weight[i]*x_p[0]*xb[i]; + sumqx[1] += weight[i]*x_p[0]*xb[i]; + sumqx[2] += weight[i]*x_m[0]*xb[i]; + sumqx[3] += weight[i]*x_m[0]*xb[i]; + sumq2[0] += weight[i]*x_p[0]*x_p[0]; + sumq2[1] += weight[i]*x_p[0]*x_p[0]; + sumq2[2] += weight[i]*x_m[0]*x_m[0]; + sumq2[3] += weight[i]*x_m[0]*x_m[0]; + } else { + sumqx[0] += weight[i]*x_p[0]*xb[i]; + sumqx[2] += weight[i]*x_p[0]*xb[i]; + sumqx[1] += weight[i]*x_m[0]*xb[i]; + sumqx[3] += weight[i]*x_m[0]*xb[i]; + sumq2[0] += weight[i]*x_p[0]*x_p[0]; + sumq2[2] += weight[i]*x_p[0]*x_p[0]; + sumq2[1] += weight[i]*x_m[0]*x_m[0]; + sumq2[3] += weight[i]*x_m[0]*x_m[0]; + } + } + for (int j = i1; j < i2; ++j) { + int i = idx[2*j]; + if (i < block_size/2) { + sumqx[0] += weight[i]*x_p[1]*xb[i]; + sumqx[1] += weight[i]*x_p[1]*xb[i]; + sumqx[2] += weight[i]*x_m[1]*xb[i]; + sumqx[3] += weight[i]*x_m[1]*xb[i]; + sumq2[0] += weight[i]*x_p[1]*x_p[1]; + sumq2[1] += weight[i]*x_p[1]*x_p[1]; + sumq2[2] += weight[i]*x_m[1]*x_m[1]; + sumq2[3] += weight[i]*x_m[1]*x_m[1]; + } else { + sumqx[0] += weight[i]*x_p[1]*xb[i]; + sumqx[2] += weight[i]*x_p[1]*xb[i]; + sumqx[1] += weight[i]*x_m[1]*xb[i]; + sumqx[3] += weight[i]*x_m[1]*xb[i]; + sumq2[0] += weight[i]*x_p[1]*x_p[1]; + sumq2[2] += weight[i]*x_p[1]*x_p[1]; + sumq2[1] += weight[i]*x_m[1]*x_m[1]; + sumq2[3] += weight[i]*x_m[1]*x_m[1]; + } + } + for (int j = i2; j < block_size; ++j) { + int i = idx[2*j]; + if (i < block_size/2) { + sumqx[0] += weight[i]*x_p[2]*xb[i]; + sumqx[1] += weight[i]*x_p[2]*xb[i]; + sumqx[2] += weight[i]*x_m[2]*xb[i]; + sumqx[3] += weight[i]*x_m[2]*xb[i]; + sumq2[0] += weight[i]*x_p[2]*x_p[2]; + sumq2[1] += weight[i]*x_p[2]*x_p[2]; + sumq2[2] += weight[i]*x_m[2]*x_m[2]; + sumq2[3] += weight[i]*x_m[2]*x_m[2]; + } else { + sumqx[0] += weight[i]*x_p[2]*xb[i]; + sumqx[2] += weight[i]*x_p[2]*xb[i]; + sumqx[1] += weight[i]*x_m[2]*xb[i]; + sumqx[3] += weight[i]*x_m[2]*xb[i]; + sumq2[0] += weight[i]*x_p[2]*x_p[2]; + sumq2[2] += weight[i]*x_p[2]*x_p[2]; + sumq2[1] += weight[i]*x_m[2]*x_m[2]; + sumq2[3] += weight[i]*x_m[2]*x_m[2]; + } + } + for (int k = 0; k < 4; ++k) { + if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) { + scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k]; + besti1 = i1; besti2 = i2; best_k = k; + } + } + } + } + GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0); + for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; + for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; + for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2; + if (scale < 0) { + for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j]; + scale = -scale; + best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0; + } + bool all_on_grid = true; + for (int k = 0; k < block_size/8; ++k) { + if (k == 0) xx = best_k < 2 ? x_p : x_m; + else xx = best_k%2 == 0 ? x_p : x_m; + uint16_t u = 0; + for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + all_on_grid = false; + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S); + GGML_ASSERT(grid_index >= 0); + } + index[k] = grid_index; + } + if (!all_on_grid) { + float sumqx_f = 0, sumq2_f = 0; + for (int k = 0; k < block_size/8; ++k) { + if (k == 0) xx = best_k < 2 ? x_p : x_m; + else xx = best_k%2 == 0 ? x_p : x_m; + const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]); + for (int j = 0; j < 8; ++j) { + float w = weight[8*k + j]; + float q = xx[(pg[j] - 1)/2]; + sumqx_f += w*q*xb[8*k+j]; + sumq2_f += w*q*q; + } + } + if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f; + } + y[ibl].qs[2*ib + 0] = index[0] & 255; + y[ibl].qs[2*ib + 1] = index[1] & 255; + y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4); + GGML_ASSERT(scale >= 0); + scales[ib] = scale; + shifts[ib] = best_k; + max_scale = MAX(max_scale, scale); + } + + if (!max_scale) { + continue; + } + + uint16_t * sc = (uint16_t *)y[ibl].scales; + float d = max_scale/15; + float id = 1/d; + float sumqx_f = 0, sumq2_f = 0; + for (int ib = 0; ib < QK_K/block_size; ++ib) { + int l = nearest_int(0.5f*(id*scales[ib+0]-1)); + l = MAX(0, MIN(7, l)); + sc[ib/4] |= (l << 3*(ib%4)); + y[ibl].qh[ib] |= masks[shifts[ib]]; + const float * xb = xbl + block_size*ib; + if (quant_weights) { + const float * qw = quant_weights + QK_K*ibl + block_size*ib; + for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + } else { + for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i]; + } + for (int k = 0; k < block_size/8; ++k) { + if (k == 0) xx = shifts[ib] < 2 ? x_p : x_m; + else xx = shifts[ib]%2 == 0 ? x_p : x_m; + const int8_t * pg = (const int8_t *)(kgrid_q2xs + y[ibl].qs[2*ib+k] + ((y[ibl].qh[ib] << (8 - 4*k)) & 0x700)); + for (int j = 0; j < 8; ++j) { + float w = weight[8*k + j]; + float q = xx[(pg[j] - 1)/2]*(2*l+1); + sumqx_f += w*q*xb[8*k+j]; + sumq2_f += w*q*q; + } + } + } + if (sumq2_f > 0) d = sumqx_f/sumq2_f; + s.f16 = GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed. + sc[0] |= ((s.u16 & 0x000f) << 12); + sc[1] |= ((s.u16 & 0x00f0) << 8); + sc[2] |= ((s.u16 & 0x0f00) << 4); + sc[3] |= ((s.u16 & 0xf000) << 0); + } +} + +size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { + GGML_ASSERT(n_per_row%QK_K == 0); + float scales[QK_K/IQ1M_BLOCK_SIZE]; + float weight[IQ1M_BLOCK_SIZE]; + int8_t L[IQ1M_BLOCK_SIZE]; + float pairs[2*IQ1M_BLOCK_SIZE]; + uint16_t index[IQ1M_BLOCK_SIZE/8]; + int8_t shifts[QK_K/IQ1M_BLOCK_SIZE]; + int nblock = n_per_row/QK_K; + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts); + src += n_per_row; + qrow += nblock*sizeof(block_iq1_m); + } + return nrow * nblock * sizeof(block_iq1_m); +} + // ============================ 4-bit non-linear quants static inline int best_index_int8(int n, const int8_t * val, float x) { diff --git a/ggml-quants.h b/ggml-quants.h index aa7e54a16..ac1091c3d 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -72,6 +72,7 @@ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_ void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); @@ -94,6 +95,7 @@ void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -104,6 +106,7 @@ size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT ds size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index fc4d2964c..789ba97bf 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -16973,7 +16973,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } #ifndef NDEBUG diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index cbceaa19f..521a1314b 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -5566,7 +5566,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } diff --git a/ggml.c b/ggml.c index 203a9e540..eb469d0f7 100644 --- a/ggml.c +++ b/ggml.c @@ -794,6 +794,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_IQ1_M] = { + .type_name = "iq1_m", + .blck_size = QK_K, + .type_size = sizeof(block_iq1_m), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq1_m, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = ggml_vec_dot_iq1_m_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, [GGML_TYPE_IQ4_NL] = { .type_name = "iq4_nl", .blck_size = QK4_NL, @@ -2539,6 +2551,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break; case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break; case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break; + case GGML_FTYPE_MOSTLY_IQ1_M: wtype = GGML_TYPE_IQ1_M; break; case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break; case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; @@ -2594,6 +2607,16 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } +GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) { + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + if (tensor->ne[i] == 0) { + // empty if any dimension has no elements + return true; + } + } + return false; +} + bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -2608,7 +2631,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return + return ggml_is_empty(t0) ? ggml_is_empty(t1) : (t1->ne[0]%t0->ne[0] == 0) && (t1->ne[1]%t0->ne[1] == 0) && (t1->ne[2]%t0->ne[2] == 0) && @@ -8135,6 +8158,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -8417,6 +8441,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -8544,6 +8569,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -11447,6 +11473,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -11638,6 +11665,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -11861,6 +11889,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -12564,6 +12593,7 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -12652,6 +12682,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -16093,7 +16124,7 @@ static void ggml_compute_forward_cross_entropy_loss_back( static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); - if (tensor->op == GGML_OP_NONE) { + if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) { return; } @@ -17962,6 +17993,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) { int n_tasks = 0; + if (ggml_is_empty(node)) { + // no need to multi-thread a no-op + n_tasks = 1; + return n_tasks; + } + switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: @@ -20306,7 +20343,8 @@ void ggml_quantize_init(enum ggml_type type) { case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break; + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break; case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break; default: // nothing @@ -20331,7 +20369,8 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) { return type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || - type == GGML_TYPE_IQ1_S; + type == GGML_TYPE_IQ1_S;// || + //type == GGML_TYPE_IQ1_M; } size_t ggml_quantize_chunk( @@ -20375,6 +20414,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; #if QK_K == 64 case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; @@ -21674,15 +21714,15 @@ int ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL) return 1; #else return 0; #endif } -int ggml_cpu_has_cublas(void) { -#if defined(GGML_USE_CUBLAS) +int ggml_cpu_has_cuda(void) { +#if defined(GGML_USE_CUDA) return 1; #else return 0; @@ -21722,7 +21762,7 @@ int ggml_cpu_has_sycl(void) { } int ggml_cpu_has_gpublas(void) { - return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || + return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl(); } diff --git a/ggml.h b/ggml.h index 0a5af7205..5d4a4ceb6 100644 --- a/ggml.h +++ b/ggml.h @@ -369,6 +369,7 @@ extern "C" { GGML_TYPE_I32 = 26, GGML_TYPE_I64 = 27, GGML_TYPE_F64 = 28, + GGML_TYPE_IQ1_M = 29, GGML_TYPE_COUNT, }; @@ -408,6 +409,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors }; // available tensor operations: @@ -748,6 +750,7 @@ extern "C" { GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor); GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor); GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor); GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); @@ -2354,7 +2357,7 @@ extern "C" { GGML_API int ggml_cpu_has_fp16_va (void); GGML_API int ggml_cpu_has_wasm_simd (void); GGML_API int ggml_cpu_has_blas (void); - GGML_API int ggml_cpu_has_cublas (void); + GGML_API int ggml_cpu_has_cuda (void); GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_vulkan (void); GGML_API int ggml_cpu_has_kompute (void); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e47896e2a..4ab026482 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -706,6 +706,7 @@ class GGMLQuantizationType(IntEnum): I32 = 26 I64 = 27 F64 = 28 + IQ1_M = 29 class GGUFEndian(IntEnum): diff --git a/llama.cpp b/llama.cpp index c8fa46496..30dae82ab 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7,7 +7,7 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA # include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) # include "ggml-opencl.h" @@ -1505,7 +1505,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) { ggml_backend_buffer_type_t buft = nullptr; -#if defined(GGML_USE_CUBLAS) +#if defined(GGML_USE_CUDA) // host buffers should only be used when data is expected to be copied to/from the GPU if (host_buffer) { buft = ggml_backend_cuda_host_buffer_type(); @@ -1535,7 +1535,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { #ifdef GGML_USE_METAL buft = ggml_backend_metal_buffer_type(); -#elif defined(GGML_USE_CUBLAS) +#elif defined(GGML_USE_CUDA) buft = ggml_backend_cuda_buffer_type(gpu); #elif defined(GGML_USE_VULKAN) buft = ggml_backend_vk_buffer_type(gpu); @@ -1561,7 +1561,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) { ggml_backend_buffer_type_t buft = nullptr; -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA if (ggml_backend_cuda_get_device_count() > 1) { buft = ggml_backend_cuda_split_buffer_type(tensor_split); } @@ -1582,7 +1582,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g } static size_t llama_get_device_count() { -#if defined(GGML_USE_CUBLAS) +#if defined(GGML_USE_CUDA) return ggml_backend_cuda_get_device_count(); #elif defined(GGML_USE_SYCL) return ggml_backend_sycl_get_device_count(); @@ -1594,7 +1594,7 @@ static size_t llama_get_device_count() { } static size_t llama_get_device_memory(int device) { -#if defined(GGML_USE_CUBLAS) +#if defined(GGML_USE_CUDA) size_t total; size_t free; ggml_backend_cuda_get_device_memory(device, &total, &free); @@ -1777,6 +1777,7 @@ struct llama_cparams { uint32_t n_ctx; // context size used during inference uint32_t n_batch; uint32_t n_ubatch; + uint32_t n_seq_max; uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing @@ -2080,7 +2081,7 @@ struct llama_model { ggml_free(ctx); } for (ggml_backend_buffer_t buf : bufs) { -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) { ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf)); } @@ -2139,20 +2140,20 @@ struct llama_context { // host buffer for the model output (logits and embeddings) ggml_backend_buffer_t buf_output = nullptr; - // decode output (2-dimensional array: [n_tokens][n_vocab]) - size_t logits_size = 0; - float * logits = nullptr; + // decode output (2-dimensional array: [n_outputs][n_vocab]) + size_t logits_size = 0; // capacity (of floats) for logits + float * logits = nullptr; + + std::vector output_ids; // map batch token positions to ids of the logits and embd buffers + size_t output_size = 0; // capacity (of tokens positions) for the output buffers + int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch -#ifndef NDEBUG - // guard against access to unset logits - std::vector logits_valid; -#endif bool logits_all = false; - // embeddings output (2-dimensional array: [n_tokens][n_embd]) + // embeddings output (2-dimensional array: [n_outputs][n_embd]) // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE - size_t embd_size = 0; - float * embd = nullptr; + size_t embd_size = 0; // capacity (of floats) for embeddings + float * embd = nullptr; // sequence embeddings output (map of [n_embd] vectors) // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE @@ -2169,14 +2170,15 @@ struct llama_context { struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch] + struct ggml_tensor * inp_out_ids; // I32 [n_outputs] struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_KQ_pos; // F32 [kv_size] + struct ggml_tensor * inp_KQ_pos; // F32 [n_kv] struct ggml_tensor * inp_K_shift; // I32 [kv_size] struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] struct ggml_tensor * inp_s_copy; // I32 [kv_size] - struct ggml_tensor * inp_s_mask; // F32 [1, kv_size] - struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch] + struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] + struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch] // control vectors struct llama_control_vector cvec; @@ -3018,6 +3020,7 @@ struct llama_model_loader { case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break; case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; + case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; @@ -3412,8 +3415,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ1_XS :return "IQ1_S mix - 1.6-1.7 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_XS: return "IQ1_S mix - 1.6-1.7 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; @@ -5270,7 +5274,7 @@ static bool llm_load_tensors( } model.bufs.push_back(buf); bufs.emplace(idx, buf); -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA if (n_layer >= n_gpu_layers) { ggml_backend_cuda_register_host_buffer( ggml_backend_buffer_get_base(buf), @@ -5845,7 +5849,8 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) + const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) + const int32_t n_outputs; const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_orig_ctx; @@ -5892,6 +5897,7 @@ struct llm_build_context { norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (batch.n_tokens), n_kv (worst_case ? kv_self.size : kv_self.n), + n_outputs (worst_case ? n_tokens : lctx.n_outputs), kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), pooling_type (cparams.pooling_type), @@ -5913,6 +5919,7 @@ struct llm_build_context { lctx.inp_tokens = nullptr; lctx.inp_embd = nullptr; lctx.inp_pos = nullptr; + lctx.inp_out_ids = nullptr; lctx.inp_KQ_mask = nullptr; lctx.inp_KQ_pos = nullptr; lctx.inp_K_shift = nullptr; @@ -6036,6 +6043,13 @@ struct llm_build_context { return lctx.inp_pos; } + struct ggml_tensor * build_inp_out_ids() { + lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); + cb(lctx.inp_out_ids, "inp_out_ids", -1); + ggml_set_input(lctx.inp_out_ids); + return lctx.inp_out_ids; + } + struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { if (causal) { lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens); @@ -6092,6 +6106,9 @@ struct llm_build_context { struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6159,6 +6176,14 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -6338,6 +6363,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -6453,6 +6485,14 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids); + } + struct ggml_tensor * ffn_inp = cur; // feed forward @@ -6496,6 +6536,9 @@ struct llm_build_context { struct ggml_cgraph * build_grok() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6567,6 +6610,14 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + // Grok // if attn_out_norm is present then apply it before adding the input if (model.layers[il].attn_out_norm) { @@ -6744,6 +6795,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + // add the input struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -6941,6 +6999,13 @@ struct llm_build_context { Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); cb(ffn_inp, "ffn_inp", il); @@ -7030,6 +7095,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -7187,6 +7259,13 @@ struct llm_build_context { } cb(cur, "kqv_out", il); + if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + // re-add the layer input cur = ggml_add(ctx0, cur, inpL); @@ -7309,6 +7388,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + // Add the input struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -7407,6 +7493,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + // Add the input struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -7520,6 +7613,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -7626,6 +7726,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -7738,6 +7845,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -7856,6 +7970,14 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids); + } + // FF { ffn_output = llm_build_ffn(ctx0, attn_norm_output, @@ -7953,6 +8075,14 @@ struct llm_build_context { cur = attention_norm; + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + // feed-forward network { cur = llm_build_ffn(ctx0, cur, @@ -8045,6 +8175,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + // add the input struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -8145,6 +8282,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + // add the input struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -8254,6 +8398,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -8364,6 +8515,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -8487,6 +8645,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + // scale_res - scale the hidden states for residual connection const float scale_res = scale_depth/sqrtf(float(n_layer)); cur = ggml_scale(ctx0, cur, scale_res); @@ -8601,6 +8766,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); cb(sa_out, "sa_out", il); @@ -8713,6 +8885,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -8860,6 +9039,15 @@ struct llm_build_context { struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0); + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + x = ggml_get_rows(ctx0, x, inp_out_ids); + y = ggml_get_rows(ctx0, y, inp_out_ids); + z = ggml_get_rows(ctx0, z, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens} y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); y = ggml_mul(ctx0, y, ggml_silu(ctx0, z)); @@ -8962,6 +9150,13 @@ struct llm_build_context { Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + struct ggml_tensor * attn_out = cur; // feed-forward network @@ -9259,9 +9454,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); } + if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs"); + const int64_t n_tokens = batch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer)); + int32_t * data = (int32_t *) lctx.inp_out_ids->data; + + if (lctx.n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (batch.logits) { + int32_t n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + if (batch.logits[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to have been passed the correct number of outputs + GGML_ASSERT(lctx.n_outputs == n_outputs); + } else if (lctx.n_outputs == 1) { + // only keep last output + data[0] = n_tokens - 1; + } else { + GGML_ASSERT(lctx.n_outputs == 0); + } + } + GGML_ASSERT( + // (!a || b) is a logical implication (a -> b) + // !hparams.causal_attn -> !cparams.causal_attn (hparams.causal_attn || !cparams.causal_attn) && - "non-causal attention with generative models is not supported" + "causal attention with embedding models is not supported" ); if (lctx.inp_KQ_mask) { @@ -9440,6 +9665,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } +// Make sure enough space is available for outputs. +// Returns max number of outputs for which space was reserved. +static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { + const auto & cparams = lctx.cparams; + const auto & hparams = lctx.model.hparams; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = hparams.n_vocab; + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = cparams.causal_attn; + const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; + const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (lctx.output_ids.empty()) { + // init, never resized afterwards + lctx.output_ids.resize(n_batch); + } + + const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!lctx.buf_output || prev_size < new_size) { + if (lctx.buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + ggml_backend_buffer_free(lctx.buf_output); + lctx.buf_output = nullptr; + lctx.logits = nullptr; + lctx.embd = nullptr; + } + + lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size); + if (lctx.buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output); + + lctx.logits = has_logits ? output_base : nullptr; + lctx.embd = has_embd ? output_base + logits_size : nullptr; + + lctx.output_size = n_outputs_max; + lctx.logits_size = logits_size; + lctx.embd_size = embd_size; + + // set all ids as invalid (negative) + std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); + + ggml_backend_buffer_clear(lctx.buf_output, 0); + + lctx.n_outputs = 0; + + return n_outputs_max; +} + + static void llama_graph_compute( llama_context & lctx, ggml_cgraph * gf, @@ -9515,16 +9808,8 @@ static int llama_decode_internal( const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; - - auto * logits_out = lctx.logits; - -#ifndef NDEBUG - auto & logits_valid = lctx.logits_valid; - logits_valid.clear(); - logits_valid.resize(n_tokens_all); - - memset(logits_out, 0, lctx.logits_size*sizeof(float)); -#endif + uint32_t n_outputs = 0; + uint32_t n_outputs_prev = 0; const auto n_ubatch = cparams.n_ubatch; @@ -9533,6 +9818,38 @@ static int llama_decode_internal( std::vector seq_id_arr; std::vector> seq_id; + // count outputs + if (batch_all.logits) { + for (uint32_t i = 0; i < n_tokens_all; ++i) { + n_outputs += batch_all.logits[i] != 0; + } + } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) { + n_outputs = n_tokens_all; + } else { + // keep last output only + n_outputs = 1; + } + + // reserve output buffer + if (llama_output_reserve(lctx, n_outputs) < n_outputs) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs); + return -2; + }; + + // set output mappings + if (batch_all.logits) { + int32_t i_logits = 0; + for (uint32_t i = 0; i < n_tokens_all; ++i) { + if (batch_all.logits[i]) { + lctx.output_ids[i] = i_logits++; + } + } + } else { + for (uint32_t i = 0; i < n_outputs; ++i) { + lctx.output_ids[i] = i; + } + } + for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) { const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token); llama_batch u_batch = { @@ -9548,6 +9865,27 @@ static int llama_decode_internal( /* .all_seq_id = */ batch_all.all_seq_id, }; + // count the outputs in this u_batch + { + int32_t n_outputs_new = 0; + + if (u_batch.logits) { + for (uint32_t i = 0; i < n_tokens; i++) { + n_outputs_new += u_batch.logits[i] != 0; + } + } else if (n_outputs == n_tokens_all) { + n_outputs_new = n_tokens; + } else { + // keep last output only + if (cur_token + n_tokens >= n_tokens_all) { + n_outputs_new = 1; + } + } + + // needs to happen before the graph is built + lctx.n_outputs = n_outputs_new; + } + int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; GGML_ASSERT(n_threads > 0); @@ -9611,23 +9949,37 @@ static int llama_decode_internal( struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2]; - if (!hparams.causal_attn) { + if (lctx.n_outputs == 0) { + // no output + res = nullptr; + embd = nullptr; + } else if (!hparams.causal_attn) { res = nullptr; // do not extract logits for embedding models such as BERT // token or sequence embeddings embd = gf->nodes[gf->n_nodes - 1]; GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0); - } else { - if (strcmp(res->name, "result_output") == 0) { - // the token embeddings could be the second to last tensor, or the third to last tensor - if (strcmp(embd->name, "result_norm") != 0) { - embd = gf->nodes[gf->n_nodes - 3]; - GGML_ASSERT(strcmp(embd->name, "result_norm") == 0); - } - } else { - GGML_ASSERT(false && "missing result_output tensor"); + } else if (cparams.embeddings) { + // the embeddings could be in the second to last tensor, or any of the previous tensors + int i_embd = gf->n_nodes - 2; + for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) { + i_embd = gf->n_nodes - i; + if (i_embd < 0) { break; } + embd = gf->nodes[i_embd]; } + GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor"); + + // TODO: use a per-batch flag to know when to skip logits while keeping embeddings + if (!cparams.causal_attn) { + res = nullptr; // do not extract logits when not needed + // skip computing logits + // TODO: is this safe? + gf->n_nodes = i_embd + 1; + } + } else { + embd = nullptr; // do not extract embeddings when not needed + GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -9670,50 +10022,23 @@ static int llama_decode_internal( //} // extract logits - // TODO: do not compute and extract logits if only embeddings are needed - // update the graphs to skip "result_output" if logits are not needed if (res) { ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res); GGML_ASSERT(backend_res != nullptr); - if (u_batch.logits) { - int32_t i_first = -1; - for (uint32_t i = 0; i < n_tokens; i++) { - if (u_batch.logits[i] && i_first == -1) { - i_first = (int32_t) i; - } - if (u_batch.logits[i] == 0 || i == n_tokens - 1) { - if (i_first != -1) { - int i_last = u_batch.logits[i] == 0 ? i : i + 1; - // extract logits for the range [i_first, i_last) - // group the requests to minimize the number of calls to the backend - ggml_backend_tensor_get_async(backend_res, res, - logits_out + n_vocab*(cur_token + i_first), - i_first*n_vocab*sizeof(float), - (i_last - i_first)*n_vocab*sizeof(float)); - i_first = -1; - } - } -#ifndef NDEBUG - logits_valid[cur_token + i] = u_batch.logits[i] != 0;; -#endif - } - } else if (lctx.logits_all) { - ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float)); -#ifndef NDEBUG - std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true); -#endif - } else { - if (cur_token + n_tokens >= n_tokens_all) { - ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float)); -#ifndef NDEBUG - logits_valid[0] = true; -#endif - } + GGML_ASSERT(lctx.logits != nullptr); + + float * logits_out = lctx.logits + n_outputs_prev*n_vocab; + const int32_t n_outputs_new = lctx.n_outputs; + + if (n_outputs_new) { + GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs); + GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size); + ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float)); } } // extract embeddings - if (cparams.embeddings && embd) { + if (embd) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd); GGML_ASSERT(backend_embd != nullptr); @@ -9721,16 +10046,14 @@ static int llama_decode_internal( case LLAMA_POOLING_TYPE_NONE: { // extract token embeddings - auto & embd_out = lctx.embd; + GGML_ASSERT(lctx.embd != nullptr); + float * embd_out = lctx.embd + n_outputs_prev*n_embd; + const int32_t n_outputs_new = lctx.n_outputs; - if (u_batch.logits) { - //embd_out.resize(n_embd * n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - if (u_batch.logits[i] == 0) { - continue; - } - ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float)); - } + if (n_outputs_new) { + GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs); + GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size); + ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_CLS: @@ -9757,6 +10080,7 @@ static int llama_decode_internal( } break; } } + n_outputs_prev += lctx.n_outputs; } // wait for the computation to finish (automatically done when obtaining the model output) @@ -12452,7 +12776,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { new_type = GGML_TYPE_Q5_K; } else if (new_type != GGML_TYPE_Q8_0) { @@ -12463,7 +12788,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { new_type = qs.params->token_embedding_type; } else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { new_type = GGML_TYPE_Q2_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { @@ -12510,7 +12836,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else new_type = GGML_TYPE_Q2_K; } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; @@ -12529,7 +12855,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.model.hparams.n_expert == 8) { new_type = GGML_TYPE_Q5_K; } else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS; + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S; } } @@ -12696,7 +13022,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS || new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S || - new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S) { + new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S || + new_type == GGML_TYPE_IQ1_M) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { @@ -12714,6 +13041,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; @@ -12796,6 +13124,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break; case LLAMA_FTYPE_MOSTLY_IQ1_XS: default_type = GGML_TYPE_IQ1_S; break; + case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break; case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break; case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; @@ -12818,7 +13147,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s constexpr bool use_mmap = false; #endif - llama_model_loader ml(fname_inp, use_mmap, NULL); + llama_model_kv_override * kv_overrides = nullptr; + if (params->kv_overrides) { + auto v = (std::vector*)params->kv_overrides; + kv_overrides = v->data(); + } + llama_model_loader ml(fname_inp, use_mmap, kv_overrides); ml.init_mappings(false); // no prefetching? llama_model model; @@ -12847,6 +13181,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); gguf_set_val_u32(ctx_out, "general.file_type", ftype); + if (params->kv_overrides) { + const std::vector & overrides = *(const std::vector *)params->kv_overrides; + for (auto & o : overrides) { + if (o.key[0] == 0) break; + if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) { + gguf_set_val_f32(ctx_out, o.key, o.float_value); + } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) { + gguf_set_val_i32(ctx_out, o.key, o.int_value); + } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) { + gguf_set_val_bool(ctx_out, o.key, o.bool_value); + } else { + LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key); + } + } + } + for (int i = 0; i < ml.n_tensors; ++i) { const struct ggml_tensor * meta = ml.get_tensor_meta(i); @@ -12855,21 +13205,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // TODO: avoid hardcoded tensor names - use the TN_* constants if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { ++qs.n_attention_wv; - } - else if (name.find("ffn_down") != std::string::npos) { + } else if (name.find("ffn_down") != std::string::npos) { ++qs.n_ffn_down; - } - else if (name.find("ffn_gate") != std::string::npos) { + } else if (name.find("ffn_gate") != std::string::npos) { ++qs.n_ffn_gate; - } - else if (name.find("ffn_up") != std::string::npos) { + } else if (name.find("ffn_up") != std::string::npos) { ++qs.n_ffn_up; - } - else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { + } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { qs.has_output = true; } } - if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) { + if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t) qs.n_attention_wv != model.hparams.n_layer) { LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n", __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer); } @@ -12954,6 +13300,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (!params->pure && ggml_is_quantized(default_type)) { new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); } + else if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { + new_type = params->token_embedding_type; + } + else if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { + new_type = params->output_tensor_type; + } // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. @@ -12986,6 +13338,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ1_S || (new_type == GGML_TYPE_IQ2_S && strcmp(tensor->name, "token_embd.weight")) || + (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) || (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); @@ -13405,6 +13758,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.only_copy =*/ false, /*.pure =*/ false, /*.imatrix =*/ nullptr, + /*.kv_overrides =*/ nullptr, }; return result; @@ -13413,7 +13767,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { size_t llama_max_devices(void) { #if defined(GGML_USE_METAL) return 1; -#elif defined(GGML_USE_CUBLAS) +#elif defined(GGML_USE_CUDA) return GGML_CUDA_MAX_DEVICES; #elif defined(GGML_USE_SYCL) return GGML_SYCL_MAX_DEVICES; @@ -13433,8 +13787,8 @@ bool llama_supports_mlock(void) { } bool llama_supports_gpu_offload(void) { -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ - defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) +#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ + defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. return true; #else @@ -13541,7 +13895,7 @@ struct llama_context * llama_new_context_with_model( const auto & hparams = model->hparams; auto & cparams = ctx->cparams; - // TODO: maybe add n_seq_max here too + cparams.n_seq_max = std::max(1u, params.n_seq_max); cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch; cparams.yarn_ext_factor = params.yarn_ext_factor; @@ -13639,7 +13993,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(ctx->backend_metal); } -#elif defined(GGML_USE_CUBLAS) +#elif defined(GGML_USE_CUDA) if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); @@ -13743,25 +14097,12 @@ struct llama_context * llama_new_context_with_model( // graph outputs buffer { - // resized during inference, reserve maximum - ctx->logits_size = hparams.n_vocab*cparams.n_batch; - ctx->embd_size = params.embeddings ? hparams.n_embd*cparams.n_batch : 0; - - const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float); - - ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size); - if (ctx->buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__); + // resized during inference when a batch uses more outputs + if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) { + LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); llama_free(ctx); return nullptr; } - ggml_backend_buffer_clear(ctx->buf_output, 0); - - - ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output); - if (params.embeddings) { - ctx->embd = ctx->logits + ctx->logits_size; - } LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(ctx->buf_output), @@ -13786,7 +14127,7 @@ struct llama_context * llama_new_context_with_model( // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER; -#ifndef GGML_USE_CUBLAS +#ifndef GGML_USE_CUDA // pipeline parallelism requires support for async compute and events // currently this is only implemented in the CUDA backend pipeline_parallel = false; @@ -14278,27 +14619,33 @@ void llama_kv_cache_update(struct llama_context * ctx) { // Returns the *maximum* size of the state size_t llama_get_state_size(const struct llama_context * ctx) { + const auto & cparams = ctx->cparams; + const auto & hparams = ctx->model.hparams; + // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. // for reference, std::mt19937(1337) serializes to 6701 bytes. const size_t s_rng_size = sizeof(size_t); const size_t s_rng = LLAMA_MAX_RNG_STATE; + const size_t s_n_outputs = sizeof(size_t); + // assume worst case for outputs although only currently set ones are serialized + const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t); const size_t s_logits_size = sizeof(size_t); - // assume worst case for logits although only currently set ones are serialized - const size_t s_logits = ctx->logits_size * sizeof(float); + const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0; const size_t s_embedding_size = sizeof(size_t); - const size_t s_embedding = ctx->embd_size * sizeof(float); + const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0; const size_t s_kv_buf_size = sizeof(size_t); const size_t s_kv_head = sizeof(uint32_t); const size_t s_kv_size = sizeof(uint32_t); const size_t s_kv_used = sizeof(uint32_t); const size_t s_kv = ctx->kv_self.total_size(); - // TODO: assume the max is more than 1 seq_id per KV cell - const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id); + const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id); const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell; const size_t s_total = ( + s_rng_size + s_rng + + s_n_outputs + + s_output_pos + s_logits_size + s_logits + s_embedding_size @@ -14373,7 +14720,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat std::ostringstream rng_ss; rng_ss << ctx->rng; - const std::string & rng_str = rng_ss.str(); + const std::string & rng_str = rng_ss.str(); const size_t rng_size = rng_str.size(); GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); @@ -14382,25 +14729,61 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat data_ctx->write(rng_str.data(), rng_size); } - // copy logits + // copy outputs { - const size_t logits_size = ctx->logits_size; + // Can't use ctx->n_outputs because it's not for the + // entire last batch when n_ubatch is smaller than n_batch + size_t n_outputs = 0; - data_ctx->write(&logits_size, sizeof(logits_size)); + // copy output ids + { + std::vector output_pos; - if (logits_size) { - data_ctx->write(ctx->logits, logits_size * sizeof(float)); + const size_t n_batch = ctx->cparams.n_batch; + const auto & output_ids = ctx->output_ids; + + output_pos.resize(ctx->output_size); + + // build a more compact representation of the output ids + for (size_t i = 0; i < n_batch; ++i) { + // map an output id to a position in the batch + int32_t pos = output_ids[i]; + if (pos >= 0) { + if ((size_t) pos >= n_outputs) { + n_outputs = pos + 1; + } + GGML_ASSERT((size_t) pos < ctx->output_size); + output_pos[pos] = i; + } + } + + data_ctx->write(&n_outputs, sizeof(n_outputs)); + + if (n_outputs) { + data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t)); + } } - } - // copy embeddings - { - const size_t embeddings_size = ctx->embd_size; + // copy logits + { + const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab); - data_ctx->write(&embeddings_size, sizeof(embeddings_size)); + data_ctx->write(&logits_size, sizeof(logits_size)); - if (embeddings_size) { - data_ctx->write(ctx->embd, embeddings_size * sizeof(float)); + if (logits_size) { + data_ctx->write(ctx->logits, logits_size * sizeof(float)); + } + } + + // copy embeddings + { + const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd); + + data_ctx->write(&embeddings_size, sizeof(embeddings_size)); + + if (embeddings_size) { + data_ctx->write(ctx->embd, embeddings_size * sizeof(float)); + } } } @@ -14413,9 +14796,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s(); - const size_t kv_buf_size = kv_self.total_size(); + // NOTE: kv_size and kv_buf_size are mostly used for sanity checks const uint32_t kv_head = llama_kv_cache_cell_max(kv_self); const uint32_t kv_size = kv_self.size; + const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head; const uint32_t kv_used = kv_self.used; data_ctx->write(&kv_buf_size, sizeof(kv_buf_size)); @@ -14424,6 +14808,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat data_ctx->write(&kv_used, sizeof(kv_used)); if (kv_buf_size) { + const size_t pre_kv_buf_size = data_ctx->get_size_written(); + std::vector tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); @@ -14453,6 +14839,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat data_ctx->write(tmp_buf.data(), tmp_buf.size()); } } + GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size); } for (uint32_t i = 0; i < kv_head; ++i) { @@ -14497,6 +14884,28 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { GGML_ASSERT(!rng_ss.fail()); } + // set output ids + { + size_t n_outputs; + std::vector output_pos; + + memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs); + + GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs)); + + if (n_outputs) { + output_pos.resize(n_outputs); + memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t)); + inp += n_outputs * sizeof(int32_t); + + for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { + int32_t id = output_pos[i]; + GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch); + ctx->output_ids[id] = i; + } + } + } + // set logits { size_t logits_size; @@ -14517,7 +14926,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size); - GGML_ASSERT(ctx->embd_size == embeddings_size); + GGML_ASSERT(ctx->embd_size >= embeddings_size); if (embeddings_size) { memcpy(ctx->embd, inp, embeddings_size * sizeof(float)); @@ -14544,8 +14953,18 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); + if (kv_self.size != kv_size) { + // the KV cache needs to be big enough to load all the KV cells from the saved state + GGML_ASSERT(kv_self.size >= kv_head); + + LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n", + __func__, kv_head, kv_size, kv_self.size); + } + if (kv_buf_size) { - GGML_ASSERT(kv_self.total_size() == kv_buf_size); + const size_t pre_kv_buf_size = inp - src; + + GGML_ASSERT(kv_self.total_size() >= kv_buf_size); for (int il = 0; il < (int) n_layer; ++il) { const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); @@ -14565,23 +14984,21 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { // v is not contiguous, copy row by row const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size); + const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size); inp += v_row_size; } } + GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size); } - GGML_ASSERT(kv_self.size == kv_size); + llama_kv_cache_clear(ctx); ctx->kv_self.head = kv_head; - ctx->kv_self.size = kv_size; ctx->kv_self.used = kv_used; - ctx->kv_self.cells.resize(kv_size); - for (uint32_t i = 0; i < kv_head; ++i) { llama_pos pos; size_t seq_id_size; @@ -14598,11 +15015,6 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { ctx->kv_self.cells[i].seq_id.insert(seq_id); } } - - for (uint32_t i = kv_head; i < kv_size; ++i) { - ctx->kv_self.cells[i].pos = -1; - ctx->kv_self.cells[i].seq_id.clear(); - } } const size_t nread = inp - src; @@ -14808,11 +15220,33 @@ float * llama_get_logits(struct llama_context * ctx) { } float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { - assert(ctx->logits_valid.at(i)); - llama_synchronize(ctx); - return ctx->logits + i*ctx->model.hparams.n_vocab; + try { + if (ctx->logits == nullptr) { + throw std::runtime_error("no logits"); + } + if ((size_t) i >= ctx->output_ids.size()) { + throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size())); + } + const int32_t j = ctx->output_ids[i]; + + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if ((size_t) j >= ctx->output_size) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size)); + } + + return ctx->logits + j*ctx->model.hparams.n_vocab; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ASSERT(false); +#endif + return nullptr; + } } float * llama_get_embeddings(struct llama_context * ctx) { @@ -14824,7 +15258,31 @@ float * llama_get_embeddings(struct llama_context * ctx) { float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { llama_synchronize(ctx); - return ctx->embd + i*ctx->model.hparams.n_embd; + try { + if (ctx->embd == nullptr) { + throw std::runtime_error("no embeddings"); + } + if ((size_t) i >= ctx->output_ids.size()) { + throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size())); + } + const int32_t j = ctx->output_ids[i]; + + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if ((size_t) j >= ctx->output_size) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, output_size=%lu)", j, ctx->output_size)); + } + + return ctx->embd + j*ctx->model.hparams.n_embd; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ASSERT(false); +#endif + return nullptr; + } } float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) { diff --git a/llama.h b/llama.h index da9597b67..4379a59a9 100644 --- a/llama.h +++ b/llama.h @@ -39,7 +39,7 @@ #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 4 +#define LLAMA_SESSION_VERSION 5 #ifdef __cplusplus extern "C" { @@ -117,6 +117,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_XS = 32, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file @@ -285,6 +286,7 @@ extern "C" { bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored bool pure; // quantize all tensors to the default type void * imatrix; // pointer to importance matrix data + void * kv_overrides; // pointer to vector containing overrides } llama_model_quantize_params; // grammar types @@ -677,23 +679,29 @@ extern "C" { LLAMA_API void llama_synchronize(struct llama_context * ctx); // Token logits obtained from the last call to llama_decode() - // The logits for the last token are stored in the last row - // Logits for which llama_batch.logits[i] == 0 are undefined - // Rows: n_tokens provided with llama_batch + // The logits for which llama_batch.logits[i] != 0 are stored contiguously + // in the order they have appeared in the batch. + // Rows: number of tokens for which llama_batch.logits[i] != 0 // Cols: n_vocab LLAMA_API float * llama_get_logits(struct llama_context * ctx); // Logits for the ith token. Equivalent to: - // llama_get_logits(ctx) + i*n_vocab + // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab + // returns NULL for invalid ids. LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); - // Get all output token embeddings - // shape: [n_tokens*n_embd] (1-dimensional) + // Get all output token embeddings. + // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model, + // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously + // in the order they have appeared in the batch. + // shape: [n_outputs*n_embd] + // Otherwise, returns NULL. LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); - // Get the embeddings for the ith token - // llama_get_embeddings(ctx) + i*n_embd + // Get the embeddings for the ith token. Equivalent to: + // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd // shape: [n_embd] (1-dimensional) + // returns NULL for invalid ids. LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); // Get the embeddings for a sequence id diff --git a/scripts/LlamaConfig.cmake.in b/scripts/LlamaConfig.cmake.in index 6a6d8e39e..f842c7137 100644 --- a/scripts/LlamaConfig.cmake.in +++ b/scripts/LlamaConfig.cmake.in @@ -3,7 +3,7 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) set(LLAMA_BLAS @LLAMA_BLAS@) -set(LLAMA_CUBLAS @LLAMA_CUBLAS@) +set(LLAMA_CUDA @LLAMA_CUDA@) set(LLAMA_METAL @LLAMA_METAL@) set(LLAMA_MPI @LLAMA_MPI@) set(LLAMA_CLBLAST @LLAMA_CLBLAST@) @@ -27,7 +27,7 @@ if (LLAMA_BLAS) find_package(BLAS REQUIRED) endif() -if (LLAMA_CUBLAS) +if (LLAMA_CUDA) find_package(CUDAToolkit REQUIRED) endif() diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh index 331c4b9ce..d1272506c 100755 --- a/scripts/compare-commits.sh +++ b/scripts/compare-commits.sh @@ -23,7 +23,7 @@ fi make_opts="" if [[ "$backend" == "cuda" ]]; then - make_opts="LLAMA_CUBLAS=1" + make_opts="LLAMA_CUDA=1" fi git checkout $1 diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh index 6cf1ab4f3..2058ceabf 100644 --- a/scripts/pod-llama.sh +++ b/scripts/pod-llama.sh @@ -42,7 +42,7 @@ git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -LLAMA_CUBLAS=1 make -j +LLAMA_CUDA=1 make -j ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b @@ -60,7 +60,7 @@ cd /workspace/llama.cpp mkdir build-cublas cd build-cublas -cmake -DLLAMA_CUBLAS=1 ../ +cmake -DLLAMA_CUDA=1 ../ make -j if [ "$1" -eq "0" ]; then @@ -186,17 +186,17 @@ if [ "$1" -eq "1" ]; then # batched cd /workspace/llama.cpp - LLAMA_CUBLAS=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 + LLAMA_CUDA=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 # batched-bench cd /workspace/llama.cpp - LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 + LLAMA_CUDA=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 # parallel cd /workspace/llama.cpp - LLAMA_CUBLAS=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb + LLAMA_CUDA=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb fi @@ -204,10 +204,10 @@ fi #if [ "$1" -eq "7" ]; then # cd /workspace/llama.cpp # -# LLAMA_CUBLAS=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 +# LLAMA_CUDA=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 #fi # more benches -#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 -#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh index 30bbac321..eb6ce458e 100644 --- a/scripts/server-llm.sh +++ b/scripts/server-llm.sh @@ -380,7 +380,7 @@ fi if [[ "$backend" == "cuda" ]]; then printf "[+] Building with CUDA backend\n" - LLAMA_CUBLAS=1 make -j server $log + LLAMA_CUDA=1 make -j server $log elif [[ "$backend" == "cpu" ]]; then printf "[+] Building with CPU backend\n" make -j server $log diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 1998e1cbc..5dfea5662 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1960,7 +1960,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, + GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, };