diff --git a/.clang-tidy b/.clang-tidy index 3078beacc..952c0cca8 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -12,6 +12,7 @@ Checks: > -readability-implicit-bool-conversion, -readability-magic-numbers, -readability-uppercase-literal-suffix, + -readability-simplify-boolean-expr, clang-analyzer-*, -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, performance-*, diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index 77a9ddc14..059fd2695 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build ARG CUDA_DOCKER_ARCH=all RUN apt-get update && \ - apt-get install -y build-essential python3 python3-pip git + apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev COPY requirements.txt requirements.txt COPY requirements requirements @@ -26,8 +26,10 @@ COPY . . # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -# Enable cuBLAS -ENV LLAMA_CUBLAS=1 +# Enable CUDA +ENV LLAMA_CUDA=1 +# Enable cURL +ENV LLAMA_CURL=1 RUN make diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile index 8b9633dc4..6ecf3bcc7 100644 --- a/.devops/full-rocm.Dockerfile +++ b/.devops/full-rocm.Dockerfile @@ -40,6 +40,11 @@ ENV LLAMA_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ +# Enable cURL +ENV LLAMA_CURL=1 +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev + RUN make ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile index cef1297d3..432fb5dad 100644 --- a/.devops/full.Dockerfile +++ b/.devops/full.Dockerfile @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION as build RUN apt-get update && \ - apt-get install -y build-essential python3 python3-pip git + apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev COPY requirements.txt requirements.txt COPY requirements requirements @@ -15,6 +15,9 @@ WORKDIR /app COPY . . +ENV LLAMA_CURL=1 + + RUN make ENV LC_ALL=C.utf8 diff --git a/.devops/llama-cpp-clblast.srpm.spec b/.devops/llama-cpp-clblast.srpm.spec index 076f29695..774f63ddd 100644 --- a/.devops/llama-cpp-clblast.srpm.spec +++ b/.devops/llama-cpp-clblast.srpm.spec @@ -1,5 +1,5 @@ # SRPM for building from source and packaging an RPM for RPM-based distros. -# https://fedoraproject.org/wiki/How_to_create_an_RPM_package +# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages # Built and maintained by John Boero - boeroboy@gmail.com # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal diff --git a/.devops/llama-cpp-cublas.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec similarity index 79% rename from .devops/llama-cpp-cublas.srpm.spec rename to .devops/llama-cpp-cuda.srpm.spec index f847ebb1e..ba9cb7cbb 100644 --- a/.devops/llama-cpp-cublas.srpm.spec +++ b/.devops/llama-cpp-cuda.srpm.spec @@ -1,5 +1,5 @@ # SRPM for building from source and packaging an RPM for RPM-based distros. -# https://fedoraproject.org/wiki/How_to_create_an_RPM_package +# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages # Built and maintained by John Boero - boeroboy@gmail.com # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal @@ -12,7 +12,7 @@ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries. # It is up to the user to install the correct vendor-specific support. -Name: llama.cpp-cublas +Name: llama.cpp-cuda Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) @@ -32,16 +32,16 @@ CPU inference for Meta's Lllama2 models using default options. %setup -n llama.cpp-master %build -make -j LLAMA_CUBLAS=1 +make -j LLAMA_CUDA=1 %install mkdir -p %{buildroot}%{_bindir}/ -cp -p main %{buildroot}%{_bindir}/llamacppcublas -cp -p server %{buildroot}%{_bindir}/llamacppcublasserver -cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple +cp -p main %{buildroot}%{_bindir}/llamacppcuda +cp -p server %{buildroot}%{_bindir}/llamacppcudaserver +cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple mkdir -p %{buildroot}/usr/lib/systemd/system -%{__cat} < %{buildroot}/usr/lib/systemd/system/llamacublas.service +%{__cat} < %{buildroot}/usr/lib/systemd/system/llamacuda.service [Unit] Description=Llama.cpp server, CPU only (no GPU support in this build). After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target @@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t [Service] Type=simple EnvironmentFile=/etc/sysconfig/llama -ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS +ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS ExecReload=/bin/kill -s HUP $MAINPID Restart=never @@ -67,10 +67,10 @@ rm -rf %{buildroot} rm -rf %{_builddir}/* %files -%{_bindir}/llamacppcublas -%{_bindir}/llamacppcublasserver -%{_bindir}/llamacppcublassimple -/usr/lib/systemd/system/llamacublas.service +%{_bindir}/llamacppcuda +%{_bindir}/llamacppcudaserver +%{_bindir}/llamacppcudasimple +/usr/lib/systemd/system/llamacuda.service %config /etc/sysconfig/llama %pre diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec index 446213d69..1d9e4f425 100644 --- a/.devops/llama-cpp.srpm.spec +++ b/.devops/llama-cpp.srpm.spec @@ -1,5 +1,5 @@ # SRPM for building from source and packaging an RPM for RPM-based distros. -# https://fedoraproject.org/wiki/How_to_create_an_RPM_package +# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages # Built and maintained by John Boero - boeroboy@gmail.com # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile index 2b7faf7c1..b937a4829 100644 --- a/.devops/main-cuda.Dockerfile +++ b/.devops/main-cuda.Dockerfile @@ -20,8 +20,8 @@ COPY . . # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -# Enable cuBLAS -ENV LLAMA_CUBLAS=1 +# Enable CUDA +ENV LLAMA_CUDA=1 RUN make diff --git a/.devops/main-intel.Dockerfile b/.devops/main-intel.Dockerfile new file mode 100644 index 000000000..572e5d8ea --- /dev/null +++ b/.devops/main-intel.Dockerfile @@ -0,0 +1,28 @@ +ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04 + +FROM intel/oneapi-basekit:$ONEAPI_VERSION as build + +ARG LLAMA_SYCL_F16=OFF +RUN apt-get update && \ + apt-get install -y git + +WORKDIR /app + +COPY . . + +RUN mkdir build && \ + cd build && \ + if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ + echo "LLAMA_SYCL_F16 is set" && \ + export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ + fi && \ + cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ + cmake --build . --config Release --target main + +FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime + +COPY --from=build /app/build/bin/main /main + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/main" ] diff --git a/.devops/main-vulkan.Dockerfile b/.devops/main-vulkan.Dockerfile new file mode 100644 index 000000000..bca460365 --- /dev/null +++ b/.devops/main-vulkan.Dockerfile @@ -0,0 +1,29 @@ +ARG UBUNTU_VERSION=jammy + +FROM ubuntu:$UBUNTU_VERSION as build + +# Install build tools +RUN apt update && apt install -y git build-essential cmake wget + +# Install Vulkan SDK +RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ + apt update -y && \ + apt-get install -y vulkan-sdk + +# Build it +WORKDIR /app +COPY . . +RUN mkdir build && \ + cd build && \ + cmake .. -DLLAMA_VULKAN=1 && \ + cmake --build . --config Release --target main + +# Clean up +WORKDIR / +RUN cp /app/build/bin/main /main && \ + rm -rf /app + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/main" ] diff --git a/.devops/nix/docker.nix b/.devops/nix/docker.nix new file mode 100644 index 000000000..d607b4575 --- /dev/null +++ b/.devops/nix/docker.nix @@ -0,0 +1,37 @@ +{ + lib, + dockerTools, + buildEnv, + llama-cpp, + interactive ? true, + coreutils, +}: + +# A tar that can be fed into `docker load`: +# +# $ nix build .#llamaPackages.docker +# $ docker load < result + +# For details and variations cf. +# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage +# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922 +# - https://nixery.dev/ + +# Approximate (compressed) sizes, at the time of writing, are: +# +# .#llamaPackages.docker: 125M; +# .#llamaPackagesCuda.docker: 537M; +# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M. + +dockerTools.buildLayeredImage { + name = llama-cpp.pname; + tag = "latest"; + + contents = + [ llama-cpp ] + ++ lib.optionals interactive [ + coreutils + dockerTools.binSh + dockerTools.caCertificates + ]; +} diff --git a/.devops/nix/nixpkgs-instances.nix b/.devops/nix/nixpkgs-instances.nix index 6e9872b28..4a2f81c4b 100644 --- a/.devops/nix/nixpkgs-instances.nix +++ b/.devops/nix/nixpkgs-instances.nix @@ -7,6 +7,18 @@ { system, ... }: { _module.args = { + # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs + # again, the below creates several nixpkgs instances which the + # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`. + # + # This is currently "slow" and "expensive", on a certain scale. + # This also isn't "right" in that this hinders dependency injection at + # the level of flake inputs. This might get removed in the foreseeable + # future. + # + # Note that you can use these expressions without Nix + # (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point). + pkgsCuda = import inputs.nixpkgs { inherit system; # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc, diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 43bdbd755..2c0ae4e2a 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -1,31 +1,43 @@ { lib, + glibc, config, stdenv, mkShell, + runCommand, cmake, ninja, pkg-config, git, python3, mpi, - openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations + blas, cudaPackages, darwin, rocmPackages, + vulkan-headers, + vulkan-loader, clblast, useBlas ? builtins.all (x: !x) [ useCuda useMetalKit useOpenCL useRocm - ], + useVulkan + ] && blas.meta.available, useCuda ? config.cudaSupport, useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL, useMpi ? false, # Increases the runtime closure size by ~700M useOpenCL ? false, useRocm ? config.rocmSupport, + useVulkan ? false, llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake + + # It's necessary to consistently use backendStdenv when building with CUDA support, + # otherwise we get libstdc++ errors downstream. + effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv, + enableStatic ? effectiveStdenv.hostPlatform.isStatic, + precompileMetalShaders ? false }@inputs: let @@ -37,10 +49,7 @@ let versionOlder ; - # It's necessary to consistently use backendStdenv when building with CUDA support, - # otherwise we get libstdc++ errors downstream. stdenv = throw "Use effectiveStdenv instead"; - effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv; suffices = lib.optionals useBlas [ "BLAS" ] @@ -48,7 +57,8 @@ let ++ lib.optionals useMetalKit [ "MetalKit" ] ++ lib.optionals useMpi [ "MPI" ] ++ lib.optionals useOpenCL [ "OpenCL" ] - ++ lib.optionals useRocm [ "ROCm" ]; + ++ lib.optionals useRocm [ "ROCm" ] + ++ lib.optionals useVulkan [ "Vulkan" ]; pnameSuffix = strings.optionalString (suffices != [ ]) @@ -57,10 +67,15 @@ let strings.optionalString (suffices != [ ]) ", accelerated with ${strings.concatStringsSep ", " suffices}"; + executableSuffix = effectiveStdenv.hostPlatform.extensions.executable; + # TODO: package the Python in this repository in a Nix-like way. # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo # is PEP 517-compatible, and ensure the correct .dist-info is generated. # https://peps.python.org/pep-0517/ + # + # TODO: Package up each Python script or service appropriately, by making + # them into "entrypoints" llama-python = python3.withPackages ( ps: [ ps.numpy @@ -73,11 +88,17 @@ let ps: [ ps.numpy ps.sentencepiece + ps.tiktoken ps.torchWithoutCuda ps.transformers ] ); + xcrunHost = runCommand "xcrunHost" {} '' + mkdir -p $out/bin + ln -s /usr/bin/xcrun $out/bin + ''; + # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64 # separately darwinBuildInputs = @@ -107,6 +128,11 @@ let hipblas rocblas ]; + + vulkanBuildInputs = [ + vulkan-headers + vulkan-loader + ]; in effectiveStdenv.mkDerivation ( @@ -114,27 +140,40 @@ effectiveStdenv.mkDerivation ( pname = "llama-cpp${pnameSuffix}"; version = llamaVersion; + # Note: none of the files discarded here are visible in the sandbox or + # affect the output hash. This also means they can be modified without + # triggering a rebuild. src = lib.cleanSourceWith { filter = name: type: - !(builtins.any (_: _) [ + let + noneOf = builtins.all (x: !x); + baseName = baseNameOf name; + in + noneOf [ (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths - (name == "README.md") # Ignore *.md changes whe computing outPaths - (lib.hasPrefix "." name) # Skip hidden files and directories - ]); + (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths + (lib.hasPrefix "." baseName) # Skip hidden files and directories + (baseName == "flake.lock") + ]; src = lib.cleanSource ../../.; }; postPatch = '' substituteInPlace ./ggml-metal.m \ --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" - - # TODO: Package up each Python script or service appropriately. - # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`, - # we could make those *.py into setuptools' entrypoints - substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python" + substituteInPlace ./ggml-metal.m \ + --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";" ''; + # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015, + # `default.metallib` may be compiled with Metal compiler from XCode + # and we need to escape sandbox on MacOS to access Metal compiler. + # `xcrun` is used find the path of the Metal compiler, which is varible + # and not on $PATH + # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion + __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders; + nativeBuildInputs = [ cmake @@ -148,6 +187,11 @@ effectiveStdenv.mkDerivation ( # TODO: Replace with autoAddDriverRunpath # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged cudaPackages.autoAddOpenGLRunpathHook + ] + ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ + glibc.static + ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ + xcrunHost ]; buildInputs = @@ -155,20 +199,24 @@ effectiveStdenv.mkDerivation ( ++ optionals useCuda cudaBuildInputs ++ optionals useMpi [ mpi ] ++ optionals useOpenCL [ clblast ] - ++ optionals useRocm rocmBuildInputs; + ++ optionals useRocm rocmBuildInputs + ++ optionals useBlas [ blas ] + ++ optionals useVulkan vulkanBuildInputs; cmakeFlags = [ - (cmakeBool "LLAMA_NATIVE" true) + (cmakeBool "LLAMA_NATIVE" false) (cmakeBool "LLAMA_BUILD_SERVER" true) - (cmakeBool "BUILD_SHARED_LIBS" true) + (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic)) (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) (cmakeBool "LLAMA_BLAS" useBlas) (cmakeBool "LLAMA_CLBLAST" useOpenCL) - (cmakeBool "LLAMA_CUBLAS" useCuda) + (cmakeBool "LLAMA_CUDA" useCuda) (cmakeBool "LLAMA_HIPBLAS" useRocm) (cmakeBool "LLAMA_METAL" useMetalKit) (cmakeBool "LLAMA_MPI" useMpi) + (cmakeBool "LLAMA_VULKAN" useVulkan) + (cmakeBool "LLAMA_STATIC" enableStatic) ] ++ optionals useCuda [ ( @@ -188,14 +236,16 @@ effectiveStdenv.mkDerivation ( # Should likely use `rocmPackages.clr.gpuTargets`. "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" ] - ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ] - ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ]; + ++ optionals useMetalKit [ + (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") + (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders)) + ]; # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, # if they haven't been added yet. postInstall = '' - mv $out/bin/main $out/bin/llama - mv $out/bin/server $out/bin/llama-server + mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix} + mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix} mkdir -p $out/include cp $src/llama.h $out/include/ ''; @@ -209,6 +259,7 @@ effectiveStdenv.mkDerivation ( useMpi useOpenCL useRocm + useVulkan ; shell = mkShell { @@ -216,6 +267,9 @@ effectiveStdenv.mkDerivation ( description = "contains numpy and sentencepiece"; buildInputs = [ llama-python ]; inputsFrom = [ finalAttrs.finalPackage ]; + shellHook = '' + addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib" + ''; }; shell-extra = mkShell { diff --git a/.devops/nix/scope.nix b/.devops/nix/scope.nix index 7932ac1e8..78530c9e8 100644 --- a/.devops/nix/scope.nix +++ b/.devops/nix/scope.nix @@ -4,9 +4,16 @@ llamaVersion ? "0.0.0", }: +# We're using `makeScope` instead of just writing out an attrset +# because it allows users to apply overlays later using `overrideScope'`. +# Cf. https://noogle.dev/f/lib/makeScope + lib.makeScope newScope ( self: { inherit llamaVersion; llama-cpp = self.callPackage ./package.nix { }; + docker = self.callPackage ./docker.nix { }; + docker-min = self.callPackage ./docker.nix { interactive = false; }; + sif = self.callPackage ./sif.nix { }; } ) diff --git a/.devops/nix/sif.nix b/.devops/nix/sif.nix new file mode 100644 index 000000000..7a5e1dd0f --- /dev/null +++ b/.devops/nix/sif.nix @@ -0,0 +1,27 @@ +{ + lib, + singularity-tools, + llama-cpp, + bashInteractive, + interactive ? false, +}: + +let + optionalInt = cond: x: if cond then x else 0; +in +singularity-tools.buildImage rec { + inherit (llama-cpp) name; + contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ]; + + # These are excessive (but safe) for most variants. Building singularity + # images requires superuser privileges, so we build them inside a VM in a + # writable image of pre-determined size. + # + # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846 + # + # Expected image sizes: + # - cpu/blas: 150M, + # - cuda, all gencodes: 560M, + diskSize = 4096 + optionalInt llama-cpp.useRocm 16384; + memSize = diskSize; +} diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile new file mode 100644 index 000000000..59a52ba21 --- /dev/null +++ b/.devops/server-cuda.Dockerfile @@ -0,0 +1,37 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=11.7.1 +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +# Target the CUDA runtime image +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. +ARG CUDA_DOCKER_ARCH=all + +RUN apt-get update && \ + apt-get install -y build-essential git libcurl4-openssl-dev + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} +# Enable CUDA +ENV LLAMA_CUDA=1 +# Enable cURL +ENV LLAMA_CURL=1 + +RUN make + +FROM ${BASE_CUDA_RUN_CONTAINER} as runtime + +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev + +COPY --from=build /app/server /server + +ENTRYPOINT [ "/server" ] diff --git a/.devops/server-intel.Dockerfile b/.devops/server-intel.Dockerfile new file mode 100644 index 000000000..304487335 --- /dev/null +++ b/.devops/server-intel.Dockerfile @@ -0,0 +1,31 @@ +ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04 + +FROM intel/oneapi-basekit:$ONEAPI_VERSION as build + +ARG LLAMA_SYCL_F16=OFF +RUN apt-get update && \ + apt-get install -y git libcurl4-openssl-dev + +WORKDIR /app + +COPY . . + +RUN mkdir build && \ + cd build && \ + if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ + echo "LLAMA_SYCL_F16 is set" && \ + export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ + fi && \ + cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ + cmake --build . --config Release --target server + +FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime + +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev + +COPY --from=build /app/build/bin/server /server + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/server" ] diff --git a/.devops/server-rocm.Dockerfile b/.devops/server-rocm.Dockerfile new file mode 100644 index 000000000..c02a31dd8 --- /dev/null +++ b/.devops/server-rocm.Dockerfile @@ -0,0 +1,50 @@ +ARG UBUNTU_VERSION=22.04 + +# This needs to generally match the container host's environment. +ARG ROCM_VERSION=5.6 + +# Target the CUDA build image +ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete + +FROM ${BASE_ROCM_DEV_CONTAINER} as build + +# Unless otherwise specified, we make a fat build. +# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# This is mostly tied to rocBLAS supported archs. +ARG ROCM_DOCKER_ARCH=\ + gfx803 \ + gfx900 \ + gfx906 \ + gfx908 \ + gfx90a \ + gfx1010 \ + gfx1030 \ + gfx1100 \ + gfx1101 \ + gfx1102 + +COPY requirements.txt requirements.txt +COPY requirements requirements + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Set nvcc architecture +ENV GPU_TARGETS=${ROCM_DOCKER_ARCH} +# Enable ROCm +ENV LLAMA_HIPBLAS=1 +ENV CC=/opt/rocm/llvm/bin/clang +ENV CXX=/opt/rocm/llvm/bin/clang++ + +# Enable cURL +ENV LLAMA_CURL=1 +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev + +RUN make + +ENTRYPOINT [ "/app/server" ] diff --git a/.devops/server-vulkan.Dockerfile b/.devops/server-vulkan.Dockerfile new file mode 100644 index 000000000..7e5a5283b --- /dev/null +++ b/.devops/server-vulkan.Dockerfile @@ -0,0 +1,33 @@ +ARG UBUNTU_VERSION=jammy + +FROM ubuntu:$UBUNTU_VERSION as build + +# Install build tools +RUN apt update && apt install -y git build-essential cmake wget + +# Install Vulkan SDK +RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ + apt update -y && \ + apt-get install -y vulkan-sdk + +# Install cURL +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev + +# Build it +WORKDIR /app +COPY . . +RUN mkdir build && \ + cd build && \ + cmake .. -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \ + cmake --build . --config Release --target server + +# Clean up +WORKDIR / +RUN cp /app/build/bin/server /server && \ + rm -rf /app + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/server" ] diff --git a/.devops/server.Dockerfile b/.devops/server.Dockerfile new file mode 100644 index 000000000..be964e0e8 --- /dev/null +++ b/.devops/server.Dockerfile @@ -0,0 +1,25 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION as build + +RUN apt-get update && \ + apt-get install -y build-essential git libcurl4-openssl-dev + +WORKDIR /app + +COPY . . + +ENV LLAMA_CURL=1 + +RUN make + +FROM ubuntu:$UBUNTU_VERSION as runtime + +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev + +COPY --from=build /app/server /server + +ENV LC_ALL=C.utf8 + +ENTRYPOINT [ "/server" ] diff --git a/.ecrc b/.ecrc index b682057dd..a3351f4e6 100644 --- a/.ecrc +++ b/.ecrc @@ -1,4 +1,5 @@ { + "Exclude": ["^\\.gitmodules$"], "Disable": { "IndentSize": true } diff --git a/.flake8 b/.flake8 index 113ca5fd3..18fba2c15 100644 --- a/.flake8 +++ b/.flake8 @@ -1,2 +1,3 @@ [flake8] max-line-length = 125 +ignore = W503 diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md index ce69e6395..49812832c 100644 --- a/.github/ISSUE_TEMPLATE/bug.md +++ b/.github/ISSUE_TEMPLATE/bug.md @@ -7,3 +7,5 @@ assignees: '' --- Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug. + +If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests). diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml new file mode 100644 index 000000000..758796632 --- /dev/null +++ b/.github/workflows/bench.yml @@ -0,0 +1,300 @@ +# Benchmark +name: Benchmark + +on: + workflow_dispatch: + inputs: + gpu-series: + description: 'Azure GPU series to run with' + required: true + type: choice + options: + - Standard_NC4as_T4_v3 + - Standard_NC24ads_A100_v4 + - Standard_NC80adis_H100_v5 + sha: + description: 'Commit SHA1 to build' + required: false + type: string + duration: + description: 'Duration of the bench' + type: string + default: 10m + + push: + branches: + - master + paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + pull_request_target: + types: [opened, synchronize, reopened] + paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + schedule: + - cron: '04 2 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }} + cancel-in-progress: true + +jobs: + bench-server-baseline: + runs-on: Standard_NC4as_T4_v3 + env: + RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it + N_USERS: 8 + DURATION: 10m + + strategy: + matrix: + model: [phi-2] + ftype: [q4_0, q8_0, f16] + include: + - model: phi-2 + ftype: q4_0 + pr_comment_enabled: "true" + + if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }} + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Install python env + id: pipenv + run: | + cd examples/server/bench + python3 -m venv venv + source venv/bin/activate + pip install -r requirements.txt + + - name: Prometheus + id: install_prometheus + run: | + wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz + tar xzf prometheus*.tar.gz --strip-components=1 + ./prometheus --config.file=examples/server/bench/prometheus.yml & + while ! nc -z localhost 9090; do + sleep 0.1 + done + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.21' + + - name: Install k6 and xk6-sse + id: k6_installation + run: | + cd examples/server/bench + go install go.k6.io/xk6/cmd/xk6@latest + xk6 build master \ + --with github.com/phymbert/xk6-sse + + - name: Build + id: cmake_build + run: | + set -eux + mkdir build + cd build + cmake .. \ + -DLLAMA_NATIVE=OFF \ + -DLLAMA_BUILD_SERVER=ON \ + -DLLAMA_CURL=ON \ + -DLLAMA_CUBLAS=ON \ + -DCUDAToolkit_ROOT=/usr/local/cuda \ + -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ + -DCMAKE_CUDA_ARCHITECTURES=75 \ + -DLLAMA_FATAL_WARNINGS=OFF \ + -DLLAMA_ALL_WARNINGS=OFF \ + -DCMAKE_BUILD_TYPE=Release; + cmake --build . --config Release -j $(nproc) --target server + + - name: Download the dataset + id: download_dataset + run: | + cd examples/server/bench + wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + + - name: Server bench + id: server_bench + run: | + set -eux + + cd examples/server/bench + source venv/bin/activate + python bench.py \ + --runner-label ${{ env.RUNNER_LABEL }} \ + --name ${{ github.job }} \ + --branch ${{ github.head_ref || github.ref_name }} \ + --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \ + --scenario script.js \ + --duration ${{ github.event.inputs.duration || env.DURATION }} \ + --hf-repo ggml-org/models \ + --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \ + --model-path-prefix /models \ + --parallel ${{ env.N_USERS }} \ + -ngl 33 \ + --batch-size 2048 \ + --ubatch-size 256 \ + --ctx-size 16384 \ + --n-prompts 1000 \ + --max-prompt-tokens 1024 \ + --max-tokens 2048 + + cat results.github.env >> $GITHUB_ENV + + # Remove dataset as we do not want it in the artefact + rm ShareGPT_V3_unfiltered_cleaned_split.json + + - uses: actions/upload-artifact@v4 + with: + name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} + compression-level: 9 + path: | + examples/server/bench/*.jpg + examples/server/bench/*.json + examples/server/bench/*.log + + - name: Commit status + uses: Sibz/github-status-action@v1 + with: + authToken: ${{secrets.GITHUB_TOKEN}} + sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }} + context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} + description: | + ${{ env.BENCH_RESULTS }} + state: 'success' + + - name: Upload benchmark images + uses: devicons/public-upload-to-imgur@v2.2.2 + continue-on-error: true # Important as it looks unstable: 503 + id: imgur_step + with: + client_id: ${{secrets.IMGUR_CLIENT_ID}} + path: | + examples/server/bench/prompt_tokens_seconds.jpg + examples/server/bench/predicted_tokens_seconds.jpg + examples/server/bench/kv_cache_usage_ratio.jpg + examples/server/bench/requests_processing.jpg + + - name: Extract mermaid + id: set_mermaid + run: | + set -eux + + cd examples/server/bench + PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid) + echo "PROMPT_TOKENS_SECONDS<> $GITHUB_ENV + echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid) + echo "PREDICTED_TOKENS_SECONDS<> $GITHUB_ENV + echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid) + echo "KV_CACHE_USAGE_RATIO<> $GITHUB_ENV + echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + REQUESTS_PROCESSING=$(cat requests_processing.mermaid) + echo "REQUESTS_PROCESSING<> $GITHUB_ENV + echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + - name: Extract image url + id: extract_image_url + continue-on-error: true + run: | + set -eux + + echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV + echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV + echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV + echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV + + - name: Comment PR + uses: mshick/add-pr-comment@v2 + id: comment_pr + if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }} + with: + message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} + message: | +

+ + 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 + +

+ +
+ + Expand details for performance related PR only + + - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }} + - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} + - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s + - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s + - ${{ env.BENCH_GRAPH_XLABEL }} + + +

+ + prompt_tokens_seconds + +

+ + More + + ```mermaid + ${{ env.PROMPT_TOKENS_SECONDS }} + ``` + +
+ + predicted_tokens_seconds + +
+ More + + ```mermaid + ${{ env.PREDICTED_TOKENS_SECONDS }} + ``` + +
+ +

+ +
+ + Details + +

+ + kv_cache_usage_ratio + +

+ More + + ```mermaid + ${{ env.KV_CACHE_USAGE_RATIO }} + ``` + +
+ + requests_processing + +
+ More + + ```mermaid + ${{ env.REQUESTS_PROCESSING }} + ``` + +
+ +

+
+
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0a28a1111..ff7238aba 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,19 +15,138 @@ on: types: [opened, synchronize, reopened] paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m'] +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GGML_NLOOP: 3 GGML_N_THREADS: 1 jobs: - ubuntu-focal-make: - runs-on: ubuntu-20.04 + macOS-latest-cmake-arm64: + runs-on: macos-14 steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + + - name: Build + id: cmake_build + run: | + sysctl -a + mkdir build + cd build + cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON .. + cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 + + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + cp LICENSE ./build/bin/ + zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/* + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip + name: llama-bin-macos-arm64.zip + + macOS-latest-cmake-x64: + runs-on: macos-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + + - name: Build + id: cmake_build + run: | + sysctl -a + mkdir build + cd build + cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON .. + cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 + + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + cp LICENSE ./build/bin/ + zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/* + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip + name: llama-bin-macos-x64.zip + + ubuntu-focal-make: + runs-on: ubuntu-20.04 + env: + LLAMA_NODE_AVAILABLE: true + LLAMA_PYTHON_AVAILABLE: true + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -35,8 +154,18 @@ jobs: sudo apt-get update sudo apt-get install build-essential gcc-8 + - uses: actions/setup-node@v4 + with: + node-version: "20" + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Build id: make_build + env: + LLAMA_FATAL_WARNINGS: 1 run: | CC=gcc-8 make -j $(nproc) @@ -46,13 +175,35 @@ jobs: CC=gcc-8 make tests -j $(nproc) make test -j $(nproc) + ubuntu-focal-make-curl: + runs-on: ubuntu-20.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev + + - name: Build + id: make_build + env: + LLAMA_FATAL_WARNINGS: 1 + LLAMA_CURL: 1 + run: | + CC=gcc-8 make -j $(nproc) + ubuntu-latest-cmake: runs-on: ubuntu-latest steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -65,49 +216,60 @@ jobs: run: | mkdir build cd build - cmake .. + cmake .. -DLLAMA_FATAL_WARNINGS=ON cmake --build . --config Release -j $(nproc) - name: Test id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 - ubuntu-latest-cmake-sanitizer: - runs-on: ubuntu-latest - - continue-on-error: true - - strategy: - matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug, Release] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - - - name: Test - id: cmake_test + - name: Test llama2c conversion + id: llama2c_test run: | cd build - ctest --verbose --timeout 900 + echo "Fetch tokenizer" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin + echo "Fetch llama2c model" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin + ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf + ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 + +# ubuntu-latest-cmake-sanitizer: +# runs-on: ubuntu-latest +# +# continue-on-error: true +# +# strategy: +# matrix: +# sanitizer: [ADDRESS, THREAD, UNDEFINED] +# build_type: [Debug, Release] +# +# steps: +# - name: Clone +# id: checkout +# uses: actions/checkout@v4 +# +# - name: Dependencies +# id: depends +# run: | +# sudo apt-get update +# sudo apt-get install build-essential +# +# - name: Build +# id: cmake_build +# run: | +# mkdir build +# cd build +# cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} +# cmake --build . --config ${{ matrix.build_type }} -j $(nproc) +# +# - name: Test +# id: cmake_test +# run: | +# cd build +# ctest -L main --verbose --timeout 900 ubuntu-latest-cmake-mpi: runs-on: ubuntu-latest @@ -121,7 +283,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -141,7 +303,111 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest -L main --verbose + + ubuntu-22-cmake-vulkan: + runs-on: ubuntu-22.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential libvulkan-dev + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake -DLLAMA_VULKAN=ON .. + cmake --build . --config Release -j $(nproc) + + ubuntu-22-cmake-sycl: + runs-on: ubuntu-22.04 + + continue-on-error: true + + steps: + - uses: actions/checkout@v2 + + - name: add oneAPI to apt + shell: bash + run: | + cd /tmp + wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" + + - name: install oneAPI dpcpp compiler + shell: bash + run: | + sudo apt update + sudo apt install intel-oneapi-compiler-dpcpp-cpp + + - name: install oneAPI MKL library + shell: bash + run: | + sudo apt install intel-oneapi-mkl-devel + + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Build + id: cmake_build + run: | + source /opt/intel/oneapi/setvars.sh + mkdir build + cd build + cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx .. + cmake --build . --config Release -j $(nproc) + + ubuntu-22-cmake-sycl-fp16: + runs-on: ubuntu-22.04 + + continue-on-error: true + + steps: + - uses: actions/checkout@v2 + + - name: add oneAPI to apt + shell: bash + run: | + cd /tmp + wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB + sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" + + - name: install oneAPI dpcpp compiler + shell: bash + run: | + sudo apt update + sudo apt install intel-oneapi-compiler-dpcpp-cpp + + - name: install oneAPI MKL library + shell: bash + run: | + sudo apt install intel-oneapi-mkl-devel + + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Build + id: cmake_build + run: | + source /opt/intel/oneapi/setvars.sh + mkdir build + cd build + cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON .. + cmake --build . --config Release -j $(nproc) # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. @@ -152,7 +418,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -162,6 +428,8 @@ jobs: - name: Build id: make_build + env: + LLAMA_FATAL_WARNINGS: 1 run: | LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu) @@ -181,7 +449,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -195,14 +463,14 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_METAL=OFF .. + cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 macOS-latest-cmake-ios: runs-on: macos-latest @@ -225,6 +493,7 @@ jobs: mkdir build cd build cmake -G Xcode .. \ + -DLLAMA_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ @@ -253,6 +522,7 @@ jobs: mkdir build cd build cmake -G Xcode .. \ + -DLLAMA_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_SERVER=OFF \ @@ -295,7 +565,8 @@ jobs: OPENBLAS_VERSION: 0.3.23 OPENCL_VERSION: 2023.04.17 CLBLAST_VERSION: 1.6.0 - SDE_VERSION: 9.21.1-2023-04-24 + SDE_VERSION: 9.33.0-2024-01-07 + VULKAN_VERSION: 1.3.261.1 strategy: matrix: @@ -312,14 +583,26 @@ jobs: defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' - build: 'openblas' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' + - build: 'kompute' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' + - build: 'vulkan' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON' + - build: 'arm64' + defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Clone Kompute submodule + id: clone_kompute + if: ${{ matrix.build == 'kompute' }} + run: | + git submodule update --init kompute + - name: Download OpenCL SDK id: get_opencl if: ${{ matrix.build == 'clblast' }} @@ -354,6 +637,15 @@ jobs: $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll + - name: Install Vulkan SDK + id: get_vulkan + if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }} + run: | + curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" + & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install + Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" + Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" + - name: Build id: cmake_build run: | @@ -391,22 +683,23 @@ jobs: - name: Test id: cmake_test - if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512 + # not all machines have native AVX-512 + if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} run: | cd build - ctest -C Release --verbose --timeout 900 + ctest -L main -C Release --verbose --timeout 900 - name: Test (Intel SDE) id: cmake_test_sde if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation run: | - curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz" + curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz" # for some weird reason windows tar doesn't like sde tar.xz 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) cd build - & $sde -future -- ctest -C Release --verbose --timeout 900 + & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - name: Determine tag name id: tag @@ -430,23 +723,23 @@ jobs: - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - path: | - llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip + path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip + name: llama-bin-win-${{ matrix.build }}-x64.zip - windows-latest-cmake-cublas: + windows-latest-cmake-cuda: runs-on: windows-latest strategy: matrix: cuda: ['12.2.0', '11.7.1'] - build: ['cublas'] + build: ['cuda'] steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -462,7 +755,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON + cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} - name: Determine tag name @@ -486,10 +779,10 @@ jobs: - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - path: | - llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip + path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip + name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip - name: Copy and pack Cuda runtime run: | @@ -500,26 +793,101 @@ jobs: - name: Upload Cuda runtime if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - path: | - cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip + path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip + name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip + + windows-latest-cmake-sycl: + runs-on: windows-latest + + defaults: + run: + shell: bash + + env: + WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe + WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install + run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + + - name: Build + id: cmake_build + run: examples/sycl/win-build-sycl.bat + + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip + name: llama-bin-win-sycl-x64.zip ios-xcode-build: runs-on: macos-latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build Xcode project run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build + android-build: + runs-on: ubuntu-latest + + steps: + - name: Clone + uses: actions/checkout@v4 + + - name: Set up JDK + uses: actions/setup-java@v3 + with: + java-version: 17 + distribution: zulu + + - name: Setup Android SDK + uses: android-actions/setup-android@v3 + with: + log-accepted-android-sdk-licenses: false + + - name: Build + run: | + cd examples/llama.android + + ./gradlew build --no-daemon + # freeBSD-latest: # runs-on: macos-12 # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Build # uses: cross-platform-actions/action@v0.19.0 @@ -543,12 +911,14 @@ jobs: - macOS-latest-make - macOS-latest-cmake - windows-latest-cmake - - windows-latest-cmake-cublas + - windows-latest-cmake-cuda + - macOS-latest-cmake-arm64 + - macOS-latest-cmake-x64 steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -567,7 +937,7 @@ jobs: - name: Download artifacts id: download-artifact - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 - name: Create release id: create_release @@ -608,7 +978,7 @@ jobs: # # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Dependencies # run: | @@ -632,7 +1002,7 @@ jobs: # # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Dependencies # run: | @@ -656,7 +1026,7 @@ jobs: # # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Dependencies # run: | @@ -686,7 +1056,7 @@ jobs: # # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Add msbuild to PATH # uses: microsoft/setup-msbuild@v1 @@ -702,7 +1072,7 @@ jobs: # msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }} # # - name: Upload binaries -# uses: actions/upload-artifact@v1 +# uses: actions/upload-artifact@v4 # with: # name: llama-bin-${{ matrix.arch }} # path: build/bin/${{ matrix.build }} @@ -725,7 +1095,7 @@ jobs: # # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Add msbuild to PATH # uses: microsoft/setup-msbuild@v1 @@ -757,7 +1127,7 @@ jobs: # # - name: Upload binaries # if: matrix.blas == 'ON' -# uses: actions/upload-artifact@v1 +# uses: actions/upload-artifact@v4 # with: # name: llama-blas-bin-${{ matrix.arch }} # path: build/bin/${{ matrix.build }} @@ -771,7 +1141,7 @@ jobs: # # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Dependencies # run: | diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml new file mode 100644 index 000000000..7f21daec0 --- /dev/null +++ b/.github/workflows/close-issue.yml @@ -0,0 +1,23 @@ +name: Close inactive issues +on: + schedule: + - cron: "42 0 * * *" + +jobs: + close-issues: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v5 + with: + exempt-issue-labels: "refactor,help wanted,good first issue,research" + days-before-issue-stale: 30 + days-before-issue-close: 14 + stale-issue-label: "stale" + close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." + days-before-pr-stale: -1 + days-before-pr-close: -1 + operations-per-run: 10000 + repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index 392db8a08..f12c558f8 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -5,12 +5,16 @@ env: GGML_NLOOP: 3 GGML_N_THREADS: 1 +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: run: runs-on: ubuntu-20.04 steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies run: | diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 87904b75e..eefd87878 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -15,6 +15,10 @@ on: branches: - master +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: push_to_registry: name: Push Docker image to Docker Hub @@ -28,16 +32,21 @@ jobs: config: - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" } # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I # have disabled them for now until the reason why # is understood. - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" } - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } + - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" } + - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" } steps: - name: Check out the repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up QEMU uses: docker/setup-qemu-action@v2 diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml index b4e535acf..ae86e9927 100644 --- a/.github/workflows/editorconfig.yml +++ b/.github/workflows/editorconfig.yml @@ -1,6 +1,12 @@ name: EditorConfig Checker on: + workflow_dispatch: # allows manual triggering + inputs: + create_release: + description: 'Create new release' + required: true + type: boolean push: branches: - master @@ -8,10 +14,14 @@ on: branches: - master +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: editorconfig: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: editorconfig-checker/action-editorconfig-checker@main - run: editorconfig-checker diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml index 57db17512..3ca4d3058 100644 --- a/.github/workflows/gguf-publish.yml +++ b/.github/workflows/gguf-publish.yml @@ -24,9 +24,9 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: '3.9.x' - name: Install dependencies diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml new file mode 100644 index 000000000..4aa4b2379 --- /dev/null +++ b/.github/workflows/nix-ci-aarch64.yml @@ -0,0 +1,65 @@ +name: Nix aarch64 builds + +on: + workflow_dispatch: # allows manual triggering + schedule: + # Rebuild daily rather than on every push because QEMU is expensive (e.g. + # 1.5h instead of minutes with the cold cache). + # + # randint(0, 59), randint(0, 23) + - cron: '26 12 * * *' + # But also rebuild if we touched any of the Nix expressions: + push: + branches: + - master + paths: ['**/*.nix', 'flake.lock'] + pull_request: + types: [opened, synchronize, reopened] + paths: ['**/*.nix', 'flake.lock'] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +jobs: + nix-build-aarch64: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Install QEMU + # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654 + run: | + sudo apt-get update + sudo apt-get install -y qemu-user-static qemu-system-aarch64 + sudo usermod -a -G kvm $USER + - name: Install Nix + uses: DeterminateSystems/nix-installer-action@v9 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + extra-conf: | + extra-platforms = aarch64-linux + extra-system-features = nixos-test kvm + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + - uses: DeterminateSystems/magic-nix-cache-action@v2 + with: + upstream-cache: https://${{ matrix.cachixName }}.cachix.org + - name: Set-up cachix to push the results to + uses: cachix/cachix-action@v13 + with: + authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' + name: llama-cpp + - name: Show all output paths + run: > + nix run github:nix-community/nix-eval-jobs + -- --gc-roots-dir gcroot + --flake + ".#packages.aarch64-linux" + - name: Build + run: > + nix run github:Mic92/nix-fast-build + -- --skip-cached --no-nom + --systems aarch64-linux + --flake + ".#checks.aarch64-linux" diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml index a38c6ead4..8955f38d0 100644 --- a/.github/workflows/nix-ci.yml +++ b/.github/workflows/nix-ci.yml @@ -5,10 +5,12 @@ on: push: branches: - master - paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] pull_request: types: [opened, synchronize, reopened] - paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix'] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true jobs: nix-eval: @@ -25,8 +27,8 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} extra-conf: | - extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -39,7 +41,6 @@ jobs: --flake ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)" nix-build: - if: ${{ vars.CACHIX_NAME != '' }} strategy: fail-fast: false matrix: @@ -53,8 +54,8 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} extra-conf: | - extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= + extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org + extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - uses: DeterminateSystems/magic-nix-cache-action@v2 with: upstream-cache: https://${{ matrix.cachixName }}.cachix.org @@ -62,51 +63,10 @@ jobs: uses: cachix/cachix-action@v13 with: authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: ${{ vars.CACHIX_NAME }} + name: llama-cpp - name: Build run: > nix run github:Mic92/nix-fast-build -- --skip-cached --no-nom --flake ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)" - nix-build-aarch64: - if: ${{ vars.CACHIX_NAME != '' }} - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Install QEMU - # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654 - run: | - sudo apt-get install -y qemu-user-static qemu-system-aarch64 - sudo usermod -a -G kvm $USER - - name: Install Nix - uses: DeterminateSystems/nix-installer-action@v9 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - extra-conf: | - extra-platforms = aarch64-linux - extra-system-features = nixos-test kvm - extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org - extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E= - - uses: DeterminateSystems/magic-nix-cache-action@v2 - with: - upstream-cache: https://${{ matrix.cachixName }}.cachix.org - - name: Set-up cachix to push the results to - uses: cachix/cachix-action@v13 - with: - authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}' - name: ${{ vars.CACHIX_NAME }} - - name: Show all output paths - run: > - nix run github:nix-community/nix-eval-jobs - -- --gc-roots-dir gcroot - --flake - ".#packages.aarch64-linux" - - name: Build - run: > - nix run github:Mic92/nix-fast-build - -- --skip-cached --no-nom - --systems aarch64-linux - --flake - ".#checks.aarch64-linux" diff --git a/.github/workflows/nix-flake-update.yml b/.github/workflows/nix-flake-update.yml index fa9360841..3a6a96e26 100644 --- a/.github/workflows/nix-flake-update.yml +++ b/.github/workflows/nix-flake-update.yml @@ -19,4 +19,4 @@ jobs: pr-labels: | nix pr-reviewers: philiptaron,SomeoneSerge - token: ${{ secrets.GITHUB_TOKEN }} + token: ${{ secrets.FLAKE_TOKEN }} diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index 92e1108b3..4e0374fc6 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -3,27 +3,33 @@ name: Python check requirements.txt on: push: paths: + - '.github/workflows/python-check-requirements.yml' - 'scripts/check-requirements.sh' - 'convert*.py' - 'requirements.txt' - 'requirements/*.txt' pull_request: paths: + - '.github/workflows/python-check-requirements.yml' - 'scripts/check-requirements.sh' - 'convert*.py' - 'requirements.txt' - 'requirements/*.txt' +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: python-check-requirements: runs-on: ubuntu-latest name: check-requirements steps: - name: Check out source repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python environment - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.11" - name: Run check-requirements.sh script - run: bash scripts/check-requirements.sh nocleanup + run: bash scripts/check-requirements.sh diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index 56d17b66c..f4ae65495 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -2,19 +2,23 @@ name: flake8 Lint on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: flake8-lint: runs-on: ubuntu-latest name: Lint steps: - name: Check out source repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python environment - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.11" - name: flake8 Lint uses: py-actions/flake8@v2 with: - ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704" + ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503" exclude: "examples/*,examples/*/**,*/**/__init__.py" diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml new file mode 100644 index 000000000..521cc29ae --- /dev/null +++ b/.github/workflows/server.yml @@ -0,0 +1,178 @@ +# Server build and tests +name: Server + +on: + workflow_dispatch: # allows manual triggering + inputs: + sha: + description: 'Commit SHA1 to build' + required: false + type: string + slow_tests: + description: 'Run slow tests' + required: true + type: boolean + push: + branches: + - master + paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] + pull_request_target: + types: [opened, synchronize, reopened] + paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] + schedule: + - cron: '2 4 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +jobs: + server: + runs-on: ubuntu-latest + + strategy: + matrix: + # TODO: temporary disabled due to linux kernel issues + #sanitizer: [ADDRESS, THREAD, UNDEFINED] + sanitizer: [UNDEFINED] + build_type: [Debug] + include: + - build_type: Release + sanitizer: "" + fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken + + container: + image: ubuntu:latest + ports: + - 8888 + options: --cpus 4 + + steps: + - name: Dependencies + id: depends + run: | + apt-get update + apt-get -y install \ + build-essential \ + xxd \ + git \ + cmake \ + python3-pip \ + curl \ + wget \ + language-pack-en \ + libcurl4-openssl-dev + + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Verify server deps + id: verify_server_deps + run: | + git config --global --add safe.directory $(realpath .) + cd examples/server + git ls-files --others --modified + git status + ./deps.sh + git status + not_ignored_files="$(git ls-files --others --modified)" + echo "Modified files: ${not_ignored_files}" + if [ -n "${not_ignored_files}" ]; then + echo "Repository is dirty or server deps are not built as expected" + echo "${not_ignored_files}" + exit 1 + fi + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake .. \ + -DLLAMA_NATIVE=OFF \ + -DLLAMA_BUILD_SERVER=ON \ + -DLLAMA_CURL=ON \ + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; + cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server + + - name: Tests dependencies + id: test_dependencies + run: | + pip install -r examples/server/tests/requirements.txt + + - name: Tests + id: server_integration_tests + if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} + run: | + cd examples/server/tests + PORT=8888 ./tests.sh + + - name: Slow tests + id: server_integration_tests_slow + if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} + run: | + cd examples/server/tests + PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow + + + server-windows: + runs-on: windows-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: libCURL + id: get_libcurl + env: + CURL_VERSION: 8.6.0_6 + run: | + curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip" + mkdir $env:RUNNER_TEMP/libcurl + tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" + cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server + + - name: Python setup + id: setup_python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Tests dependencies + id: test_dependencies + run: | + pip install -r examples/server/tests/requirements.txt + + - name: Copy Libcurl + id: prepare_libcurl + run: | + cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll + + - name: Tests + id: server_integration_tests + if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} + run: | + cd examples/server/tests + behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp + + - name: Slow tests + id: server_integration_tests_slow + if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} + run: | + cd examples/server/tests + behave.exe --stop --no-skipped --no-capture --tags slow diff --git a/.github/workflows/tidy-post.yml b/.github/workflows/tidy-post.yml deleted file mode 100644 index 03652760c..000000000 --- a/.github/workflows/tidy-post.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: clang-tidy review post comments - -on: - workflow_dispatch: - workflows: ["clang-tidy-review"] - types: - - completed - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: ZedThree/clang-tidy-review/post@v0.13.0 - # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup - with: - # adjust options as necessary - lgtm_comment_body: '' - annotations: false - max_comments: 25 diff --git a/.github/workflows/tidy-review.yml b/.github/workflows/tidy-review.yml deleted file mode 100644 index a4bc8d976..000000000 --- a/.github/workflows/tidy-review.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: clang-tidy-review - -on: - pull_request: - branches: - - master - -jobs: - clang-tidy-review: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - - uses: ZedThree/clang-tidy-review@v0.13.0 - id: review - with: - lgtm_comment_body: '' - build_dir: build - cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on - split_workflow: true - - - uses: ZedThree/clang-tidy-review/upload@v0.13.0 diff --git a/.github/workflows/zig-build.yml b/.github/workflows/zig-build.yml index 68a698ab9..747c35cc0 100644 --- a/.github/workflows/zig-build.yml +++ b/.github/workflows/zig-build.yml @@ -6,6 +6,10 @@ on: branches: - master +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: build: strategy: @@ -14,7 +18,7 @@ jobs: runs-on: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.runs-on }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: recursive fetch-depth: 0 diff --git a/.gitignore b/.gitignore index cf1b692e9..9fb5b80c3 100644 --- a/.gitignore +++ b/.gitignore @@ -11,7 +11,10 @@ *.gcda *.dot *.bat +*.tmp *.metallib +*.etag +*.lastModified .DS_Store .build/ .cache/ @@ -23,11 +26,15 @@ .clang-tidy .vs/ .vscode/ +.idea/ + +ggml-metal-embed.metal lcov-report/ gcovr-report/ -build*/ +build* +cmake-build-* out/ tmp/ @@ -43,12 +50,18 @@ models-mnt /embedding /gguf /gguf-llama-simple +/gguf-split +/gritlm +/imatrix /infill /libllama.so /llama-bench /llava-cli /lookahead /lookup +/lookup-create +/lookup-merge +/lookup-stats /main /metal /passkey @@ -64,6 +77,7 @@ models-mnt /batched-bench /export-lora /finetune +/retrieval /speculative /parallel /train-text-from-scratch @@ -88,19 +102,4 @@ examples/jeopardy/results.txt poetry.lock poetry.toml - -# Test binaries -/tests/test-grammar-parser -/tests/test-llama-grammar -/tests/test-double-float -/tests/test-grad0 -/tests/test-opt -/tests/test-quantize-fns -/tests/test-quantize-perf -/tests/test-sampling -/tests/test-tokenizer-0-llama -/tests/test-tokenizer-0-falcon -/tests/test-tokenizer-1-llama -/tests/test-tokenizer-1-bpe -/tests/test-rope -/tests/test-backend-ops +nppBackup diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..b7e8b8ff2 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "kompute"] + path = kompute + url = https://github.com/nomic-ai/kompute.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 668669c6d..19fdfa46c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,6 @@ -cmake_minimum_required(VERSION 3.13) # for add_link_options +cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. project("llama.cpp" C CXX) +include(CheckIncludeFileCXX) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -47,12 +48,16 @@ option(BUILD_SHARED_LIBS "build shared libraries" option(LLAMA_STATIC "llama: static link libraries" OFF) option(LLAMA_NATIVE "llama: enable -march=native flag" ON) option(LLAMA_LTO "llama: enable link time optimization" OFF) +option(LLAMA_CCACHE "llama: use ccache if available" ON) # debug option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) option(LLAMA_GPROF "llama: enable gprof" OFF) +# build +option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF) + # sanitizers option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) @@ -76,12 +81,16 @@ if (NOT MSVC) option(LLAMA_F16C "llama: enable F16C" ${INS_ENB}) endif() +if (WIN32) + set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version") +endif() + # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_BLAS "llama: use BLAS" OFF) set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") -option(LLAMA_CUBLAS "llama: use CUDA" OFF) -#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) +option(LLAMA_CUDA "llama: use CUDA" OFF) +option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") @@ -90,19 +99,39 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. batch size for using peer access") +option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF) +option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF) +option(LLAMA_VULKAN "llama: use Vulkan" OFF) +option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF) +option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF) +option(LLAMA_VULKAN_VALIDATE "llama: enable Vulkan validation" OFF) +option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF) +option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF) +set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING + "llama: metal minimum macOS version") +set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)") +option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) +option(LLAMA_SYCL "llama: use SYCL" OFF) +option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) +set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device") +option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF) +set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism") option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ON) +# add perf arguments +option(LLAMA_PERF "llama: enable perf" OFF) + # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) @@ -110,14 +139,22 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) # Compile flags # -set(CMAKE_CXX_STANDARD 11) +if (LLAMA_SYCL) + set(CMAKE_CXX_STANDARD 17) +else() + set(CMAKE_CXX_STANDARD 11) +endif() + set(CMAKE_CXX_STANDARD_REQUIRED true) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED true) set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) include(CheckCXXCompilerFlag) +add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES}) + # enable libstdc++ assertions for debug builds if (CMAKE_SYSTEM_NAME MATCHES "Linux") add_compile_definitions($<$:_GLIBCXX_ASSERTIONS>) @@ -126,17 +163,17 @@ endif() if (NOT MSVC) if (LLAMA_SANITIZE_THREAD) add_compile_options(-fsanitize=thread) - link_libraries(-fsanitize=thread) + link_libraries (-fsanitize=thread) endif() if (LLAMA_SANITIZE_ADDRESS) add_compile_options(-fsanitize=address -fno-omit-frame-pointer) - link_libraries(-fsanitize=address) + link_libraries (-fsanitize=address) endif() if (LLAMA_SANITIZE_UNDEFINED) add_compile_options(-fsanitize=undefined) - link_libraries(-fsanitize=undefined) + link_libraries (-fsanitize=undefined) endif() endif() @@ -168,40 +205,80 @@ if (LLAMA_METAL) add_compile_definitions(GGML_METAL_NDEBUG) endif() - # get full path to the file - #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") - - # copy ggml-metal.metal to bin directory + # copy ggml-common.h and ggml-metal.metal to bin directory + configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) - if (LLAMA_METAL_SHADER_DEBUG) - # custom command to do the following: - # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air - # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib - # - # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works - # disabling fast math is needed in order to pass tests/test-backend-ops - # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 - # note: unfortunately, we have to call it default.metallib instead of ggml.metallib - # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 - set(XC_FLAGS -fno-fast-math -fno-inline -g) - if (LLAMA_QKK_64) - set(XC_FLAGS ${XC_FLAGS} -DQK_K=64) + if (LLAMA_METAL_EMBED_LIBRARY) + enable_language(ASM) + add_compile_definitions(GGML_METAL_EMBED_LIBRARY) + + set(METALLIB_COMMON "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h") + set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") + + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") + + # merge ggml-common.h and ggml-metal.metal into a single file + set(METALLIB_EMBED_ASM "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.s") + set(METALLIB_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-embed.metal") + + add_custom_command( + OUTPUT ${METALLIB_EMBED_ASM} + COMMAND echo "Embedding Metal library" + COMMAND sed -e '/\#include \"ggml-common.h\"/r ${METALLIB_COMMON}' -e '/\#include \"ggml-common.h\"/d' < ${METALLIB_SOURCE} > ${METALLIB_SOURCE_EMBED} + COMMAND echo ".section __DATA,__ggml_metallib" > ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_start" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_start:" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".incbin \\\"${METALLIB_SOURCE_EMBED}\\\"" >> ${METALLIB_EMBED_ASM} + COMMAND echo ".globl _ggml_metallib_end" >> ${METALLIB_EMBED_ASM} + COMMAND echo "_ggml_metallib_end:" >> ${METALLIB_EMBED_ASM} + DEPENDS ggml-metal.metal ggml-common.h + COMMENT "Generate assembly for embedded Metal library" + ) + + set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM}) + else() + if (LLAMA_METAL_SHADER_DEBUG) + # custom command to do the following: + # xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air + # xcrun -sdk macosx metallib ggml-metal.air -o default.metallib + # + # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works + # disabling fast math is needed in order to pass tests/test-backend-ops + # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1 + # note: unfortunately, we have to call it default.metallib instead of ggml.metallib + # ref: https://github.com/ggerganov/whisper.cpp/issues/1720 + set(XC_FLAGS -fno-fast-math -fno-inline -g) + else() + set(XC_FLAGS -O3) + endif() + + # Append macOS metal versioning flags + if (LLAMA_METAL_MACOSX_VERSION_MIN) + message(STATUS "Adding -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN} flag to metal compilation") + list(APPEND XC_FLAGS -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN}) + endif() + if (LLAMA_METAL_STD) + message(STATUS "Adding -std=${LLAMA_METAL_STD} flag to metal compilation") + list(APPEND XC_FLAGS -std=${LLAMA_METAL_STD}) endif() add_custom_command( OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - DEPENDS ggml-metal.metal + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h + COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal + DEPENDS ggml-metal.metal ggml-common.h COMMENT "Compiling Metal kernels" - ) + ) add_custom_target( ggml-metal ALL DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib - ) - endif() + ) + endif() # LLAMA_METAL_EMBED_LIBRARY set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${FOUNDATION_LIBRARY} @@ -273,14 +350,17 @@ if (LLAMA_BLAS) endif() message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") + add_compile_options(${BLAS_LINKER_FLAGS}) + add_compile_definitions(GGML_USE_OPENBLAS) + if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) add_compile_definitions(GGML_BLAS_USE_MKL) endif() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) - set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) + set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) else() message(WARNING "BLAS not found, please refer to " "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" @@ -293,21 +373,25 @@ if (LLAMA_QKK_64) endif() if (LLAMA_CUBLAS) + message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead") + set(LLAMA_CUDA ON) +endif() + +if (LLAMA_CUDA) cmake_minimum_required(VERSION 3.17) find_package(CUDAToolkit) if (CUDAToolkit_FOUND) - message(STATUS "cuBLAS found") + message(STATUS "CUDA found") enable_language(CUDA) set(GGML_HEADERS_CUDA ggml-cuda.h) - set(GGML_SOURCES_CUDA ggml-cuda.cu) - add_compile_definitions(GGML_USE_CUBLAS) -# if (LLAMA_CUDA_CUBLAS) -# add_compile_definitions(GGML_CUDA_CUBLAS) -# endif() + file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu") + list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu") + + add_compile_definitions(GGML_USE_CUDA) if (LLAMA_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV) endif() @@ -324,6 +408,9 @@ if (LLAMA_CUBLAS) endif() add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE}) + if (LLAMA_CUDA_NO_PEER_COPY) + add_compile_definitions(GGML_CUDA_NO_PEER_COPY) + endif() if (LLAMA_STATIC) if (WIN32) @@ -353,7 +440,7 @@ if (LLAMA_CUBLAS) message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") else() - message(WARNING "cuBLAS not found") + message(WARNING "CUDA not found") endif() endif() @@ -362,15 +449,20 @@ if (LLAMA_MPI) find_package(MPI) if (MPI_C_FOUND) message(STATUS "MPI found") + set(GGML_HEADERS_MPI ggml-mpi.h) - set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h) + set(GGML_SOURCES_MPI ggml-mpi.c) + add_compile_definitions(GGML_USE_MPI) add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS}) + if (NOT MSVC) add_compile_options(-Wno-cast-qual) endif() + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS}) + # Even if you're only using the C header, C++ programs may bring in MPI # C++ functions, so more linkage is needed if (MPI_CXX_FOUND) @@ -397,49 +489,301 @@ if (LLAMA_CLBLAST) endif() endif() +if (LLAMA_VULKAN) + find_package(Vulkan) + if (Vulkan_FOUND) + message(STATUS "Vulkan found") + + set(GGML_HEADERS_VULKAN ggml-vulkan.h) + set(GGML_SOURCES_VULKAN ggml-vulkan.cpp) + + add_compile_definitions(GGML_USE_VULKAN) + + if (LLAMA_VULKAN_CHECK_RESULTS) + add_compile_definitions(GGML_VULKAN_CHECK_RESULTS) + endif() + + if (LLAMA_VULKAN_DEBUG) + add_compile_definitions(GGML_VULKAN_DEBUG) + endif() + + if (LLAMA_VULKAN_VALIDATE) + add_compile_definitions(GGML_VULKAN_VALIDATE) + endif() + + if (LLAMA_VULKAN_RUN_TESTS) + add_compile_definitions(GGML_VULKAN_RUN_TESTS) + endif() + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan) + else() + message(WARNING "Vulkan not found") + endif() +endif() + if (LLAMA_HIPBLAS) list(APPEND CMAKE_PREFIX_PATH /opt/rocm) if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang") endif() + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") endif() - find_package(hip) - find_package(hipblas) - find_package(rocblas) + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) - if (${hipblas_FOUND} AND ${hip_FOUND}) - message(STATUS "HIP and hipBLAS found") - add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) - if (LLAMA_HIP_UMA) - add_compile_definitions(GGML_HIP_UMA) - endif() - add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h) - if (BUILD_SHARED_LIBS) - set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON) - endif() - if (LLAMA_CUDA_FORCE_DMMV) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV) - endif() - if (LLAMA_CUDA_FORCE_MMQ) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ) - endif() - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) - target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) - target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) - target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas) + message(STATUS "HIP and hipBLAS found") - if (LLAMA_STATIC) - message(FATAL_ERROR "Static linking not supported for HIP/ROCm") - endif() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm) - else() - message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm") + set(GGML_HEADERS_ROCM ggml-cuda.h) + + file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu") + list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu") + + add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA) + + if (LLAMA_HIP_UMA) + add_compile_definitions(GGML_HIP_UMA) endif() + + if (LLAMA_CUDA_FORCE_DMMV) + add_compile_definitions(GGML_CUDA_FORCE_DMMV) + endif() + + if (LLAMA_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) + endif() + + if (LLAMA_CUDA_NO_PEER_COPY) + add_compile_definitions(GGML_CUDA_NO_PEER_COPY) + endif() + + add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) + add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) + add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) + + set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) + + if (LLAMA_STATIC) + message(FATAL_ERROR "Static linking not supported for HIP/ROCm") + endif() + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas) +endif() + +if (LLAMA_SYCL) + if (NOT LLAMA_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$") + message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA") + endif() + + if ( NOT DEFINED ENV{ONEAPI_ROOT}) + message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") + endif() + #todo: AOT + + find_package(IntelSYCL REQUIRED) + + message(STATUS "SYCL found") + + add_compile_definitions(GGML_USE_SYCL) + + if (LLAMA_SYCL_F16) + add_compile_definitions(GGML_SYCL_F16) + endif() + + add_compile_options(-I./) #include DPCT + add_compile_options(-I/${SYCL_INCLUDE_DIR}) + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib") + if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") + endif() + + set(GGML_HEADERS_SYCL ggml-sycl.h) + set(GGML_SOURCES_SYCL ggml-sycl.cpp) + + if (WIN32) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib) + else() + if (LLAMA_SYCL_TARGET STREQUAL "INTEL") + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) + elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA") + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl pthread m dl onemkl) + endif() + endif() +endif() + +if (LLAMA_KOMPUTE) + add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) + find_package(Vulkan COMPONENTS glslc REQUIRED) + find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc) + if (NOT glslc_executable) + message(FATAL_ERROR "glslc not found") + endif() + + function(compile_shader) + set(options) + set(oneValueArgs) + set(multiValueArgs SOURCES) + cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + foreach(source ${compile_shader_SOURCES}) + get_filename_component(filename ${source} NAME) + set(spv_file ${filename}.spv) + add_custom_command( + OUTPUT ${spv_file} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp + COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source} + COMMENT "Compiling ${source} to ${spv_file}" + ) + + get_filename_component(RAW_FILE_NAME ${spv_file} NAME) + set(FILE_NAME "shader${RAW_FILE_NAME}") + string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME}) + string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE) + string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") + set(OUTPUT_HEADER_FILE "${HEADER_FILE}") + message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") + if(CMAKE_GENERATOR MATCHES "Visual Studio") + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/$/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$/xxd" + ) + else() + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd" + ) + endif() + endforeach() + endfunction() + + if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") + message(STATUS "Kompute found") + set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level") + add_subdirectory(kompute) + + # Compile our shaders + compile_shader(SOURCES + kompute-shaders/op_scale.comp + kompute-shaders/op_scale_8.comp + kompute-shaders/op_add.comp + kompute-shaders/op_addrow.comp + kompute-shaders/op_mul.comp + kompute-shaders/op_silu.comp + kompute-shaders/op_relu.comp + kompute-shaders/op_gelu.comp + kompute-shaders/op_softmax.comp + kompute-shaders/op_norm.comp + kompute-shaders/op_rmsnorm.comp + kompute-shaders/op_diagmask.comp + kompute-shaders/op_mul_mat_mat_f32.comp + kompute-shaders/op_mul_mat_f16.comp + kompute-shaders/op_mul_mat_q8_0.comp + kompute-shaders/op_mul_mat_q4_0.comp + kompute-shaders/op_mul_mat_q4_1.comp + kompute-shaders/op_mul_mat_q6_k.comp + kompute-shaders/op_getrows_f16.comp + kompute-shaders/op_getrows_q4_0.comp + kompute-shaders/op_getrows_q4_1.comp + kompute-shaders/op_getrows_q6_k.comp + kompute-shaders/op_rope_f16.comp + kompute-shaders/op_rope_f32.comp + kompute-shaders/op_cpy_f16_f16.comp + kompute-shaders/op_cpy_f16_f32.comp + kompute-shaders/op_cpy_f32_f16.comp + kompute-shaders/op_cpy_f32_f32.comp + ) + + # Create a custom target for our generated shaders + add_custom_target(generated_shaders DEPENDS + shaderop_scale.h + shaderop_scale_8.h + shaderop_add.h + shaderop_addrow.h + shaderop_mul.h + shaderop_silu.h + shaderop_relu.h + shaderop_gelu.h + shaderop_softmax.h + shaderop_norm.h + shaderop_rmsnorm.h + shaderop_diagmask.h + shaderop_mul_mat_mat_f32.h + shaderop_mul_mat_f16.h + shaderop_mul_mat_q8_0.h + shaderop_mul_mat_q4_0.h + shaderop_mul_mat_q4_1.h + shaderop_mul_mat_q6_k.h + shaderop_getrows_f16.h + shaderop_getrows_q4_0.h + shaderop_getrows_q4_1.h + shaderop_getrows_q6_k.h + shaderop_rope_f16.h + shaderop_rope_f32.h + shaderop_cpy_f16_f16.h + shaderop_cpy_f16_f32.h + shaderop_cpy_f32_f16.h + shaderop_cpy_f32_f32.h + ) + + # Create a custom command that depends on the generated_shaders + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp + COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp + DEPENDS generated_shaders + COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp" + ) + + # Add the stamp to the main sources to ensure dependency tracking + set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) + set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) + + add_compile_definitions(GGML_USE_KOMPUTE) + + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute) + set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR}) + else() + message(WARNING "Kompute not found") + endif() +endif() + +if (LLAMA_CPU_HBM) + find_library(memkind memkind REQUIRED) + + add_compile_definitions(GGML_USE_CPU_HBM) + + target_link_libraries(ggml PUBLIC memkind) +endif() + +if (LLAMA_PERF) + add_compile_definitions(GGML_PERF) endif() function(get_flags CCID CCVER) @@ -454,17 +798,17 @@ function(get_flags CCID CCVER) (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) ) - set(C_FLAGS ${C_FLAGS} -Wdouble-promotion) + list(APPEND C_FLAGS -Wdouble-promotion) endif() elseif (CCID STREQUAL "GNU") set(C_FLAGS -Wdouble-promotion) set(CXX_FLAGS -Wno-array-bounds) if (CCVER VERSION_GREATER_EQUAL 7.1.0) - set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation) + list(APPEND CXX_FLAGS -Wno-format-truncation) endif() if (CCVER VERSION_GREATER_EQUAL 8.1.0) - set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi) + list(APPEND CXX_FLAGS -Wextra-semi) endif() endif() @@ -472,15 +816,24 @@ function(get_flags CCID CCVER) set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) endfunction() +if (LLAMA_FATAL_WARNINGS) + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND C_FLAGS -Werror) + list(APPEND CXX_FLAGS -Werror) + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + add_compile_options(/WX) + endif() +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) - set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) - set(C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes - -Werror=implicit-int -Werror=implicit-function-declaration) - set(CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) + list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) + list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes + -Werror=implicit-int -Werror=implicit-function-declaration) + list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) - set(C_FLAGS ${WARNING_FLAGS} ${C_FLAGS}) - set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS}) + list(APPEND C_FLAGS ${WARNING_FLAGS}) + list(APPEND CXX_FLAGS ${WARNING_FLAGS}) get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) @@ -493,16 +846,19 @@ if (LLAMA_ALL_WARNINGS) endif() endif() -if (LLAMA_CUBLAS) - set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math) - if (NOT MSVC) - set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic) +set(CUDA_CXX_FLAGS "") + +if (LLAMA_CUDA) + set(CUDA_FLAGS -use_fast_math) + + if (LLAMA_FATAL_WARNINGS) + list(APPEND CUDA_FLAGS -Werror all-warnings) endif() if (LLAMA_ALL_WARNINGS AND NOT MSVC) set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c) if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "") - set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER}) + list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER}) endif() execute_process( @@ -530,13 +886,12 @@ if (LLAMA_CUBLAS) message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}") get_flags(${CUDA_CCID} ${CUDA_CCVER}) - list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS) # pass host compiler flags as a single argument - if (NOT CUDA_CXX_FLAGS STREQUAL "") - set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS}) - endif() + list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later endif() - add_compile_options("$<$:${CUDA_FLAGS}>") + if (NOT MSVC) + list(APPEND CUDA_CXX_FLAGS -Wno-pedantic) + endif() endif() if (WIN32) @@ -557,12 +912,24 @@ if (LLAMA_LTO) endif() endif() +if (LLAMA_CCACHE) + find_program(LLAMA_CCACHE_FOUND ccache) + if (LLAMA_CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set(ENV{CCACHE_SLOPPINESS} time_macros) + message(STATUS "ccache found, compilation results will be cached. Disable with LLAMA_CCACHE=OFF.") + else() + message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with LLAMA_CCACHE=OFF") + endif () +endif() + # this version of Apple ld64 is buggy execute_process( COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v ERROR_VARIABLE output OUTPUT_QUIET ) + if (output MATCHES "dyld-1015\.7") add_compile_definitions(HAVE_BUGGY_APPLE_LINKER) endif() @@ -572,10 +939,10 @@ endif() # feel free to update the Makefile for your architecture and send a pull request or issue message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") if (MSVC) - string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) - message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") + string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR) + message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}") else () - set(CMAKE_GENERATOR_PLATFORM_LWR "") + set(CMAKE_GENERATOR_PLATFORM_LWR "") endif () if (NOT MSVC) @@ -590,33 +957,55 @@ if (NOT MSVC) endif() endif() -if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64")) +set(ARCH_FLAGS "") + +if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) message(STATUS "ARM detected") if (MSVC) + add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead add_compile_definitions(__ARM_NEON) add_compile_definitions(__ARM_FEATURE_FMA) - add_compile_definitions(__ARM_FEATURE_DOTPROD) - # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16 - add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead + + set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) + string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + add_compile_definitions(__ARM_FEATURE_DOTPROD) + endif () + check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + endif () + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) else() check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") - add_compile_options(-mfp16-format=ieee) + list(APPEND ARCH_FLAGS -mfp16-format=ieee) endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") # Raspberry Pi 1, Zero - add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access) + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") - # Raspberry Pi 2 - add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") + # Android armeabi-v7a + list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) + else() + # Raspberry Pi 2 + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + endif() endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Android arm64-v8a # Raspberry Pi 3, 4, Zero 2 (32-bit) - add_compile_options(-mno-unaligned-access) + list(APPEND ARCH_FLAGS -mno-unaligned-access) endif() endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" ) +elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) message(STATUS "x86 detected") if (MSVC) # instruction set detection for MSVC only @@ -624,8 +1013,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE include(cmake/FindSIMD.cmake) endif () if (LLAMA_AVX512) - add_compile_options($<$:/arch:AVX512>) - add_compile_options($<$:/arch:AVX512>) + list(APPEND ARCH_FLAGS /arch:AVX512) # MSVC has no compile-time flags enabling specific # AVX512 extensions, neither it defines the # macros corresponding to the extensions. @@ -639,54 +1027,64 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE add_compile_definitions($<$:__AVX512VNNI__>) endif() elseif (LLAMA_AVX2) - add_compile_options($<$:/arch:AVX2>) - add_compile_options($<$:/arch:AVX2>) + list(APPEND ARCH_FLAGS /arch:AVX2) elseif (LLAMA_AVX) - add_compile_options($<$:/arch:AVX>) - add_compile_options($<$:/arch:AVX>) + list(APPEND ARCH_FLAGS /arch:AVX) endif() else() if (LLAMA_NATIVE) - add_compile_options(-march=native) + list(APPEND ARCH_FLAGS -march=native) endif() if (LLAMA_F16C) - add_compile_options(-mf16c) + list(APPEND ARCH_FLAGS -mf16c) endif() if (LLAMA_FMA) - add_compile_options(-mfma) + list(APPEND ARCH_FLAGS -mfma) endif() if (LLAMA_AVX) - add_compile_options(-mavx) + list(APPEND ARCH_FLAGS -mavx) endif() if (LLAMA_AVX2) - add_compile_options(-mavx2) + list(APPEND ARCH_FLAGS -mavx2) endif() if (LLAMA_AVX512) - add_compile_options(-mavx512f) - add_compile_options(-mavx512bw) + list(APPEND ARCH_FLAGS -mavx512f) + list(APPEND ARCH_FLAGS -mavx512bw) endif() if (LLAMA_AVX512_VBMI) - add_compile_options(-mavx512vbmi) + list(APPEND ARCH_FLAGS -mavx512vbmi) endif() if (LLAMA_AVX512_VNNI) - add_compile_options(-mavx512vnni) + list(APPEND ARCH_FLAGS -mavx512vnni) endif() endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") message(STATUS "PowerPC detected") if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") - add_compile_options(-mcpu=powerpc64le) + list(APPEND ARCH_FLAGS -mcpu=powerpc64le) else() - add_compile_options(-mcpu=native -mtune=native) + list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) endif() else() message(STATUS "Unknown architecture") endif() +add_compile_options("$<$:${ARCH_FLAGS}>") +add_compile_options("$<$:${ARCH_FLAGS}>") + +if (LLAMA_CUDA) + list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS}) + list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument + if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") + list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED}) + endif() + add_compile_options("$<$:${CUDA_FLAGS}>") +endif() + if (MINGW) # Target Windows 8 for PrefetchVirtualMemory - add_compile_definitions(_WIN32_WINNT=0x602) + add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER}) endif() # @@ -744,11 +1142,6 @@ endif() # ggml -if (GGML_USE_CPU_HBM) - add_definitions(-DGGML_USE_CPU_HBM) - find_library(memkind memkind REQUIRED) -endif() - add_library(ggml OBJECT ggml.c ggml.h @@ -758,21 +1151,24 @@ add_library(ggml OBJECT ggml-backend.h ggml-quants.c ggml-quants.h - ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} - ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} - ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} - ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} - ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} + ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} + ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} + ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} + ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} + ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} + ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL} + ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} + ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} + ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ) target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES}) -target_compile_features(ggml PUBLIC c_std_11) # don't bump +target_compile_features (ggml PUBLIC c_std_11) # don't bump + target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) -if (GGML_USE_CPU_HBM) - target_link_libraries(ggml PUBLIC memkind) -endif() add_library(ggml_static STATIC $) + if (BUILD_SHARED_LIBS) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(ggml_shared SHARED $) @@ -785,10 +1181,14 @@ endif() add_library(llama llama.cpp llama.h + unicode.h + unicode.cpp + unicode-data.cpp ) target_include_directories(llama PUBLIC .) -target_compile_features(llama PUBLIC cxx_std_11) # don't bump +target_compile_features (llama PUBLIC cxx_std_11) # don't bump + target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS} @@ -838,8 +1238,8 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama) -set(GGML_PUBLIC_HEADERS "ggml.h" - "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" +set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h" + "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}") set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") @@ -879,6 +1279,12 @@ if (LLAMA_METAL) GROUP_READ WORLD_READ DESTINATION ${CMAKE_INSTALL_BINDIR}) + if (NOT LLAMA_METAL_EMBED_LIBRARY) + install( + FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + endif() endif() # diff --git a/Makefile b/Makefile index 4c7e175bf..bdd5ef335 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,16 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ - main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \ - speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o + main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ + simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \ + retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o # Binaries only useful for tests TEST_TARGETS = \ tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ - tests/test-backend-ops + tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \ + tests/test-json-schema-to-grammar # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -43,10 +44,6 @@ ifeq ($(UNAME_S),Darwin) endif endif -ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))' -BUILD_TARGETS += metal -endif - default: $(BUILD_TARGETS) test: $(TEST_TARGETS) @@ -101,9 +98,10 @@ endif # # keep standard at C11 and C++11 -MK_CPPFLAGS = -I. -Icommon -MK_CFLAGS = -std=c11 -fPIC -MK_CXXFLAGS = -std=c++11 -fPIC +MK_CPPFLAGS = -I. -Icommon +MK_CFLAGS = -std=c11 -fPIC +MK_CXXFLAGS = -std=c++11 -fPIC +MK_NVCCFLAGS = -std=c++11 # -Ofast tends to produce faster code, but may not be available for some compilers. ifdef LLAMA_FAST @@ -113,8 +111,21 @@ MK_NVCCFLAGS += -O3 else MK_CFLAGS += -O3 MK_CXXFLAGS += -O3 +MK_NVCCFLAGS += -O3 endif +ifndef LLAMA_NO_CCACHE +CCACHE := $(shell which ccache) +ifdef CCACHE +export CCACHE_SLOPPINESS = time_macros +$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.) +CC := $(CCACHE) $(CC) +CXX := $(CCACHE) $(CXX) +else +$(info I ccache not found. Consider installing it for faster compilation.) +endif # CCACHE +endif # LLAMA_NO_CCACHE + # clock_gettime came in POSIX.1b (1993) # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional # posix_memalign came in POSIX.1-2001 / SUSv3 @@ -157,13 +168,17 @@ ifeq ($(UNAME_S),OpenBSD) MK_CPPFLAGS += -D_BSD_SOURCE endif +ifdef LLAMA_SCHED_MAX_COPIES + MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES) +endif + ifdef LLAMA_DEBUG MK_CFLAGS += -O0 -g MK_CXXFLAGS += -O0 -g MK_LDFLAGS += -g ifeq ($(UNAME_S),Linux) - MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS + MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS endif else MK_CPPFLAGS += -DNDEBUG @@ -191,6 +206,10 @@ ifdef LLAMA_SERVER_VERBOSE MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) endif +ifdef LLAMA_SERVER_SSL + MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT + MK_LDFLAGS += -lssl -lcrypto +endif ifdef LLAMA_CODE_COVERAGE MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase '' @@ -206,6 +225,11 @@ MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis -Werror=implicit-function-declaration MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn +ifeq ($(LLAMA_FATAL_WARNINGS),1) + MK_CFLAGS += -Werror + MK_CXXFLAGS += -Werror +endif + # this version of Apple ld64 is buggy ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))' MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER @@ -366,10 +390,24 @@ ifdef LLAMA_BLIS endif # LLAMA_BLIS ifdef LLAMA_CUBLAS - MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include - MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib +# LLAMA_CUBLAS is deprecated and will be removed in the future + LLAMA_CUDA := 1 +endif + +ifdef LLAMA_CUDA + ifneq ('', '$(wildcard /opt/cuda)') + CUDA_PATH ?= /opt/cuda + else + CUDA_PATH ?= /usr/local/cuda + endif + MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include + MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib OBJS += ggml-cuda.o - MK_NVCCFLAGS = -use_fast_math + OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) + MK_NVCCFLAGS += -use_fast_math +ifdef LLAMA_FATAL_WARNINGS + MK_NVCCFLAGS += -Werror all-warnings +endif # LLAMA_FATAL_WARNINGS ifndef JETSON_EOL_MODULE_DETECT MK_NVCCFLAGS += --forward-unknown-to-host-compiler endif # JETSON_EOL_MODULE_DETECT @@ -377,9 +415,9 @@ ifdef LLAMA_DEBUG MK_NVCCFLAGS += -lineinfo endif # LLAMA_DEBUG ifdef LLAMA_CUDA_NVCC - NVCC = $(LLAMA_CUDA_NVCC) + NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC) else - NVCC = nvcc + NVCC = $(CCACHE) nvcc endif #LLAMA_CUDA_NVCC ifdef CUDA_DOCKER_ARCH MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) @@ -420,19 +458,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE else MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE -#ifdef LLAMA_CUDA_CUBLAS -# MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS -#endif # LLAMA_CUDA_CUBLAS +ifdef LLAMA_CUDA_NO_PEER_COPY + MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY +endif # LLAMA_CUDA_NO_PEER_COPY ifdef LLAMA_CUDA_CCBIN MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) endif -ggml-cuda.o: ggml-cuda.cu ggml-cuda.h + ifdef JETSON_EOL_MODULE_DETECT - $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ +define NVCC_COMPILE + $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ +endef # NVCC_COMPILE else - $(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ +define NVCC_COMPILE + $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ +endef # NVCC_COMPILE endif # JETSON_EOL_MODULE_DETECT -endif # LLAMA_CUBLAS + +ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh + $(NVCC_COMPILE) + +ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) + $(NVCC_COMPILE) + +endif # LLAMA_CUDA ifdef LLAMA_CLBLAST @@ -452,8 +501,32 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h $(CXX) $(CXXFLAGS) -c $< -o $@ endif # LLAMA_CLBLAST -ifdef LLAMA_HIPBLAS +ifdef LLAMA_VULKAN + MK_CPPFLAGS += -DGGML_USE_VULKAN + MK_LDFLAGS += -lvulkan + OBJS += ggml-vulkan.o +ifdef LLAMA_VULKAN_CHECK_RESULTS + MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS +endif + +ifdef LLAMA_VULKAN_DEBUG + MK_CPPFLAGS += -DGGML_VULKAN_DEBUG +endif + +ifdef LLAMA_VULKAN_VALIDATE + MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE +endif + +ifdef LLAMA_VULKAN_RUN_TESTS + MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS +endif + +ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h + $(CXX) $(CXXFLAGS) -c $< -o $@ +endif # LLAMA_VULKAN + +ifdef LLAMA_HIPBLAS ifeq ($(wildcard /opt/rocm),) ROCM_PATH ?= /usr GPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) @@ -461,11 +534,11 @@ ifdef LLAMA_HIPBLAS ROCM_PATH ?= /opt/rocm GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) endif - HIPCC ?= $(ROCM_PATH)/bin/hipcc + HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_MMV_Y ?= 1 LLAMA_CUDA_KQUANTS_ITER ?= 2 - MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS + MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA ifdef LLAMA_HIP_UMA MK_CPPFLAGS += -DGGML_HIP_UMA endif # LLAMA_HIP_UMA @@ -478,9 +551,18 @@ endif # LLAMA_HIP_UMA ifdef LLAMA_CUDA_FORCE_DMMV HIPFLAGS += -DGGML_CUDA_FORCE_DMMV endif # LLAMA_CUDA_FORCE_DMMV +ifdef LLAMA_CUDA_NO_PEER_COPY + HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY +endif # LLAMA_CUDA_NO_PEER_COPY OBJS += ggml-cuda.o -ggml-cuda.o: ggml-cuda.cu ggml-cuda.h + OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) + +ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< + +ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh + $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< + endif # LLAMA_HIPBLAS ifdef LLAMA_METAL @@ -490,11 +572,30 @@ ifdef LLAMA_METAL ifdef LLAMA_METAL_NDEBUG MK_CPPFLAGS += -DGGML_METAL_NDEBUG endif +ifdef LLAMA_METAL_EMBED_LIBRARY + MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY + OBJS += ggml-metal-embed.o +endif endif # LLAMA_METAL ifdef LLAMA_METAL -ggml-metal.o: ggml-metal.m ggml-metal.h +ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h $(CC) $(CFLAGS) -c $< -o $@ + +ifdef LLAMA_METAL_EMBED_LIBRARY +ggml-metal-embed.o: ggml-metal.metal ggml-common.h + @echo "Embedding Metal library" + @sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal + $(eval TEMP_ASSEMBLY=$(shell mktemp)) + @echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY) + @echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY) + @echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY) + @echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY) + @$(AS) $(TEMP_ASSEMBLY) -o $@ + @rm -f ${TEMP_ASSEMBLY} +endif endif # LLAMA_METAL ifdef LLAMA_MPI @@ -506,17 +607,23 @@ GF_CC := $(CC) include scripts/get-flags.mk # combine build flags with cmdline overrides -override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS) -BASE_CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS) -override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) +override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) +override CFLAGS := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS) +BASE_CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS) +override CXXFLAGS := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS) override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS) override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) # identify CUDA host compiler -ifdef LLAMA_CUBLAS +ifdef LLAMA_CUDA GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler include scripts/get-flags.mk -CUDA_CXXFLAGS := $(GF_CXXFLAGS) +CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic +endif + +ifdef LLAMA_CURL +override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL +override LDFLAGS := $(LDFLAGS) -lcurl endif # @@ -531,10 +638,28 @@ $(info I CFLAGS: $(CFLAGS)) $(info I CXXFLAGS: $(CXXFLAGS)) $(info I NVCCFLAGS: $(NVCCFLAGS)) $(info I LDFLAGS: $(LDFLAGS)) -$(info I CC: $(shell $(CC) --version | head -n 1)) -$(info I CXX: $(shell $(CXX) --version | head -n 1)) +$(info I CC: $(shell $(CC) --version | head -n 1)) +$(info I CXX: $(shell $(CXX) --version | head -n 1)) +ifdef LLAMA_CUDA +$(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) +CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') +ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) +ifndef CUDA_DOCKER_ARCH +ifndef CUDA_POWER_ARCH +$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH) +endif # CUDA_POWER_ARCH +endif # CUDA_DOCKER_ARCH +endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) +endif # LLAMA_CUDA $(info ) +ifdef LLAMA_CUBLAS +$(info !!!!) +$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.) +$(info !!!!) +$(info ) +endif + # # Build library # @@ -548,12 +673,18 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h $(CC) $(CFLAGS) -c $< -o $@ -ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h +ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h $(CC) $(CFLAGS) -c $< -o $@ -OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o +unicode.o: unicode.cpp unicode.h + $(CXX) $(CXXFLAGS) -c $< -o $@ -llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h +unicode-data.o: unicode-data.cpp unicode-data.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o + +llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ COMMON_H_DEPS = common/common.h common/sampling.h common/log.h @@ -571,107 +702,174 @@ console.o: common/console.cpp common/console.h grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h $(CXX) $(CXXFLAGS) -c $< -o $@ +json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + train.o: common/train.cpp common/train.h $(CXX) $(CXXFLAGS) -c $< -o $@ +ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) +libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS) + ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS) + clean: - rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) + rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) + rm -vrf ggml-cuda/*.o + find examples pocs -type f -name "*.o" -delete # # Examples # +# $< is the first prerequisite, i.e. the source file. +# Explicitly compile this to an object file so that it can be cached with ccache. +# The source file is then filtered out from $^ (the list of all prerequisites) and the object file is added instead. + +# Helper function that replaces .c, .cpp, and .cu file endings with .o: +GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1)))) + main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual +server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual + $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS) passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -ifdef LLAMA_METAL -metal: examples/metal/metal.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -endif +gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) ifeq ($(UNAME_S),Darwin) swift: examples/batched.swift @@ -679,7 +877,7 @@ swift: examples/batched.swift endif common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh - @sh scripts/build-info.sh $(CC) > $@.tmp + @sh scripts/build-info.sh "$(CC)" > $@.tmp @if ! cmp -s $@.tmp $@; then \ mv $@.tmp $@; \ else \ @@ -696,7 +894,8 @@ build-info.o: common/build-info.cpp tests: $(TEST_TARGETS) benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) run-benchmark-matmult: benchmark-matmult ./$@ @@ -704,52 +903,84 @@ run-benchmark-matmult: benchmark-matmult .PHONY: run-benchmark-matmult swift vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) tests/test-c.o: tests/test-c.c llama.h $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + +tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/Package.swift b/Package.swift index 583e2e276..8b7195869 100644 --- a/Package.swift +++ b/Package.swift @@ -13,17 +13,33 @@ let package = Package( products: [ .library(name: "llama", targets: ["llama"]), ], - dependencies: [ - .package(url: "https://github.com/ggerganov/ggml.git", .branch("master")) - ], targets: [ .target( name: "llama", - dependencies: ["ggml"], path: ".", - exclude: ["ggml-metal.metal"], + exclude: [ + "cmake", + "examples", + "scripts", + "models", + "tests", + "CMakeLists.txt", + "ggml-cuda.cu", + "ggml-cuda.h", + "Makefile" + ], sources: [ + "ggml.c", "llama.cpp", + "unicode.cpp", + "unicode-data.cpp", + "ggml-alloc.c", + "ggml-backend.c", + "ggml-quants.c", + "ggml-metal.m", + ], + resources: [ + .process("ggml-metal.metal") ], publicHeadersPath: "spm-headers", cSettings: [ diff --git a/README-sycl.md b/README-sycl.md new file mode 100644 index 000000000..1324f632d --- /dev/null +++ b/README-sycl.md @@ -0,0 +1,597 @@ +# llama.cpp for SYCL + +- [Background](#background) +- [News](#news) +- [OS](#os) +- [Supported Devices](#supported-devices) +- [Docker](#docker) +- [Linux](#linux) +- [Windows](#windows) +- [Environment Variable](#environment-variable) +- [Known Issue](#known-issue) +- [Q&A](#q&a) +- [Todo](#todo) + +## Background + +**SYCL** is a high-level parallel programming model designed to improve developers productivity writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17. + +**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include: + +- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers. +- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*. +- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs. +- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets. + +### Llama.cpp + SYCL +This SYCL "backend" follows the same design found in other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. The oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose. + +The llama.cpp SYCL backend supports: +- Intel GPUs. +- Nvidia GPUs. + +*Upcoming support: AMD GPUs*. + +When targetting **Intel CPUs**, it is recommended to use llama.cpp for [x86_64](README.md#intel-onemkl) approach. + +## News + +- 2024.3 + - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd). + - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437). + - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing. + - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE. + - Support detecting all GPUs with level-zero and same top **Max compute units**. + - Support OPs + - hardsigmoid + - hardswish + - pool2d + +- 2024.1 + - Create SYCL backend for Intel GPU. + - Support Windows build + +## OS + +|OS|Status|Verified| +|-|-|-| +|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39| +|Windows|Support|Windows 11| + + +## Supported devices + +### Intel GPUs + +The oneAPI Math Kernel Library, which the oneAPI base-toolkit includes, supports intel GPUs. In order to make it "visible", simply run the following: +```sh +source /opt/intel/oneapi/setvars.sh +``` + +- **Tested devices** + +|Intel GPU| Status | Verified Model| +|-|-|-| +|Intel Data Center Max Series| Support| Max 1550| +|Intel Data Center Flex Series| Support| Flex 170| +|Intel Arc Series| Support| Arc 770, 730M| +|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake| +|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7| + +*Notes:* + +- Device memory can be a limitation when running a large model on an intel GPU. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`. + +- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPUs and 4.0GB for discrete GPUs. + +- If the iGPU has less than 80 EUs *(Execution Unit)*, the inference speed will likely be too slow for practical use. + +### Nvidia GPUs +The BLAS acceleration on Nvidia GPUs through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment) + +- **Tested devices** + +|Nvidia GPU| Status | Verified Model| +|-|-|-| +|Ampere Series| Support| A100, A4000| +|Ampere Series *(Mobile)*| Support| RTX 40 Series| + +*Notes:* + - Support for Nvidia targets through oneAPI is currently limited to Linux platforms. + + - Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs. + + +## Docker +The docker build option is currently limited to *intel GPU* targets. +### Build image +```sh +# Using FP16 +docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile . +``` + +*Notes*: + +To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command. + +You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative. + +### Run container + +```sh +# First, find all the DRI cards +ls -la /dev/dri +# Then, pick the card that you want to use (here for e.g. /dev/dri/card1). +docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 +``` + +*Notes:* +- Docker has been tested successfully on native Linux. WSL support has not been verified yet. +- You may need to install Intel GPU driver on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*. + +## Linux + +### I. Setup Environment + +1. **Install GPU drivers** + + - **Intel GPU** + +Intel data center GPUs drivers installation guide and download page can be found here: [Get intel dGPU Drivers](https://dgpu-docs.intel.com/driver/installation.html#ubuntu-install-steps). + +*Note*: for client GPUs *(iGPU & Arc A-Series)*, please refer to the [client iGPU driver installation](https://dgpu-docs.intel.com/driver/client/overview.html). + +Once installed, add the user(s) to the `video` and `render` groups. + +```sh +sudo usermod -aG render $USER +sudo usermod -aG video $USER +``` + +*Note*: logout/re-login for the changes to take effect. + +Verify installation through `clinfo`: + +```sh +sudo apt install clinfo +sudo clinfo -l +``` + +Sample output: + +```sh +Platform #0: Intel(R) OpenCL Graphics + `-- Device #0: Intel(R) Arc(TM) A770 Graphics + +Platform #0: Intel(R) OpenCL HD Graphics + `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49] +``` + +- **Nvidia GPU** + +In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed. +Installation can be verified by running the following: +```sh +nvidia-smi +``` +Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*: +``` ++---------------------------------------------------------------------------------------+ +| NVIDIA-SMI 535.54.03 Driver Version: 535.54.03 CUDA Version: 12.2 | +|-----------------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+======================+======================| +| 0 NVIDIA A100-PCIE-40GB On | 00000000:8D:00.0 Off | 0 | +| N/A 36C P0 57W / 250W | 4MiB / 40960MiB | 0% Default | +| | | Disabled | ++-----------------------------------------+----------------------+----------------------+ +``` + + +2. **Install Intel® oneAPI Base toolkit** + +- **Base installation** + +The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page. + +Please follow the instructions for downloading and installing the Toolkit for Linux, and preferably keep the default installation values unchanged, notably the installation path *(`/opt/intel/oneapi` by default)*. + +Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable. + +Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs. + +- **Adding support to Nvidia GPUs** + +**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup. + + +**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs. + +```sh +git clone https://github.com/oneapi-src/oneMKL +cd oneMKL +mkdir -p buildWithCublas && cd buildWithCublas +cmake ../ -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas +make +``` + + +3. **Verify installation and environment** + +In order to check the available SYCL devices on the machine, please use the `sycl-ls` command. +```sh +source /opt/intel/oneapi/setvars.sh +sycl-ls +``` + +- **Intel GPU** + +When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below: + +``` +[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000] +[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] +[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50] +[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918] +``` + +- **Nvidia GPU** + +Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow: +``` +[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix] +[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix] +[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2] +``` + +### II. Build llama.cpp + +#### Intel GPU +```sh +# Export relevant ENV variables +source /opt/intel/oneapi/setvars.sh + +# Build LLAMA with MKL BLAS acceleration for intel GPU +mkdir -p build && cd build + +# Option 1: Use FP16 for better performance in long-prompt inference +cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON + +# Option 2: Use FP32 by default +cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +``` + +#### Nvidia GPU +```sh +# Export relevant ENV variables +export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH +export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH +export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR +export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR + +# Build LLAMA with Nvidia BLAS acceleration through SYCL +mkdir -p build && cd build + +# Option 1: Use FP16 for better performance in long-prompt inference +cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON + +# Option 2: Use FP32 by default +cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +``` + +### III. Run the inference + +1. Retrieve and prepare model + +You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. + +2. Enable oneAPI running environment + +```sh +source /opt/intel/oneapi/setvars.sh +``` + +3. List devices information + +Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: + +```sh +./build/bin/ls-sycl-device +``` +A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following: +``` +found 6 SYCL devices: +| | | |Compute |Max compute|Max work|Max sub| | +|ID| Device Type| Name|capability|units |group |group |Global mem size| +|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| +| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| +| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| +| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| +| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| +| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| +| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| +``` + +|Attribute|Note| +|-|-| +|compute capability 1.3|Level-zero driver/runtime, recommended | +|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases| + +4. Launch inference + +There are two device selection modes: + +- Single device: Use one device target specified by the user. +- Multiple devices: Automatically select the devices with the same largest Max compute-units. + +|Device selection|Parameter| +|-|-| +|Single device|--split-mode none --main-gpu DEVICE_ID | +|Multiple devices|--split-mode layer (default)| + +Examples: + +- Use device 0: + +```sh +ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 +``` +or run by script: + +```sh +./examples/sycl/run_llama2.sh 0 +``` + +- Use multiple devices: + +```sh +ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer +``` + +Otherwise, you can run the script: + +```sh +./examples/sycl/run_llama2.sh +``` + +*Notes:* + +- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue. +- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow: + +```sh +detect 1 SYCL GPUs: [0] with top Max compute units:512 +``` +Or +```sh +use 1 SYCL GPUs: [0] with Max compute units:512 +``` + +## Windows + +### I. Setup Environment + +1. Install GPU driver + +Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html). + +2. Install Visual Studio + +If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/). + +3. Install Intel® oneAPI Base toolkit + +The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page. + +Please follow the instructions for downloading and installing the Toolkit for Windows, and preferably keep the default installation values unchanged, notably the installation path *(`C:\Program Files (x86)\Intel\oneAPI` by default)*. + +Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable. + +b. Enable oneAPI running environment: + +- Type "oneAPI" in the search bar, then open the `Intel oneAPI command prompt for Intel 64 for Visual Studio 2022` App. + +- On the command prompt, enable the runtime environment with the following: +``` +"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 +``` + +c. Verify installation + +In the oneAPI command line, run the following to print the available SYCL devices: + +``` +sycl-ls +``` + +There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device: + +Output (example): +``` +[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000] +[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] +[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186] +[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044] +``` + +4. Install build tools + +a. Download & install cmake for Windows: https://cmake.org/download/ + +b. Download & install mingw-w64 make for Windows provided by w64devkit + +- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip). + +- Extract `w64devkit` on your pc. + +- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`). + +### II. Build llama.cpp + +On the oneAPI command line window, step into the llama.cpp main directory and run the following: + +``` +mkdir -p build +cd build +@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force + +cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON + +make +``` + +Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions: +```sh +.\examples\sycl\win-build-sycl.bat +``` + +*Notes:* + +- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`. + +### III. Run the inference + +1. Retrieve and prepare model + +You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. + +2. Enable oneAPI running environment + +On the oneAPI command line window, run the following and step into the llama.cpp directory: +``` +"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 +``` + +3. List devices information + +Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: + +``` +build\bin\ls-sycl-device.exe +``` + +The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following: +``` +found 6 SYCL devices: +| | | |Compute |Max compute|Max work|Max sub| | +|ID| Device Type| Name|capability|units |group |group |Global mem size| +|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| +| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| +| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| +| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| +| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| +| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| +| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| + +``` + +|Attribute|Note| +|-|-| +|compute capability 1.3|Level-zero running time, recommended | +|compute capability 3.0|OpenCL running time, slower than level-zero in most cases| + + +4. Launch inference + +There are two device selection modes: + +- Single device: Use one device assigned by user. +- Multiple devices: Automatically choose the devices with the same biggest Max compute units. + +|Device selection|Parameter| +|-|-| +|Single device|--split-mode none --main-gpu DEVICE_ID | +|Multiple devices|--split-mode layer (default)| + +Examples: + +- Use device 0: + +``` +build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0 +``` + +- Use multiple devices: + +``` +build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer +``` +Otherwise, run the following wrapper script: + +``` +.\examples\sycl\win-run-llama2.bat +``` + +Note: + +- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue. +- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow: + +```sh +detect 1 SYCL GPUs: [0] with top Max compute units:512 +``` +Or +```sh +use 1 SYCL GPUs: [0] with Max compute units:512 +``` + +## Environment Variable + +#### Build + +|Name|Value|Function| +|-|-|-| +|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.| +|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.| +|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.| +|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.| +|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.| + +#### Runtime + +|Name|Value|Function| +|-|-|-| +|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG| +|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer| + +## Known Issues + +- Hanging during startup + + llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang. + + - **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable. + +- `Split-mode:[row]` is not supported. + +## Q&A + +- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`. + + - Potential cause: Unavailable oneAPI installation or not set ENV variables. + - Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`. + +- General compiler error: + + - Remove build folder or try a clean-build. + +- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux. + + Please double-check with `sudo sycl-ls`. + + If it's present in the list, please add video/render group to your user then **logout/login** or restart your system: + + ``` + sudo usermod -aG render $USER + sudo usermod -aG video $USER + ``` + Otherwise, please double-check the GPU driver installation steps. + +### **GitHub contribution**: +Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay. + +## Todo + +- Support row layer split for multiple card runs. diff --git a/README.md b/README.md index 866aa87b4..bd3f9cff5 100644 --- a/README.md +++ b/README.md @@ -6,16 +6,25 @@ [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) -Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ +Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ + +### Recent API changes + +- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122 +- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017 +- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328 +- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796 +- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849 ### Hot topics -- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow -- Collecting Apple Silicon performance stats: - - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167 - - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508 -- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406 -- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 +- **MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387** +- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404 +- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225 +- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017 +- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981 +- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962 +- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328 ---- @@ -31,17 +40,14 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
  • Get the Code
  • Build
  • BLAS Build
  • -
  • Prepare Data & Run
  • +
  • Prepare and Quantize
  • +
  • Run the quantized model
  • Memory/Disk Requirements
  • Quantization
  • Interactive mode
  • Constrained output with grammars
  • -
  • Instruction mode with Alpaca
  • -
  • Using OpenLLaMA
  • -
  • Using GPT4All
  • -
  • Using Pygmalion 7B & Metharme 7B
  • -
  • Obtaining the Facebook LLaMA original model and Stanford Alpaca model data
  • -
  • Verifying the model files
  • +
  • Instruct mode
  • +
  • Obtaining and using the Facebook LLaMA 2 model
  • Seminal papers and background on the models
  • Perplexity (measuring model quality)
  • Android
  • @@ -56,18 +62,20 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ ## Description -The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook +The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide +variety of hardware - locally and in the cloud. -- Plain C/C++ implementation without dependencies -- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks +- Plain C/C++ implementation without any dependencies +- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks - AVX, AVX2 and AVX512 support for x86 architectures -- Mixed F16 / F32 precision -- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support -- CUDA, Metal and OpenCL GPU backend support +- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use +- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP) +- Vulkan, SYCL, and (partial) OpenCL backend support +- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity -The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022). -Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves -as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library. +Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has +improved significantly thanks to many contributions. It is the main playground for developing new features for the +[ggml](https://github.com/ggerganov/ggml) library. **Supported platforms:** @@ -75,44 +83,55 @@ as the main playground for developing new features for the [ggml](https://github - [X] Linux - [X] Windows (via CMake) - [X] Docker +- [X] FreeBSD **Supported models:** +Typically finetunes of the base models below are supported as well. + - [X] LLaMA 🦙 - [x] LLaMA 2 🦙🦙 +- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) +- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral) - [X] Falcon -- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca) -- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) -- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) -- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy) -- [X] [Pygmalion/Metharme](#using-pygmalion-7b--metharme-7b) -- [X] [WizardLM](https://github.com/nlpxucan/WizardLM) - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) -- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) - [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410) - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi) -- [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586) +- [X] [StableLM models](https://huggingface.co/stabilityai) - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek) - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen) -- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral) - [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557) +- [x] [Phi models](https://huggingface.co/models?search=microsoft/phi) - [x] [GPT-2](https://huggingface.co/gpt2) +- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118) +- [x] [InternLM2](https://huggingface.co/models?search=internlm2) +- [x] [CodeShell](https://github.com/WisdomShell/codeshell) +- [x] [Gemma](https://ai.google.dev/gemma) +- [x] [Mamba](https://github.com/state-spaces/mamba) +- [x] [Xverse](https://huggingface.co/models?search=xverse) +- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01) +- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) **Multimodal models:** -- [x] [Llava 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e) -- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava) +- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) +- [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava) - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5) - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V) +- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM) +- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL) +**HTTP server** + +[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. **Bindings:** @@ -120,30 +139,57 @@ as the main playground for developing new features for the [ggml](https://github - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp) - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp) +- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) +- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) -- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) +- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs) +- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) +- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) +- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) +- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) **UI:** -- [nat/openplayground](https://github.com/nat/openplayground) -- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) -- [withcatai/catai](https://github.com/withcatai/catai) -- [semperai/amica](https://github.com/semperai/amica) -- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) -- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) +Unless otherwise noted these projects are open-source with permissive licensing: + - [iohub/collama](https://github.com/iohub/coLLaMA) +- [janhq/jan](https://github.com/janhq/jan) (AGPL) +- [nat/openplayground](https://github.com/nat/openplayground) +- [Faraday](https://faraday.dev/) (proprietary) +- [LMStudio](https://lmstudio.ai/) (proprietary) +- [LocalAI](https://github.com/mudler/LocalAI) (MIT) +- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) +- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) +- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) +- [ollama/ollama](https://github.com/ollama/ollama) +- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) +- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) +- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) +- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) +- [pythops/tenere](https://github.com/pythops/tenere) (AGPL) +- [RecurseChat](https://recurse.chat/) (proprietary) +- [semperai/amica](https://github.com/semperai/amica) +- [withcatai/catai](https://github.com/withcatai/catai) +- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) +- [Msty](https://msty.app) (proprietary) +- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) +- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later) +- [Dot](https://github.com/alexpinel/Dot) (GPL) +- [MindMac](https://mindmac.app) (proprietary) + +*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* --- Here is a typical run using LLaMA v2 13B on M2 Ultra: -```java +``` $ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e I llama.cpp build info: I UNAME_S: Darwin @@ -227,7 +273,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8 ## Usage -Here are the end-to-end binary build and model conversion steps for the LLaMA-7B model. +Here are the end-to-end binary build and model conversion steps for most supported models. ### Get the Code @@ -288,7 +334,7 @@ In order to build llama.cpp you have three different options. sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \ opencl clblast openblas - gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4 + gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4 ``` **Notes:** With this packages you can build llama.cpp with OPENBLAS and @@ -388,55 +434,52 @@ Building the program with BLAS support may lead to some performance improvements Check [BLIS.md](docs/BLIS.md) for more information. +- #### SYCL + SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators. + + llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU). + + For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md). + - #### Intel oneMKL + Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md). + - Using manual oneAPI installation: By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps: ```bash mkdir build cd build - source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-runtime docker image, only required for manual installation + source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON cmake --build . --config Release ``` - Using oneAPI docker image: - If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-runtime](https://hub.docker.com/r/intel/oneapi-runtime) - - ```bash - mkdir build - cd build - cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON - cmake --build . --config Release - ``` - - Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. + If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above. Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information. -- #### cuBLAS +- #### CUDA - This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). + This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling. - Using `make`: ```bash - make LLAMA_CUBLAS=1 + make LLAMA_CUDA=1 ``` - Using `CMake`: ```bash mkdir build cd build - cmake .. -DLLAMA_CUBLAS=ON + cmake .. -DLLAMA_CUDA=ON cmake --build . --config Release ``` The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance: - | Option | Legal values | Default | Description | |--------------------------------|------------------------|---------|-------------| | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | @@ -596,34 +639,87 @@ Building the program with BLAS support may lead to some performance improvements You can get a list of platforms and devices from the `clinfo -l` command, etc. -### Prepare Data & Run +- #### Vulkan + + **With docker**: + + You don't need to install Vulkan SDK. It will be installed inside the container. + + ```sh + # Build the image + docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile . + + # Then, use it: + docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 + ``` + + **Without docker**: + + Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html) + + For example, on Ubuntu 22.04 (jammy), use the command below: + + ```bash + wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list + apt update -y + apt-get install -y vulkan-sdk + # To verify the installation, use the command below: + vulkaninfo + ``` + + Alternatively your package manager might be able to provide the appropiate libraries. For example for Ubuntu 22.04 you can install `libvulkan-dev` instead. + + Then, build llama.cpp using the cmake command below: + + ```bash + mkdir -p build + cd build + cmake .. -DLLAMA_VULKAN=1 + cmake --build . --config Release + # Test the output binary (with "-ngl 33" to offload all layers to GPU) + ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 + + # You should see in the output, ggml_vulkan detected your GPU. For example: + # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32 + ``` + +### Prepare and Quantize + +To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. ```bash -# obtain the original LLaMA model weights and place them in ./models +# obtain the official LLaMA model weights and place them in ./models ls ./models -65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model - # [Optional] for models using BPE tokenizers - ls ./models - 65B 30B 13B 7B vocab.json +llama-2-7b tokenizer_checklist.chk tokenizer.model +# [Optional] for models using BPE tokenizers +ls ./models + vocab.json +# [Optional] for PyTorch .bin models like Mistral-7B +ls ./models + # install Python dependencies python3 -m pip install -r requirements.txt -# convert the 7B model to ggml FP16 format -python3 convert.py models/7B/ +# convert the model to ggml FP16 format +python3 convert.py models/mymodel/ - # [Optional] for models using BPE tokenizers - python convert.py models/7B/ --vocabtype bpe +# [Optional] for models using BPE tokenizers +python convert.py models/mymodel/ --vocab-type bpe -# quantize the model to 4-bits (using q4_0 method) -./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0 +# quantize the model to 4-bits (using Q4_K_M method) +./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M -# update the gguf filetype to current if older version is unsupported by another application -./quantize ./models/7B/ggml-model-q4_0.gguf ./models/7B/ggml-model-q4_0-v2.gguf COPY +# update the gguf filetype to current version if older version is now unsupported +./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY +``` +### Run the quantized model -# run the inference -./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 +```bash +# start inference on a gguf model +./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128 ``` When running the larger models, make sure you have enough disk space to store all the intermediate files. @@ -644,7 +740,7 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. -| Model | Original size | Quantized size (4-bit) | +| Model | Original size | Quantized size (Q4_0) | |------:|--------------:|-----------------------:| | 7B | 13 GB | 3.9 GB | | 13B | 24 GB | 7.8 GB | @@ -671,9 +767,21 @@ Several quantization methods are supported. They differ in the resulting model d | 13B | bits/weight | 16.0 | 4.5 | 5.0 | 5.5 | 6.0 | 8.5 | - [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684) -- recent k-quants improvements +- recent k-quants improvements and new i-quants - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707) - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807) + - [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773) + - [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856) + - [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861) + - [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872) + - [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897) + - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) + - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) + - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) + - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996) + - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) + - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) + - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) ### Perplexity (measuring model quality) @@ -685,7 +793,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread #### How to run -1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research +1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip 2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw` 3. Output: ``` @@ -698,7 +806,7 @@ And after 4.45 hours, you will have the final perplexity. ### Interactive mode If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter. -In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. +In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. Here is an example of a few-shot interaction, invoked with the command @@ -748,9 +856,9 @@ The `grammars/` folder contains a handful of sample grammars. To write your own, For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. -### Instruction mode with Alpaca +### Instruct mode -1. First, download the `ggml` Alpaca model into the `./models` folder +1. First, download and place the `ggml` model into the `./models` folder 2. Run the `main` tool like this: ``` @@ -762,7 +870,7 @@ Sample run: ``` == Running in interactive mode. == - Press Ctrl+C to interject at any time. - - Press Return to return control to LLaMa. + - Press Return to return control to LLaMA. - If you want to submit another line, end your input in '\'. Below is an instruction that describes a task. Write a response that appropriately completes the request. @@ -776,50 +884,6 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. > ``` -### Using [OpenLLaMA](https://github.com/openlm-research/open_llama) - -OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights. - -- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face. -- Convert the model to ggml FP16 format using `python convert.py ` - -### Using [GPT4All](https://github.com/nomic-ai/gpt4all) - -*Note: these instructions are likely obsoleted by the GGUF update* - -- Obtain the `tokenizer.model` file from LLaMA model and put it to `models` -- Obtain the `added_tokens.json` file from Alpaca model and put it to `models` -- Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B` -- It is distributed in the old `ggml` format which is now obsoleted -- You have to convert it to the new format using `convert.py`: - -```bash -python3 convert.py models/gpt4all-7B/gpt4all-lora-quantized.bin -``` - -- You can now use the newly generated `models/gpt4all-7B/ggml-model-q4_0.bin` model in exactly the same way as all other models - -- The newer GPT4All-J model is not yet supported! - -### Using Pygmalion 7B & Metharme 7B - -- Obtain the [LLaMA weights](#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data) -- Obtain the [Pygmalion 7B](https://huggingface.co/PygmalionAI/pygmalion-7b/) or [Metharme 7B](https://huggingface.co/PygmalionAI/metharme-7b) XOR encoded weights -- Convert the LLaMA model with [the latest HF convert script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py) -- Merge the XOR files with the converted LLaMA weights by running the [xor_codec](https://huggingface.co/PygmalionAI/pygmalion-7b/blob/main/xor_codec.py) script -- Convert to `ggml` format using the `convert.py` script in this repo: -```bash -python3 convert.py pygmalion-7b/ --outtype q4_1 -``` -> The Pygmalion 7B & Metharme 7B weights are saved in [bfloat16](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format) precision. If you wish to convert to `ggml` without quantizating, please specify the `--outtype` as `f32` instead of `f16`. - - -### Obtaining the Facebook LLaMA original model and Stanford Alpaca model data - -- **Under no circumstances should IPFS, magnet links, or any other links to model downloads be shared anywhere in this repository, including in issues, discussions, or pull requests. They will be immediately deleted.** -- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. -- Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data. - ### Obtaining and using the Facebook LLaMA 2 model - Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data. @@ -831,20 +895,6 @@ python3 convert.py pygmalion-7b/ --outtype q4_1 - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF) - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF) -### Verifying the model files - -Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files. -- The following python script will verify if you have all possible latest files in your self-installed `./models` subdirectory: - -```bash -# run the verification script -./scripts/verify-checksum-models.py -``` - -- On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory: - - On Linux: `sha256sum --ignore-missing -c SHA256SUMS` - - on macOS: `shasum -a 256 --ignore-missing -c SHA256SUMS` - ### Seminal papers and background on the models If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: @@ -867,6 +917,9 @@ First, install the essential packages for termux: pkg install clang wget git cmake ``` Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake: + +You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux. + ``` $ mkdir build-android $ cd build-android @@ -875,7 +928,28 @@ $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROI $ make ``` Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card. -Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone: +Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: + +(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) +``` +$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ +$cd /data/data/com.termux/files/home/bin +$chmod +x ./* +``` + +Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` + +``` +$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/ +``` + +Now, you can start chatting: +``` +$cd /data/data/com.termux/files/home/bin +$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml +``` + +Here is a demo of an interactive session running on Pixel 5 phone: https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 @@ -929,17 +1003,20 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th * Create a folder to store big models & intermediate files (ex. /llama/models) #### Images -We have two Docker images available for this project: +We have three Docker images available for this project: 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`) 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`) +3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`) Additionally, there the following images, similar to the above: - `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`) +- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`) - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) +- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`) The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now). @@ -965,6 +1042,12 @@ or with a light image: docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 ``` +or with a server image: + +```bash +docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 +``` + ### Docker With CUDA Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container. @@ -974,6 +1057,7 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia ```bash docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile . docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile . +docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile . ``` You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture. @@ -987,6 +1071,7 @@ The resulting images, are essentially the same as the non-CUDA images: 1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. 2. `local/llama.cpp:light-cuda`: This image only includes the main executable file. +3. `local/llama.cpp:server-cuda`: This image only includes the server executable file. #### Usage @@ -995,6 +1080,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne ```bash docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1 +docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1 ``` ### Contributing diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 000000000..14504b1bf --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,67 @@ +# Security Policy + + - [**Using llama.cpp securely**](#using-llamacpp-securely) + - [Untrusted models](#untrusted-models) + - [Untrusted inputs](#untrusted-inputs) + - [Data privacy](#data-privacy) + - [Untrusted environments or networks](#untrusted-environments-or-networks) + - [Multi-Tenant environments](#multi-tenant-environments) + - [**Reporting a vulnerability**](#reporting-a-vulnerability) + +## Using llama.cpp securely + +### Untrusted models +Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources. + +*Always execute untrusted models within a secure, isolated environment such as a sandbox* (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. + +> [!NOTE] +> The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance. + +### Untrusted inputs + +Some models accept various input formats (text, images, audio, etc.). The libraries converting these inputs have varying security levels, so it's crucial to isolate the model and carefully pre-process inputs to mitigate script injection risks. + +For maximum security when handling untrusted inputs, you may need to employ the following: + +* Sandboxing: Isolate the environment where the inference happens. +* Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics. +* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches. +* Input Sanitation: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as: + * Validation: Enforce strict rules on allowed characters and data types. + * Filtering: Remove potentially malicious scripts or code fragments. + * Encoding: Convert special characters into safe representations. + * Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)). + +### Data privacy + +To protect sensitive data from potential leaks or unauthorized access, it is crucial to sandbox the model execution. This means running the model in a secure, isolated environment, which helps mitigate many attack vectors. + +### Untrusted environments or networks + +If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions: +* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value +* Encrypt your data if sending it over the network. + +### Multi-Tenant environments + +If you intend to run multiple models in parallel with shared memory, it is your responsibility to ensure the models do not interact or access each other's data. The primary areas of concern are tenant isolation, resource allocation, model sharing and hardware attacks. + +1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity. + +1. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring. + +1. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk. + +1. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time. + +## Reporting a vulnerability + +Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++. + + +However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released. + +Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new). + +A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure. diff --git a/SHA256SUMS b/SHA256SUMS deleted file mode 100644 index ca4d5a4a5..000000000 --- a/SHA256SUMS +++ /dev/null @@ -1,40 +0,0 @@ -700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth -666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin -ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf models/7B/ggml-model-q4_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin -7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json -745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth -d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth -2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin -fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5 models/13B/ggml-model-q4_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin -4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json -e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth -4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth -24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth -1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth -7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin -d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d models/30B/ggml-model-q4_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin -2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json -135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth -9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth -e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770 models/65B/consolidated.02.pth -73176ffb426b40482f2aa67ae1217ef79fbbd1fff5482bae5060cdc5a24ab70e models/65B/consolidated.03.pth -882e6431d0b08a8bc66261a0d3607da21cbaeafa96a24e7e59777632dbdac225 models/65B/consolidated.04.pth -a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/consolidated.05.pth -72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth -d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth -60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin -cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92 models/65B/ggml-model-q4_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin -ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin -999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json -9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model diff --git a/awq-py/README.md b/awq-py/README.md deleted file mode 100644 index 59354f4e3..000000000 --- a/awq-py/README.md +++ /dev/null @@ -1,116 +0,0 @@ -# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp -[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)] - -**Supported models:** - -- [X] LLaMA -- [x] LLaMA 2 -- [X] MPT -- [X] Mistral AI v0.1 -- [ ] Bloom -- [ ] Mixtral MoE - -**TODO:** -- [x] Update version work with both MPT and MPT-AWQ model -- [ ] Add OPT model -- [ ] Add Bloom model -- [ ] Add Mixtral MoE -- [ ] Support w3, w2 - - -## Contents - -- [Install](##Install) -- [Convert](##Convert) -- [Quantize](##Quantize) -- [Test](##Test) -- [Benchmark](##Benchmark) -- [Results](##Results) - -## Install -Install requirements -```bash -pip install -r requirements.txt -``` -Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT -```bash -git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache -``` - -## Convert -Example for llama model -```bash -# For llama7b and llama2 models -python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf -# For mistral and mpt models -python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf -``` - -## Quantize -```bash -# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types. -./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0 -``` - -## Test -```bash -# For all models. -./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time" -``` - -## Benchmark -The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512. -```bash -# For llama and llama2, and mistral models. -./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw -``` - -## Results -Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison -We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k - -### Llama 7B (Build with OpenBLAS) - -| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | -|-----------:|--------------|-------:|-------:|-------:|-------:| -|Llama 7B | perplexity | 5.9066 | 6.1214 | 6.0643 | 6.5808 | -|Llama 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G | -|Llama 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | -|AWQ-LLama 7B| perplexity | 5.9175 | 6.0252 | 5.9987 | 6.3692 | -|AWQ-LLama 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G | -|AWQ-LLama 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | - - -### Llama2 7B (Build with CuBLAS) - -| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | -|------------:|--------------|-------:|-------:|-------:|-------:| -|Llama2 7B | perplexity | 5.8664 | 6.0260 | 6.0656 | 6.4496 | -|Llama2 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G | -|Llama2 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | -|AWQ-LLama2 7B| perplexity | 5.8801 | 6.0054 | 5.9849 | 6.3650 | -|AWQ-LLama2 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G | -|AWQ-LLama2 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | - - -### Mistral 7B v0.1 (Build with CuBLAS) - -| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | -|-------------:|--------------|-------:|-------:|-------:|-------:| -|Mistral 7B | perplexity | 5.6931 | 5.8202 | 5.8268 | 6.1645 | -|Mistral 7B | file size | 14.5G | 4.1G | 4.5G | 3.1G | -|Mistral 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | -|AWQ-Mistral 7B| perplexity | 5.6934 | 5.8020 | 5.7691 | 6.0426 | -|AWQ-Mistral 7B| file size | 14.5G | 4.1G | 4.5G | 3.1G | -|AWQ-Mistral 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | - -### MPT 7B (Build with OpenBLAS) - -| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K | -|---------:|--------------|-------:|-------:|-------:|--------:| -|MPT 7B | perplexity | 8.4369 | 8.7956 | 8.6265 | 11.4913 | -|MPT 7B | file size | 13.7G | 3.9G | 4.3G | 2.8G | -|MPT 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | -|AWQ-MPT 7B| perplexity | 8.4944 | 8.7053 | 8.6750 | 10.2873| -|AWQ-MPT 7B| file size | 13.7G | 3.9G | 4.3G | 2.8G | -|AWQ-MPT 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 | diff --git a/awq-py/awq/apply_awq.py b/awq-py/awq/apply_awq.py deleted file mode 100644 index 11132c5d2..000000000 --- a/awq-py/awq/apply_awq.py +++ /dev/null @@ -1,254 +0,0 @@ -""" -Implements the AWQ for llama.cpp use cases. -Original paper: https://arxiv.org/abs/2306.00978 - -This code is based on versions of the AWQ implementation found in the following repositories: -* https://github.com/mit-han-lab/llm-awq -* https://github.com/casper-hansen/AutoAWQ -""" - -import os -import torch -import torch.nn as nn - -from transformers import AutoModelForCausalLM, AutoConfig -from transformers.models.bloom.modeling_bloom import BloomGelu -from transformers.models.llama.modeling_llama import LlamaRMSNorm -from transformers.activations import GELUActivation - - -class ScaledActivation(nn.Module): - """ - ScaledActivation module wraps an existing activation function and applies a - scale factor to its output. - - Args: - module (nn.Module): The activation function to be scaled. - scales (torch.Tensor): A tensor of size (num_features,) containing the initial - scale factors for each feature. - - Returns: - torch.Tensor: The scaled output of the activation function. - """ - - def __init__(self, module, scales): - super().__init__() - self.act = module - self.scales = nn.Parameter(scales.data) - - def forward(self, x): - return self.act(x) / self.scales.view(1, 1, -1).to(x.device) - - -def set_op_by_name(layer, name, new_module): - """ - Set the new module for given module's name. - - Args: - layer (nn.Module): The layer in which to replace the submodule. - name (str): The path to the submodule to be replaced, using dot notation - to access nested modules. - new_module (nn.Module): The new module to replace the existing one. - """ - levels = name.split(".") - if len(levels) > 1: - mod_ = layer - for l_idx in range(len(levels) - 1): - if levels[l_idx].isdigit(): - mod_ = mod_[int(levels[l_idx])] - else: - mod_ = getattr(mod_, levels[l_idx]) - setattr(mod_, levels[-1], new_module) - else: - setattr(layer, name, new_module) - - -def get_op_by_name(module, op_name): - """ - Retrieves a submodule within a given layer based on its name. - - Args: - module (nn.Module): The layer containing the submodule to find. - op_name (str): The name of the submodule. - - Returns: - nn.Module: The requested submodule found within the given layer. - - Raises: - ValueError: If the specified submodule cannot be found within the layer. - """ - for name, m in module.named_modules(): - if name == op_name: - return m - raise ValueError(f"Cannot find op {op_name} in module {module}") - - -@torch.no_grad() -def scale_ln_fcs(ln, fcs, scales): - """ - Scales the weights of a LayerNorm and a list of fully-connected layers proportionally. - - Args: - ln (nn.LayerNorm): The LayerNorm module to be scaled. - fcs (List[nn.Linear]): A list of fully-connected layers to be scaled. - scales (torch.Tensor): A 1D tensor of size (num_features,). - """ - - if not isinstance(fcs, list): - fcs = [fcs] - - scales = scales.to(ln.weight.device) - - ln.weight.div_(scales) - if hasattr(ln, "bias") and ln.bias is not None: - ln.bias.div_(scales) - - for fc in fcs: - fc.weight.mul_(scales.view(1, -1)) - - for p in ln.parameters(): - assert torch.isnan(p).sum() == 0 - for fc in fcs: - for p in fc.parameters(): - assert torch.isnan(p).sum() == 0 - - -@torch.no_grad() -def scale_fc_fc(fc1, fc2, scales): - """ - Scales the weights of two fully-connected layers in a specific pattern. - - Args: - fc1 (nn.Linear): The first fully-connected layer to be scaled. - fc2 (nn.Linear): The second fully-connected layer to be scaled. - scales (torch.Tensor): A 1D tensor of size (num_features,). - """ - assert isinstance(fc1, nn.Linear) - assert isinstance(fc2, nn.Linear) - - scales = scales.to(fc1.weight.device) - - fc1.weight[-scales.size(0):].div_(scales.view(-1, 1)) - if fc1.bias is not None: - fc1.bias.div_(scales.view(-1)) - - fc2.weight.mul_(scales.view(1, -1)) - - for p in fc1.parameters(): - assert torch.isnan(p).sum() == 0 - for p in fc2.parameters(): - assert torch.isnan(p).sum() == 0 - - -@torch.no_grad() -def scale_gelu_fc(gelu, fc, scales): - """ - Scales the weight of a GELU activation and a fully-connected layer proportionally. - - Args: - gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled. - fc (nn.Linear): The fully-connected layer to be scaled. - scales (torch.Tensor): A 1D tensor of size (num_features,). - - Raises: - TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`. - TypeError: If the `fc` module is not of type `nn.Linear`. - """ - assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation)) - assert isinstance(fc, nn.Linear) - - fc.weight.mul_(scales.view(1, -1).to(fc.weight.device)) - - for p in fc.parameters(): - assert torch.isnan(p).sum() == 0 - - -def apply_scale(module, scales_list, input_feat_dict=None): - """ - Applies different scaling strategies to layers based on their type and hierarchy within a given module. - - Args: - module (nn.Module): The module containing the layers to be scaled. - scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing: - * prev_op_name (str): The name of the preceding operation or module, - relative to which the layers to be scaled are located. - * layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation. - * scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature. - input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding - input features (optional). - """ - for prev_op_name, layer_names, scales in scales_list: - prev_op = get_op_by_name(module, prev_op_name) - layers = [get_op_by_name(module, name) for name in layer_names] - - prev_op.cuda() - for layer in layers: - layer.cuda() - scales.cuda() - - if isinstance(prev_op, nn.Linear): - assert len(layers) == 1 - scale_fc_fc(prev_op, layers[0], scales) - elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower(): - scale_ln_fcs(prev_op, layers, scales) - elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)): - new_module = ScaledActivation(prev_op, scales) - set_op_by_name(module, prev_op_name, new_module) - scale_gelu_fc(prev_op, layers[0], scales) - else: - raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!") - - # apply the scaling to input feat if given; prepare it for clipping - if input_feat_dict is not None: - for layer_name in layer_names: - inp = input_feat_dict[layer_name] - inp.div_(scales.view(1, -1).to(inp.device)) - - prev_op.cpu() - for layer in layers: - layer.cpu() - scales.cpu() - - -@torch.no_grad() -def apply_clip(module, clip_list): - """ - Applies element-wise clipping to the weight of a specific layer within a given module. - - Args: - module (nn.Module): The module containing the layer to be clipped. - clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing: - * name (str): The name of the layer to be clipped, relative to the root of the module. - * max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight. - """ - for name, max_val in clip_list: - layer = get_op_by_name(module, name) - layer.cuda() - max_val = max_val.to(layer.weight.device) - org_shape = layer.weight.shape - layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) - layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val) - layer.weight.data = layer.weight.data.reshape(org_shape) - layer.cpu() - - -def add_scale_weights(model_path, scale_path, tmp_path): - """ - Adds pre-computed Activation Weight Quantization (AWQ) results to a model, - including scaling factors and clipping bounds. - - Args: - model_path (str): Path to the pre-trained model to be equipped with AWQ. - scale_path (str): Path to the AWQ scale factors (.pt file). - tmp_path (str): Path to the temporary directory where the equipped model will be saved. - """ - config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - model = AutoModelForCausalLM.from_pretrained( - model_path, config=config, trust_remote_code=True - ) - model.eval() - awq_results = torch.load(str(scale_path), map_location="cpu") - apply_scale(model, awq_results["scale"]) - apply_clip(model, awq_results["clip"]) - model.save_pretrained(str(tmp_path)) - os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}") diff --git a/awq-py/requirements.txt b/awq-py/requirements.txt deleted file mode 100644 index 991896116..000000000 --- a/awq-py/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -torch>=2.1.1 -transformers>=4.32.0 diff --git a/build.zig b/build.zig index 699738f3d..7f36e5968 100644 --- a/build.zig +++ b/build.zig @@ -115,23 +115,27 @@ pub fn build(b: *std.build.Builder) !void { const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); const ggml_backend = make.obj("ggml-backend", "ggml-backend.c"); const ggml_quants = make.obj("ggml-quants", "ggml-quants.c"); + const unicode = make.obj("unicode", "unicode.cpp"); + const unicode_data = make.obj("unicode-data", "unicode-data.cpp"); const llama = make.obj("llama", "llama.cpp"); const buildinfo = make.obj("common", "common/build-info.cpp"); const common = make.obj("common", "common/common.cpp"); const console = make.obj("console", "common/console.cpp"); const sampling = make.obj("sampling", "common/sampling.cpp"); const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); + const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp"); const train = make.obj("train", "common/train.cpp"); const clip = make.obj("clip", "examples/llava/clip.cpp"); + const llava = make.obj("llava", "examples/llava/llava.cpp"); - _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser }); - _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); - _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); - _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo }); - _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); - _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train }); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo }); + _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } diff --git a/ci/README.md b/ci/README.md index 65cfe63eb..406470519 100644 --- a/ci/README.md +++ b/ci/README.md @@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt # with CUDA support GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt + +# with SYCL support +source /opt/intel/oneapi/setvars.sh +GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt ``` diff --git a/ci/run.sh b/ci/run.sh index 47a254f4c..19776b5f7 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -10,6 +10,9 @@ # # with CUDA support # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +# # with SYCL support +# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# if [ -z "$2" ]; then echo "usage: $0 " @@ -22,20 +25,33 @@ mkdir -p "$2" OUT=$(realpath "$1") MNT=$(realpath "$2") -rm -v $OUT/*.log -rm -v $OUT/*.exit -rm -v $OUT/*.md +rm -f "$OUT/*.log" +rm -f "$OUT/*.exit" +rm -f "$OUT/*.md" sd=`dirname $0` cd $sd/../ SRC=`pwd` -CMAKE_EXTRA="" +CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON" if [ ! -z ${GG_BUILD_METAL} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON" fi +if [ ! -z ${GG_BUILD_CUDA} ]; then + CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1" +fi + +if [ ! -z ${GG_BUILD_SYCL} ]; then + if [ -z ${ONEAPI_ROOT} ]; then + echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:" + echo "source /opt/intel/oneapi/setvars.sh" + exit 1 + fi + + CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON" +fi ## helpers # download a file if it does not exist or if it is outdated @@ -90,7 +106,7 @@ function gg_run_ctest_debug { (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e } @@ -119,9 +135,9 @@ function gg_run_ctest_release { (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log if [ -z ${GG_BUILD_LOW_PERF} ]; then - (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log else - (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log fi set +e @@ -137,6 +153,61 @@ function gg_sum_ctest_release { gg_printf '```\n' } +function gg_get_model { + local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf" + local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf" + if [[ -s $gguf_3b ]]; then + echo -n "$gguf_3b" + elif [[ -s $gguf_7b ]]; then + echo -n "$gguf_7b" + else + echo >&2 "No model found. Can't run gg_run_ctest_with_model." + exit 1 + fi +} + +function gg_run_ctest_with_model_debug { + cd ${SRC} + + local model; model=$(gg_get_model) + cd build-ci-debug + set -e + (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log + set +e + cd .. +} + +function gg_run_ctest_with_model_release { + cd ${SRC} + + local model; model=$(gg_get_model) + cd build-ci-release + set -e + (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log + set +e + cd .. +} + +function gg_sum_ctest_with_model_debug { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest with model files in debug mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)" + gg_printf '```\n' +} + +function gg_sum_ctest_with_model_release { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest with model files in release mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)" + gg_printf '```\n' +} + # open_llama_3b_v2 function gg_run_open_llama_3b_v2 { @@ -149,7 +220,7 @@ function gg_run_open_llama_3b_v2 { gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json - gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip + gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw @@ -160,8 +231,8 @@ function gg_run_open_llama_3b_v2 { set -e - (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../convert.py ${path_models} @@ -202,17 +273,19 @@ function gg_run_open_llama_3b_v2 { (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log @@ -241,6 +314,8 @@ function gg_run_open_llama_3b_v2 { check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log + # lora function compare_ppl { qnt="$1" @@ -269,20 +344,19 @@ function gg_run_open_llama_3b_v2 { python3 ../convert-lora-to-ggml.py ${path_lora} # f16 - (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log - (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log # q8_0 - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log # q8_0 + f16 lora-base - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log - set +e } @@ -292,6 +366,7 @@ function gg_sum_open_llama_3b_v2 { gg_printf 'OpenLLaMA 3B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" @@ -327,7 +402,7 @@ function gg_run_open_llama_7b_v2 { gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json - gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip + gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ path_models="../models-mnt/open-llama/7B-v2" @@ -337,8 +412,8 @@ function gg_run_open_llama_7b_v2 { set -e - (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../convert.py ${path_models} @@ -391,6 +466,8 @@ function gg_run_open_llama_7b_v2 { (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { @@ -418,6 +495,8 @@ function gg_run_open_llama_7b_v2 { check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log + # lora function compare_ppl { qnt="$1" @@ -469,6 +548,7 @@ function gg_sum_open_llama_7b_v2 { gg_printf 'OpenLLaMA 7B-v2:\n' gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)" + gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)" gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)" gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" @@ -489,17 +569,69 @@ function gg_sum_open_llama_7b_v2 { #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)" } +# bge-small + +function gg_run_embd_bge_small { + cd ${SRC} + + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json + + gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json + + path_models="../models-mnt/bge-small" + + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + + set -e + + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + + python3 ../convert-hf-to-gguf.py ${path_models} + + model_f16="${path_models}/ggml-model-f16.gguf" + model_q8_0="${path_models}/ggml-model-q8_0.gguf" + + ./bin/quantize ${model_f16} ${model_q8_0} q8_0 + + (time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + + set +e +} + +function gg_sum_embd_bge_small { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'BGE Small (BERT):\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)" + gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)" +} + ## main if [ -z ${GG_BUILD_LOW_PERF} ]; then + # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt rm -rf ${SRC}/models-mnt - mnt_models=${MNT}/models mkdir -p ${mnt_models} ln -sfn ${mnt_models} ${SRC}/models-mnt - python3 -m pip install -r ${SRC}/requirements.txt - python3 -m pip install --editable gguf-py + # Create a fresh python3 venv and enter it + python3 -m venv "$MNT/venv" + source "$MNT/venv/bin/activate" + + pip install -r ${SRC}/requirements.txt --disable-pip-version-check + pip install --editable gguf-py --disable-pip-version-check fi ret=0 @@ -508,12 +640,16 @@ test $ret -eq 0 && gg_run ctest_debug test $ret -eq 0 && gg_run ctest_release if [ -z ${GG_BUILD_LOW_PERF} ]; then + test $ret -eq 0 && gg_run embd_bge_small + if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_CUDA} ]; then test $ret -eq 0 && gg_run open_llama_3b_v2 else test $ret -eq 0 && gg_run open_llama_7b_v2 fi + test $ret -eq 0 && gg_run ctest_with_model_debug + test $ret -eq 0 && gg_run ctest_with_model_release fi fi diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index f79acfef1..1d840e5f7 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -19,7 +19,12 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git") endif() endif() - set(GIT_INDEX "${GIT_DIR}/index") + if(EXISTS "${GIT_DIR}/index") + set(GIT_INDEX "${GIT_DIR}/index") + else() + message(WARNING "Git index not found in git repository.") + set(GIT_INDEX "") + endif() else() message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") set(GIT_INDEX "") @@ -42,6 +47,8 @@ if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() +set(TARGET json-schema-to-grammar) +add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h) set(TARGET common) @@ -55,14 +62,28 @@ add_library(${TARGET} STATIC console.cpp grammar-parser.h grammar-parser.cpp + json.hpp train.h train.cpp + ngram-cache.h + ngram-cache.cpp ) if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() +set(LLAMA_COMMON_EXTRA_LIBS build_info) + +# Use curl to download model url +if (LLAMA_CURL) + find_package(CURL REQUIRED) + add_definitions(-DLLAMA_USE_CURL) + include_directories(${CURL_INCLUDE_DIRS}) + find_library(CURL_LIBRARY curl REQUIRED) + set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) +endif () + target_include_directories(${TARGET} PUBLIC .) target_compile_features(${TARGET} PUBLIC cxx_std_11) -target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama) +target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama) diff --git a/common/common.cpp b/common/common.cpp index 4e89fe516..3e2df6e34 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -37,11 +37,37 @@ #include #include #endif +#if defined(LLAMA_USE_CURL) +#include +#include +#include +#include +#endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif +#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) +#define GGML_USE_CUDA_SYCL +#endif + +#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) +#define GGML_USE_CUDA_SYCL_VULKAN +#endif + +#if defined(LLAMA_USE_CURL) +#ifdef __linux__ +#include +#elif defined(_WIN32) +#define PATH_MAX MAX_PATH +#else +#include +#endif +#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 +#define LLAMA_CURL_MAX_HEADER_LENGTH 256 +#endif // LLAMA_USE_CURL + int32_t get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores @@ -78,7 +104,7 @@ int32_t get_num_physical_cores() { return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; } -void process_escapes(std::string& input) { +void process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -131,6 +157,1072 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return result; } +bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { + llama_sampling_params& sparams = params.sparams; + + if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.seed = std::stoul(argv[i]); + return true; + } + if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads = std::stoi(argv[i]); + if (params.n_threads <= 0) { + params.n_threads = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-tb" || arg == "--threads-batch") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads_batch = std::stoi(argv[i]); + if (params.n_threads_batch <= 0) { + params.n_threads_batch = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-td" || arg == "--threads-draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads_draft = std::stoi(argv[i]); + if (params.n_threads_draft <= 0) { + params.n_threads_draft = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-tbd" || arg == "--threads-batch-draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads_batch_draft = std::stoi(argv[i]); + if (params.n_threads_batch_draft <= 0) { + params.n_threads_batch_draft = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-p" || arg == "--prompt") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.prompt = argv[i]; + return true; + } + if (arg == "-e" || arg == "--escape") { + params.escape = true; + return true; + } + if (arg == "--prompt-cache") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.path_prompt_cache = argv[i]; + return true; + } + if (arg == "--prompt-cache-all") { + params.prompt_cache_all = true; + return true; + } + if (arg == "--prompt-cache-ro") { + params.prompt_cache_ro = true; + return true; + } + if (arg == "-bf" || arg == "--binary-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i], std::ios::binary); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + // store the external file name in params + params.prompt_file = argv[i]; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); + return true; + } + if (arg == "-f" || arg == "--file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + // store the external file name in params + params.prompt_file = argv[i]; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (!params.prompt.empty() && params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + return true; + } + if (arg == "-n" || arg == "--n-predict") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_predict = std::stoi(argv[i]); + return true; + } + if (arg == "--top-k") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.top_k = std::stoi(argv[i]); + return true; + } + if (arg == "-c" || arg == "--ctx-size") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_ctx = std::stoi(argv[i]); + return true; + } + if (arg == "--grp-attn-n" || arg == "-gan") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.grp_attn_n = std::stoi(argv[i]); + return true; + } + if (arg == "--grp-attn-w" || arg == "-gaw") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.grp_attn_w = std::stoi(argv[i]); + return true; + } + if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.rope_freq_base = std::stof(argv[i]); + return true; + } + if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.rope_freq_scale = std::stof(argv[i]); + return true; + } + if (arg == "--rope-scaling") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { invalid_param = true; } + return true; + } + if (arg == "--rope-scale") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.rope_freq_scale = 1.0f / std::stof(argv[i]); + return true; + } + if (arg == "--yarn-orig-ctx") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_orig_ctx = std::stoi(argv[i]); + return true; + } + if (arg == "--yarn-ext-factor") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_ext_factor = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-attn-factor") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_attn_factor = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-beta-fast") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_beta_fast = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-beta-slow") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_beta_slow = std::stof(argv[i]); + return true; + } + if (arg == "--pooling") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else { invalid_param = true; } + return true; + } + if (arg == "--defrag-thold" || arg == "-dt") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.defrag_thold = std::stof(argv[i]); + return true; + } + if (arg == "--samplers") { + if (++i >= argc) { + invalid_param = true; + return true; + } + const auto sampler_names = string_split(argv[i], ';'); + sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); + return true; + } + if (arg == "--sampling-seq") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.samplers_sequence = sampler_types_from_chars(argv[i]); + return true; + } + if (arg == "--top-p") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.top_p = std::stof(argv[i]); + return true; + } + if (arg == "--min-p") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.min_p = std::stof(argv[i]); + return true; + } + if (arg == "--temp") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.temp = std::stof(argv[i]); + sparams.temp = std::max(sparams.temp, 0.0f); + return true; + } + if (arg == "--tfs") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.tfs_z = std::stof(argv[i]); + return true; + } + if (arg == "--typical") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.typical_p = std::stof(argv[i]); + return true; + } + if (arg == "--repeat-last-n") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.penalty_last_n = std::stoi(argv[i]); + sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); + return true; + } + if (arg == "--repeat-penalty") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.penalty_repeat = std::stof(argv[i]); + return true; + } + if (arg == "--frequency-penalty") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.penalty_freq = std::stof(argv[i]); + return true; + } + if (arg == "--presence-penalty") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.penalty_present = std::stof(argv[i]); + return true; + } + if (arg == "--dynatemp-range") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.dynatemp_range = std::stof(argv[i]); + return true; + } + if (arg == "--dynatemp-exp") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.dynatemp_exponent = std::stof(argv[i]); + return true; + } + if (arg == "--mirostat") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.mirostat = std::stoi(argv[i]); + return true; + } + if (arg == "--mirostat-lr") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.mirostat_eta = std::stof(argv[i]); + return true; + } + if (arg == "--mirostat-ent") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.mirostat_tau = std::stof(argv[i]); + return true; + } + if (arg == "--cfg-negative-prompt") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.cfg_negative_prompt = argv[i]; + return true; + } + if (arg == "--cfg-negative-prompt-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); + if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { + sparams.cfg_negative_prompt.pop_back(); + } + return true; + } + if (arg == "--cfg-scale") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.cfg_scale = std::stof(argv[i]); + return true; + } + if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_batch = std::stoi(argv[i]); + return true; + } + if (arg == "-ub" || arg == "--ubatch-size") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_ubatch = std::stoi(argv[i]); + return true; + } + if (arg == "--keep") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_keep = std::stoi(argv[i]); + return true; + } + if (arg == "--draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_draft = std::stoi(argv[i]); + return true; + } + if (arg == "--chunks") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_chunks = std::stoi(argv[i]); + return true; + } + if (arg == "-np" || arg == "--parallel") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_parallel = std::stoi(argv[i]); + return true; + } + if (arg == "-ns" || arg == "--sequences") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_sequences = std::stoi(argv[i]); + return true; + } + if (arg == "--p-split" || arg == "-ps") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.p_split = std::stof(argv[i]); + return true; + } + if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.model = argv[i]; + return true; + } + if (arg == "-md" || arg == "--model-draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.model_draft = argv[i]; + return true; + } + if (arg == "-a" || arg == "--alias") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.model_alias = argv[i]; + return true; + } + if (arg == "-mu" || arg == "--model-url") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.model_url = argv[i]; + return true; + } + if (arg == "-hfr" || arg == "--hf-repo") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.hf_repo = argv[i]; + return true; + } + if (arg == "-hff" || arg == "--hf-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.hf_file = argv[i]; + return true; + } + if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lora_adapter.emplace_back(argv[i], 1.0f); + params.use_mmap = false; + return true; + } + if (arg == "--lora-scaled") { + if (++i >= argc) { + invalid_param = true; + return true; + } + const char* lora_adapter = argv[i]; + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); + params.use_mmap = false; + return true; + } + if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lora_base = argv[i]; + return true; + } + if (arg == "--control-vector") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.control_vectors.push_back({ 1.0f, argv[i], }); + return true; + } + if (arg == "--control-vector-scaled") { + if (++i >= argc) { + invalid_param = true; + return true; + } + const char* fname = argv[i]; + if (++i >= argc) { + invalid_param = true; + return true; + } + params.control_vectors.push_back({ std::stof(argv[i]), fname, }); + return true; + } + if (arg == "--control-vector-layer-range") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.control_vector_layer_start = std::stoi(argv[i]); + if (++i >= argc) { + invalid_param = true; + return true; + } + params.control_vector_layer_end = std::stoi(argv[i]); + return true; + } + if (arg == "--mmproj") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.mmproj = argv[i]; + return true; + } + if (arg == "--image") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.image = argv[i]; + return true; + } + if (arg == "-i" || arg == "--interactive") { + params.interactive = true; + return true; + } + if (arg == "--embedding") { + params.embedding = true; + return true; + } + if (arg == "--interactive-first") { + params.interactive_first = true; + return true; + } + if (arg == "-ins" || arg == "--instruct") { + params.instruct = true; + return true; + } + if (arg == "-cml" || arg == "--chatml") { + params.chatml = true; + return true; + } + if (arg == "--infill") { + params.infill = true; + return true; + } + if (arg == "-dkvc" || arg == "--dump-kv-cache") { + params.dump_kv_cache = true; + return true; + } + if (arg == "-nkvo" || arg == "--no-kv-offload") { + params.no_kv_offload = true; + return true; + } + if (arg == "-ctk" || arg == "--cache-type-k") { + params.cache_type_k = argv[++i]; + return true; + } + if (arg == "-ctv" || arg == "--cache-type-v") { + params.cache_type_v = argv[++i]; + return true; + } + if (arg == "--multiline-input") { + params.multiline_input = true; + return true; + } + if (arg == "--simple-io") { + params.simple_io = true; + return true; + } + if (arg == "-cb" || arg == "--cont-batching") { + params.cont_batching = true; + return true; + } + if (arg == "--color") { + params.use_color = true; + return true; + } + if (arg == "--mlock") { + params.use_mlock = true; + return true; + } + if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_gpu_layers = std::stoi(argv[i]); + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + return true; + } + if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_gpu_layers_draft = std::stoi(argv[i]); + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + return true; + } + if (arg == "--main-gpu" || arg == "-mg") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.main_gpu = std::stoi(argv[i]); +#ifndef GGML_USE_CUDA_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL + return true; + } + if (arg == "--split-mode" || arg == "-sm") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string arg_next = argv[i]; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_MODE_NONE; + } + else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_MODE_LAYER; + } + else if (arg_next == "row") { +#ifdef GGML_USE_SYCL + fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); + exit(1); +#endif // GGML_USE_SYCL + params.split_mode = LLAMA_SPLIT_MODE_ROW; + } + else { + invalid_param = true; + return true; + } +#ifndef GGML_USE_CUDA_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL + return true; + } + if (arg == "--tensor-split" || arg == "-ts") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + if (split_arg.size() >= llama_max_devices()) { + invalid_param = true; + return true; + } + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); + } + else { + params.tensor_split[i] = 0.0f; + } + } +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN + return true; + } + if (arg == "--no-mmap") { + params.use_mmap = false; + return true; + } + if (arg == "--numa") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string value(argv[i]); + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { invalid_param = true; } + return true; + } + if (arg == "--verbose-prompt") { + params.verbose_prompt = true; + return true; + } + if (arg == "--no-display-prompt") { + params.display_prompt = false; + return true; + } + if (arg == "-r" || arg == "--reverse-prompt") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.antiprompt.emplace_back(argv[i]); + return true; + } + if (arg == "-ld" || arg == "--logdir") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.logdir = argv[i]; + + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; + } + return true; + } + if (arg == "-lcs" || arg == "--lookup-cache-static") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lookup_cache_static = argv[i]; + return true; + } + if (arg == "-lcd" || arg == "--lookup-cache-dynamic") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lookup_cache_dynamic = argv[i]; + return true; + } + if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.logits_file = argv[i]; + return true; + } + if (arg == "--perplexity" || arg == "--all-logits") { + params.logits_all = true; + return true; + } + if (arg == "--ppl-stride") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.ppl_stride = std::stoi(argv[i]); + return true; + } + if (arg == "-ptc" || arg == "--print-token-count") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_print = std::stoi(argv[i]); + return true; + } + if (arg == "--ppl-output-type") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.ppl_output_type = std::stoi(argv[i]); + return true; + } + if (arg == "--hellaswag") { + params.hellaswag = true; + return true; + } + if (arg == "--hellaswag-tasks") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.hellaswag_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--winogrande") { + params.winogrande = true; + return true; + } + if (arg == "--winogrande-tasks") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.winogrande_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--multiple-choice") { + params.multiple_choice = true; + return true; + } + if (arg == "--multiple-choice-tasks") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.multiple_choice_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--kl-divergence") { + params.kl_divergence = true; + return true; + } + if (arg == "--ignore-eos") { + params.ignore_eos = true; + return true; + } + if (arg == "--penalize-nl") { + sparams.penalize_nl = true; + return true; + } + if (arg == "-l" || arg == "--logit-bias") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::stringstream ss(argv[i]); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + } + else { + throw std::exception(); + } + } + catch (const std::exception&) { + invalid_param = true; + return true; + } + return true; + } + if (arg == "-h" || arg == "--help") { + gpt_print_usage(argc, argv, gpt_params()); + exit(0); + } + if (arg == "--version") { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + if (arg == "--random-prompt") { + params.random_prompt = true; + return true; + } + if (arg == "--in-prefix-bos") { + params.input_prefix_bos = true; + return true; + } + if (arg == "--in-prefix") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.input_prefix = argv[i]; + return true; + } + if (arg == "--in-suffix") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.input_suffix = argv[i]; + return true; + } + if (arg == "--grammar") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.grammar = argv[i]; + return true; + } + if (arg == "--grammar-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(sparams.grammar) + ); + return true; + } + if (arg == "--override-kv") { + if (++i >= argc) { + invalid_param = true; + return true; + } + char* sep = strchr(argv[i], '='); + if (sep == nullptr || sep - argv[i] >= 128) { + fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); + invalid_param = true; + return true; + } + struct llama_model_kv_override kvo; + std::strncpy(kvo.key, argv[i], sep - argv[i]); + kvo.key[sep - argv[i]] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.int_value = std::atol(sep); + } + else if (strncmp(sep, "float:", 6) == 0) { + sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.float_value = std::atof(sep); + } + else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.bool_value = true; + } + else if (std::strcmp(sep, "false") == 0) { + kvo.bool_value = false; + } + else { + fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); + invalid_param = true; + return true; + } + } + else { + fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); + invalid_param = true; + return true; + } + params.kv_overrides.push_back(kvo); + return true; + } +#ifndef LOG_DISABLE_LOGS + // Parse args for logging parameters + if (log_param_single_parse(argv[i])) { + // Do nothing, log_param_single_parse automatically does it's thing + // and returns if a match was found and parsed. + return true; + } + if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) { + // We have a matching known parameter requiring an argument, + // now we need to check if there is anything after this argv + // and flag invalid_param or parse it. + if (++i >= argc) { + invalid_param = true; + return true; + } + if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) { + invalid_param = true; + return true; + } + return true; + } + // End of Parse args for logging parameters +#endif // LOG_DISABLE_LOGS + + return false; +} + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; @@ -143,642 +1235,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::replace(arg.begin(), arg.end(), '_', '-'); } - if (arg == "-s" || arg == "--seed") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.seed = std::stoul(argv[i]); - } else if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads = std::stoi(argv[i]); - if (params.n_threads <= 0) { - params.n_threads = std::thread::hardware_concurrency(); - } - } else if (arg == "-tb" || arg == "--threads-batch") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads_batch = std::stoi(argv[i]); - if (params.n_threads_batch <= 0) { - params.n_threads_batch = std::thread::hardware_concurrency(); - } - } else if (arg == "-p" || arg == "--prompt") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.prompt = argv[i]; - } else if (arg == "-e" || arg == "--escape") { - params.escape = true; - } else if (arg == "--prompt-cache") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.path_prompt_cache = argv[i]; - } else if (arg == "--prompt-cache-all") { - params.prompt_cache_all = true; - } else if (arg == "--prompt-cache-ro") { - params.prompt_cache_ro = true; - } else if (arg == "-f" || arg == "--file") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - // store the external file name in params - params.prompt_file = argv[i]; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } else if (arg == "-n" || arg == "--n-predict") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_predict = std::stoi(argv[i]); - } else if (arg == "--top-k") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.top_k = std::stoi(argv[i]); - } else if (arg == "-c" || arg == "--ctx-size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_ctx = std::stoi(argv[i]); - } else if (arg == "--grp-attn-n" || arg == "-gan") { - if (++i >= argc) { - invalid_param = true; - break; - } - - params.grp_attn_n = std::stoi(argv[i]); - } else if (arg == "--grp-attn-w" || arg == "-gaw") { - if (++i >= argc) { - invalid_param = true; - break; - } - - params.grp_attn_w = std::stoi(argv[i]); - } else if (arg == "--rope-freq-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_base = std::stof(argv[i]); - } else if (arg == "--rope-freq-scale") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_scale = std::stof(argv[i]); - } else if (arg == "--rope-scaling") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } - else { invalid_param = true; break; } - } else if (arg == "--rope-scale") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_scale = 1.0f/std::stof(argv[i]); - } else if (arg == "--yarn-orig-ctx") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_orig_ctx = std::stoi(argv[i]); - } else if (arg == "--yarn-ext-factor") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_ext_factor = std::stof(argv[i]); - } else if (arg == "--yarn-attn-factor") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_attn_factor = std::stof(argv[i]); - } else if (arg == "--yarn-beta-fast") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_beta_fast = std::stof(argv[i]); - } else if (arg == "--yarn-beta-slow") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_beta_slow = std::stof(argv[i]); - } else if (arg == "--samplers") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.samplers_sequence = parse_samplers_input(argv[i]); - } else if (arg == "--sampling-seq") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.samplers_sequence = argv[i]; - } else if (arg == "--top-p") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.top_p = std::stof(argv[i]); - } else if (arg == "--min-p") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.min_p = std::stof(argv[i]); - } else if (arg == "--temp") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.temp = std::stof(argv[i]); - sparams.temp = std::max(sparams.temp, 0.0f); - } else if (arg == "--tfs") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.tfs_z = std::stof(argv[i]); - } else if (arg == "--typical") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.typical_p = std::stof(argv[i]); - } else if (arg == "--repeat-last-n") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_last_n = std::stoi(argv[i]); - sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); - } else if (arg == "--repeat-penalty") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_repeat = std::stof(argv[i]); - } else if (arg == "--frequency-penalty") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_freq = std::stof(argv[i]); - } else if (arg == "--presence-penalty") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_present = std::stof(argv[i]); - } else if (arg == "--mirostat") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.mirostat = std::stoi(argv[i]); - } else if (arg == "--mirostat-lr") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.mirostat_eta = std::stof(argv[i]); - } else if (arg == "--mirostat-ent") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.mirostat_tau = std::stof(argv[i]); - } else if (arg == "--cfg-negative-prompt") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.cfg_negative_prompt = argv[i]; - } else if (arg == "--cfg-negative-prompt-file") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); - if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { - sparams.cfg_negative_prompt.pop_back(); - } - } else if (arg == "--cfg-scale") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.cfg_scale = std::stof(argv[i]); - } else if (arg == "-b" || arg == "--batch-size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_batch = std::stoi(argv[i]); - } else if (arg == "--keep") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_keep = std::stoi(argv[i]); - } else if (arg == "--draft") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_draft = std::stoi(argv[i]); - } else if (arg == "--chunks") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_chunks = std::stoi(argv[i]); - } else if (arg == "-np" || arg == "--parallel") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_parallel = std::stoi(argv[i]); - } else if (arg == "-ns" || arg == "--sequences") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_sequences = std::stoi(argv[i]); - } else if (arg == "--p-accept" || arg == "-pa") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.p_accept = std::stof(argv[i]); - } else if (arg == "--p-split" || arg == "-ps") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.p_split = std::stof(argv[i]); - } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model = argv[i]; - } else if (arg == "-md" || arg == "--model-draft") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_draft = argv[i]; - } else if (arg == "-a" || arg == "--alias") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_alias = argv[i]; - } else if (arg == "--lora") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f)); - params.use_mmap = false; - } else if (arg == "--lora-scaled") { - if (++i >= argc) { - invalid_param = true; - break; - } - const char * lora_adapter = argv[i]; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i]))); - params.use_mmap = false; - } else if (arg == "--lora-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_base = argv[i]; - } else if (arg == "--mmproj") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.mmproj = argv[i]; - } else if (arg == "--image") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.image = argv[i]; - } else if (arg == "-i" || arg == "--interactive") { - params.interactive = true; - } else if (arg == "--embedding") { - params.embedding = true; - } else if (arg == "--interactive-first") { - params.interactive_first = true; - } else if (arg == "-ins" || arg == "--instruct") { - params.instruct = true; - } else if (arg == "-cml" || arg == "--chatml") { - params.chatml = true; - } else if (arg == "--infill") { - params.infill = true; - } else if (arg == "-dkvc" || arg == "--dump-kv-cache") { - params.dump_kv_cache = true; - } else if (arg == "-nkvo" || arg == "--no-kv-offload") { - params.no_kv_offload = true; - } else if (arg == "-ctk" || arg == "--cache-type-k") { - params.cache_type_k = argv[++i]; - } else if (arg == "-ctv" || arg == "--cache-type-v") { - params.cache_type_v = argv[++i]; - } else if (arg == "--multiline-input") { - params.multiline_input = true; - } else if (arg == "--simple-io") { - params.simple_io = true; - } else if (arg == "-cb" || arg == "--cont-batching") { - params.cont_batching = true; - } else if (arg == "--color") { - params.use_color = true; - } else if (arg == "--mlock") { - params.use_mlock = true; - } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { - if (++i >= argc) { - invalid_param = true; - break; - } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - params.n_gpu_layers = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); -#endif - } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { - if (++i >= argc) { - invalid_param = true; - break; - } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - params.n_gpu_layers_draft = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); -#endif - } else if (arg == "--main-gpu" || arg == "-mg") { - if (++i >= argc) { - invalid_param = true; - break; - } -#ifdef GGML_USE_CUBLAS - params.main_gpu = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); -#endif - } else if (arg == "--tensor-split" || arg == "-ts") { - if (++i >= argc) { - invalid_param = true; - break; - } -#ifdef GGML_USE_CUBLAS - std::string arg_next = argv[i]; - - // split string by , and / - const std::regex regex{R"([,/]+)"}; - std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; - std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { - if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); - } else { - params.tensor_split[i] = 0.0f; - } - } -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); -#endif // GGML_USE_CUBLAS - } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { -#ifdef GGML_USE_CUBLAS - params.mul_mat_q = false; -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n"); -#endif // GGML_USE_CUBLAS - } else if (arg == "--no-mmap") { - params.use_mmap = false; - } else if (arg == "--numa") { - params.numa = true; - } else if (arg == "--verbose-prompt") { - params.verbose_prompt = true; - } else if (arg == "-r" || arg == "--reverse-prompt") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.antiprompt.push_back(argv[i]); - } else if (arg == "-ld" || arg == "--logdir") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.logdir = argv[i]; - - if (params.logdir.back() != DIRECTORY_SEPARATOR) { - params.logdir += DIRECTORY_SEPARATOR; - } - } else if (arg == "--perplexity" || arg == "--all-logits") { - params.logits_all = true; - } else if (arg == "--ppl-stride") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.ppl_stride = std::stoi(argv[i]); - } else if (arg == "--ppl-output-type") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.ppl_output_type = std::stoi(argv[i]); - } else if (arg == "--hellaswag") { - params.hellaswag = true; - } else if (arg == "--hellaswag-tasks") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.hellaswag_tasks = std::stoi(argv[i]); - } else if (arg == "--ignore-eos") { - params.ignore_eos = true; - } else if (arg == "--no-penalize-nl") { - sparams.penalize_nl = false; - } else if (arg == "-l" || arg == "--logit-bias") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::stringstream ss(argv[i]); - llama_token key; - char sign; - std::string value_str; - try { - if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - } else { - throw std::exception(); - } - } catch (const std::exception&) { - invalid_param = true; - break; - } - } else if (arg == "-h" || arg == "--help") { - return false; - - } else if (arg == "--version") { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); - exit(0); - } else if (arg == "--random-prompt") { - params.random_prompt = true; - } else if (arg == "--in-prefix-bos") { - params.input_prefix_bos = true; - } else if (arg == "--in-prefix") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.input_prefix = argv[i]; - } else if (arg == "--in-suffix") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.input_suffix = argv[i]; - } else if (arg == "--grammar") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.grammar = argv[i]; - } else if (arg == "--grammar-file") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(sparams.grammar) - ); - } else if (arg == "--override-kv") { - if (++i >= argc) { - invalid_param = true; - break; - } - char * sep = strchr(argv[i], '='); - if (sep == nullptr || sep - argv[i] >= 128) { - fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - struct llama_model_kv_override kvo; - std::strncpy(kvo.key, argv[i], sep - argv[i]); - kvo.key[sep - argv[i]] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_INT; - kvo.int_value = std::atol(sep); - } else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_FLOAT; - kvo.float_value = std::atof(sep); - } else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; - } else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; - } else { - fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - } else { - fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - params.kv_overrides.push_back(kvo); -#ifndef LOG_DISABLE_LOGS - // Parse args for logging parameters - } else if ( log_param_single_parse( argv[i] ) ) { - // Do nothing, log_param_single_parse automatically does it's thing - // and returns if a match was found and parsed. - } else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) { - // We have a matching known parameter requiring an argument, - // now we need to check if there is anything after this argv - // and flag invalid_param or parse it. - if (++i >= argc) { - invalid_param = true; - break; - } - if( !log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i-1], argv[i]) ) { - invalid_param = true; - break; - } - // End of Parse args for logging parameters -#endif // LOG_DISABLE_LOGS - } else { + if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) { throw std::invalid_argument("error: unknown argument: " + arg); } } + if (invalid_param) { throw std::invalid_argument("error: invalid parameter for argument: " + arg); } + if (params.prompt_cache_all && (params.interactive || params.interactive_first || params.instruct)) { @@ -786,6 +1251,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } + // short-hand to avoid specifying --hf-file -> default it to --model + if (!params.hf_repo.empty() && params.hf_file.empty()) { + params.hf_file = params.model; + } + if (params.escape) { process_escapes(params.prompt); process_escapes(params.input_prefix); @@ -797,7 +1267,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(llama_model_kv_override()); + params.kv_overrides.emplace_back(); params.kv_overrides.back().key[0] = 0; } @@ -807,12 +1277,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { const llama_sampling_params & sparams = params.sparams; + std::string sampler_type_chars; + std::string sampler_type_names; + for (const auto sampler_type : sparams.samplers_sequence) { + sampler_type_chars += static_cast(sampler_type); + sampler_type_names += sampler_type_to_name_string(sampler_type) + ";"; + } + sampler_type_names.pop_back(); + printf("\n"); printf("usage: %s [options]\n", argv[0]); printf("\n"); printf("options:\n"); printf(" -h, --help show this help message and exit\n"); - printf(" --version show version and build info\n"); + printf(" --version show version and build info\n"); printf(" -i, --interactive run in interactive mode\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); @@ -826,6 +1304,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); printf(" -tb N, --threads-batch N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); + printf(" -td N, --threads-draft N"); + printf(" number of threads to use during generation (default: same as --threads)\n"); + printf(" -tbd N, --threads-batch-draft N\n"); + printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n"); printf(" -p PROMPT, --prompt PROMPT\n"); printf(" prompt to start generation with (default: empty)\n"); printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); @@ -839,11 +1321,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); printf(" -f FNAME, --file FNAME\n"); printf(" prompt file to start generation.\n"); + printf(" -bf FNAME, --binary-file FNAME\n"); + printf(" binary file containing multiple choice tasks.\n"); printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); - printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - printf(" --samplers samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n"); - printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str()); + printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch); + printf(" -ub N, --ubatch-size N\n"); + printf(" physical maximum batch size (default: %d)\n", params.n_ubatch); + printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n"); + printf(" (default: %s)\n", sampler_type_names.c_str()); + printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str()); printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); @@ -853,6 +1340,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat); printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present); printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq); + printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range); + printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent); printf(" --mirostat N use Mirostat sampling.\n"); printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat); @@ -879,50 +1368,63 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); + printf(" --pooling {none,mean,cls}\n"); + printf(" pooling type for embeddings, use model default if unspecified\n"); + printf(" -dt N, --defrag-thold N\n"); + printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); - printf(" --no-penalize-nl do not penalize newline token\n"); + printf(" --penalize-nl penalize newline tokens\n"); printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); - printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); + printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); + printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n"); + printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); + printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n"); + printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks); + printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n"); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); - printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept); printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n"); - if (llama_mlock_supported()) { + if (llama_supports_mlock()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } - if (llama_mmap_supported()) { + if (llama_supports_mmap()) { printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } - printf(" --numa attempt optimizations that help on some NUMA systems\n"); + printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n"); + printf(" - distribute: spread execution evenly over all nodes\n"); + printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n"); + printf(" - numactl: use the CPU map provided by numactl\n"); printf(" if run without this previously, it is recommended to drop the system page cache before using this\n"); printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n"); -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -ngld N, --n-gpu-layers-draft N\n"); - printf(" number of layers to store in VRAM for the draft model\n"); - printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); -#ifdef GGML_USE_CUBLAS - printf(" -nommq, --no-mul-mat-q\n"); - printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); - printf(" Not recommended since this is both slower and uses more VRAM.\n"); -#endif // GGML_USE_CUBLAS -#endif + if (llama_supports_gpu_offload()) { + printf(" -ngl N, --n-gpu-layers N\n"); + printf(" number of layers to store in VRAM\n"); + printf(" -ngld N, --n-gpu-layers-draft N\n"); + printf(" number of layers to store in VRAM for the draft model\n"); + printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); + printf(" -ts SPLIT, --tensor-split SPLIT\n"); + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); + } + printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false"); + printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false"); printf(" -gan N, --grp-attn-n N\n"); printf(" group-attention factor (default: %d)\n", params.grp_attn_n); printf(" -gaw N, --grp-attn-w N\n"); printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w); - printf(" --verbose-prompt print prompt before generation\n"); printf(" -dkvc, --dump-kv-cache\n"); printf(" verbose print of the KV cache\n"); printf(" -nkvo, --no-kv-offload\n"); @@ -935,15 +1437,33 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" --control-vector FNAME\n"); + printf(" add a control vector\n"); + printf(" --control-vector-scaled FNAME S\n"); + printf(" add a control vector with user defined scaling S\n"); + printf(" --control-vector-layer-range START END\n"); + printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); printf(" -md FNAME, --model-draft FNAME\n"); - printf(" draft model for speculative decoding\n"); + printf(" draft model for speculative decoding (default: unused)\n"); + printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); + printf(" model download url (default: unused)\n"); + printf(" -hfr REPO, --hf-repo REPO\n"); + printf(" Hugging Face model repository (default: unused)\n"); + printf(" -hff FILE, --hf-file FILE\n"); + printf(" Hugging Face model file (default: unused)\n"); printf(" -ld LOGDIR, --logdir LOGDIR\n"); printf(" path under which to save YAML logs (no logging if unset)\n"); + printf(" -lcs FNAME, --lookup-cache-static FNAME\n"); + printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n"); + printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n"); + printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); + printf(" -ptc N, --print-token-count N\n"); + printf(" print token count every N tokens (default: %d)\n", params.n_print); printf("\n"); #ifndef LOG_DISABLE_LOGS log_print_usage(); @@ -981,45 +1501,101 @@ std::string gpt_random_prompt(std::mt19937 & rng) { } // -// String parsing +// String utils // -std::string parse_samplers_input(std::string input) { - std::string output = ""; +std::vector string_split(std::string input, char separator) { + std::vector parts; + size_t separator_pos = input.find(separator); + while (separator_pos != std::string::npos) { + std::string part = input.substr(0, separator_pos); + parts.emplace_back(part); + input = input.substr(separator_pos + 1); + separator_pos = input.find(separator); + } + parts.emplace_back(input); + return parts; +} + +std::vector sampler_types_from_names(const std::vector & names, bool allow_alt_names) { + std::unordered_map sampler_canonical_name_map { + {"top_k", llama_sampler_type::TOP_K}, + {"top_p", llama_sampler_type::TOP_P}, + {"typical_p", llama_sampler_type::TYPICAL_P}, + {"min_p", llama_sampler_type::MIN_P}, + {"tfs_z", llama_sampler_type::TFS_Z}, + {"temperature", llama_sampler_type::TEMPERATURE} + }; + // since samplers names are written multiple ways // make it ready for both system names and input names - std::unordered_map samplers_symbols { - {"top_k", 'k'}, - {"top-k", 'k'}, - {"top_p", 'p'}, - {"top-p", 'p'}, - {"nucleus", 'p'}, - {"typical_p", 'y'}, - {"typical-p", 'y'}, - {"typical", 'y'}, - {"min_p", 'm'}, - {"min-p", 'm'}, - {"tfs_z", 'f'}, - {"tfs-z", 'f'}, - {"tfs", 'f'}, - {"temp", 't'}, - {"temperature",'t'} + std::unordered_map sampler_alt_name_map { + {"top-k", llama_sampler_type::TOP_K}, + {"top-p", llama_sampler_type::TOP_P}, + {"nucleus", llama_sampler_type::TOP_P}, + {"typical-p", llama_sampler_type::TYPICAL_P}, + {"typical", llama_sampler_type::TYPICAL_P}, + {"min-p", llama_sampler_type::MIN_P}, + {"tfs-z", llama_sampler_type::TFS_Z}, + {"tfs", llama_sampler_type::TFS_Z}, + {"temp", llama_sampler_type::TEMPERATURE} }; - // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p" - size_t separator = input.find(';'); - while (separator != input.npos) { - std::string name = input.substr(0,separator); - input = input.substr(separator+1); - separator = input.find(';'); - if (samplers_symbols.find(name) != samplers_symbols.end()) { - output += samplers_symbols[name]; + std::vector sampler_types; + sampler_types.reserve(names.size()); + for (const auto & name : names) + { + auto sampler_item = sampler_canonical_name_map.find(name); + if (sampler_item != sampler_canonical_name_map.end()) + { + sampler_types.push_back(sampler_item->second); + } + else + { + if (allow_alt_names) + { + sampler_item = sampler_alt_name_map.find(name); + if (sampler_item != sampler_alt_name_map.end()) + { + sampler_types.push_back(sampler_item->second); + } + } } } - if (samplers_symbols.find(input) != samplers_symbols.end()) { - output += samplers_symbols[input]; + return sampler_types; +} + +std::vector sampler_types_from_chars(const std::string & names_string) { + std::unordered_map sampler_name_map { + {'k', llama_sampler_type::TOP_K}, + {'p', llama_sampler_type::TOP_P}, + {'y', llama_sampler_type::TYPICAL_P}, + {'m', llama_sampler_type::MIN_P}, + {'f', llama_sampler_type::TFS_Z}, + {'t', llama_sampler_type::TEMPERATURE} + }; + + std::vector sampler_types; + sampler_types.reserve(names_string.size()); + for (const auto & c : names_string) { + const auto sampler_item = sampler_name_map.find(c); + if (sampler_item != sampler_name_map.end()) { + sampler_types.push_back(sampler_item->second); + } + } + return sampler_types; +} + +std::string sampler_type_to_name_string(llama_sampler_type sampler_type) { + switch (sampler_type) { + case llama_sampler_type::TOP_K: return "top_k"; + case llama_sampler_type::TFS_Z: return "tfs_z"; + case llama_sampler_type::TYPICAL_P: return "typical_p"; + case llama_sampler_type::TOP_P: return "top_p"; + case llama_sampler_type::MIN_P: return "min_p"; + case llama_sampler_type::TEMPERATURE: return "temperature"; + default : return ""; } - return output; } // @@ -1033,6 +1609,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.n_gpu_layers = params.n_gpu_layers; } mparams.main_gpu = params.main_gpu; + mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; @@ -1047,6 +1624,9 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & } static ggml_type kv_cache_type_from_str(const std::string & s) { + if (s == "f32") { + return GGML_TYPE_F32; + } if (s == "f16") { return GGML_TYPE_F16; } @@ -1059,6 +1639,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) { if (s == "q4_1") { return GGML_TYPE_Q4_1; } + if (s == "iq4_nl") { + return GGML_TYPE_IQ4_NL; + } if (s == "q5_0") { return GGML_TYPE_Q5_0; } @@ -1073,13 +1656,14 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param auto cparams = llama_context_default_params(); cparams.n_ctx = params.n_ctx; + cparams.n_seq_max = params.n_parallel; cparams.n_batch = params.n_batch; + cparams.n_ubatch = params.n_ubatch; cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; - cparams.mul_mat_q = params.mul_mat_q; cparams.seed = params.seed; cparams.logits_all = params.logits_all; - cparams.embedding = params.embedding; + cparams.embeddings = params.embedding; cparams.rope_scaling_type = params.rope_scaling_type; cparams.rope_freq_base = params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale; @@ -1088,6 +1672,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_orig_ctx = params.yarn_orig_ctx; + cparams.pooling_type = params.pooling_type; + cparams.defrag_thold = params.defrag_thold; cparams.offload_kqv = !params.no_kv_offload; cparams.type_k = kv_cache_type_from_str(params.cache_type_k); @@ -1117,10 +1703,364 @@ void llama_batch_add( batch.n_tokens++; } +#ifdef LLAMA_USE_CURL + +static bool llama_download_file(CURL * curl, const char * url, const char * path) { + bool force_download = false; + + // Set the URL, allow to follow http redirection + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + +#if defined(_WIN32) + // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of + // operating system. Currently implemented under MS-Windows. + curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif + + // Check if the file already exists locally + struct stat model_file_info; + auto file_exists = (stat(path, &model_file_info) == 0); + + // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files + char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + char etag_path[PATH_MAX] = {0}; + snprintf(etag_path, sizeof(etag_path), "%s.etag", path); + + char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + char last_modified_path[PATH_MAX] = {0}; + snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path); + + if (file_exists) { + auto * f_etag = fopen(etag_path, "r"); + if (f_etag) { + if (!fgets(etag, sizeof(etag), f_etag)) { + fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path); + } else { + fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag); + } + fclose(f_etag); + } + + auto * f_last_modified = fopen(last_modified_path, "r"); + if (f_last_modified) { + if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) { + fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path); + } else { + fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path, + last_modified); + } + fclose(f_last_modified); + } + } + + // Send a HEAD request to retrieve the etag and last-modified headers + struct llama_load_model_from_url_headers { + char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + }; + llama_load_model_from_url_headers headers; + { + typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); + auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { + llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; + + // Convert header field name to lowercase + for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) { + buffer[i] = tolower(buffer[i]); + } + + const char * etag_prefix = "etag: "; + if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) { + strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF + } + + const char * last_modified_prefix = "last-modified: "; + if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) { + strncpy(headers->last_modified, buffer + strlen(last_modified_prefix), + n_items - strlen(last_modified_prefix) - 2); // Remove CRLF + } + return n_items; + }; + + curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast(header_callback)); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers); + + CURLcode res = curl_easy_perform(curl); + if (res != CURLE_OK) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); + return false; + } + + long http_code = 0; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + if (http_code != 200) { + // HEAD not supported, we don't know if the file has changed + // force trigger downloading + force_download = true; + fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code); + } + } + + // If the ETag or the Last-Modified headers are different: trigger a new download + bool should_download = !file_exists + || force_download + || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0) + || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0); + if (should_download) { + char path_temporary[PATH_MAX] = {0}; + snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path); + if (file_exists) { + fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path); + if (remove(path) != 0) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path); + return false; + } + } + + // Set the output file + auto * outfile = fopen(path_temporary, "wb"); + if (!outfile) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path); + return false; + } + + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); + auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { + return fwrite(data, size, nmemb, (FILE *)fd); + }; + curl_easy_setopt(curl, CURLOPT_NOBODY, 0L); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile); + + // display download progress + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + + // helper function to hide password in URL + auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { + std::size_t protocol_pos = url.find("://"); + if (protocol_pos == std::string::npos) { + return url; // Malformed URL + } + + std::size_t at_pos = url.find('@', protocol_pos + 3); + if (at_pos == std::string::npos) { + return url; // No password in URL + } + + return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos); + }; + + // start the download + fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified); + auto res = curl_easy_perform(curl); + if (res != CURLE_OK) { + fclose(outfile); + curl_easy_cleanup(curl); + fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); + return false; + } + + long http_code = 0; + curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code); + if (http_code < 200 || http_code >= 400) { + fclose(outfile); + curl_easy_cleanup(curl); + fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); + return false; + } + + // Clean up + fclose(outfile); + + // Write the new ETag to the .etag file + if (strlen(headers.etag) > 0) { + auto * etag_file = fopen(etag_path, "w"); + if (etag_file) { + fputs(headers.etag, etag_file); + fclose(etag_file); + fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag); + } + } + + // Write the new lastModified to the .etag file + if (strlen(headers.last_modified) > 0) { + auto * last_modified_file = fopen(last_modified_path, "w"); + if (last_modified_file) { + fputs(headers.last_modified, last_modified_file); + fclose(last_modified_file); + fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path, + headers.last_modified); + } + } + + if (rename(path_temporary, path) != 0) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path); + return false; + } + } + + return true; +} + +struct llama_model * llama_load_model_from_url( + const char * model_url, + const char * path_model, + const struct llama_model_params & params) { + // Basic validation of the model_url + if (!model_url || strlen(model_url) == 0) { + fprintf(stderr, "%s: invalid model_url\n", __func__); + return NULL; + } + + // Initialize libcurl + auto * curl = curl_easy_init(); + + if (!curl) { + fprintf(stderr, "%s: error initializing libcurl\n", __func__); + return NULL; + } + + if (!llama_download_file(curl, model_url, path_model)) { + return NULL; + } + + // check for additional GGUFs split to download + int n_split = 0; + { + struct gguf_init_params gguf_params = { + /*.no_alloc = */ true, + /*.ctx = */ NULL, + }; + auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); + if (!ctx_gguf) { + fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model); + curl_easy_cleanup(curl); + return NULL; + } + + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); + if (key_n_split >= 0) { + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); + } + + gguf_free(ctx_gguf); + } + + curl_easy_cleanup(curl); + + if (n_split > 1) { + char split_prefix[PATH_MAX] = {0}; + char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + + // Verify the first split file format + // and extract split URL and PATH prefixes + { + if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { + fprintf(stderr, "\n%s: unexpected model file name: %s" + " n_split=%d\n", __func__, path_model, n_split); + return NULL; + } + + if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { + fprintf(stderr, "\n%s: unexpected model url: %s" + " n_split=%d\n", __func__, model_url, n_split); + return NULL; + } + } + + // Prepare download in parallel + std::vector> futures_download; + for (int idx = 1; idx < n_split; idx++) { + futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool { + char split_path[PATH_MAX] = {0}; + llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); + + char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); + + auto * curl = curl_easy_init(); + bool res = llama_download_file(curl, split_url, split_path); + curl_easy_cleanup(curl); + + return res; + }, idx)); + } + + // Wait for all downloads to complete + for (auto & f : futures_download) { + if (!f.get()) { + return NULL; + } + } + } + + return llama_load_model_from_file(path_model, params); +} + +struct llama_model * llama_load_model_from_hf( + const char * repo, + const char * model, + const char * path_model, + const struct llama_model_params & params) { + // construct hugging face model url: + // + // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf + // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf + // + // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf + // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf + // + + std::string model_url = "https://huggingface.co/"; + model_url += repo; + model_url += "/resolve/main/"; + model_url += model; + + return llama_load_model_from_url(model_url.c_str(), path_model, params); +} + +#else + +struct llama_model * llama_load_model_from_url( + const char * /*model_url*/, + const char * /*path_model*/, + const struct llama_model_params & /*params*/) { + fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + return nullptr; +} + +struct llama_model * llama_load_model_from_hf( + const char * /*repo*/, + const char * /*model*/, + const char * /*path_model*/, + const struct llama_model_params & /*params*/) { + fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); + return nullptr; +} + +#endif // LLAMA_USE_CURL + std::tuple llama_init_from_gpt_params(gpt_params & params) { auto mparams = llama_model_params_from_gpt_params(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); + llama_model * model = nullptr; + + if (!params.hf_repo.empty() && !params.hf_file.empty()) { + model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams); + } else if (!params.model_url.empty()) { + model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams); + } else { + model = llama_load_model_from_file(params.model.c_str(), mparams); + } + if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); return std::make_tuple(nullptr, nullptr); @@ -1135,8 +2075,32 @@ std::tuple llama_init_from_gpt_par return std::make_tuple(nullptr, nullptr); } + if (!params.control_vectors.empty()) { + if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1; + if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model); + + const auto cvec = llama_control_vector_load(params.control_vectors); + if (cvec.n_embd == -1) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + + int err = llama_control_vector_apply(lctx, + cvec.data.data(), + cvec.data.size(), + cvec.n_embd, + params.control_vector_layer_start, + params.control_vector_layer_end); + if (err) { + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); + } + } + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { - const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]); + const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); int err = llama_model_apply_lora_from_file(model, lora_adapter.c_str(), @@ -1163,6 +2127,7 @@ std::tuple llama_init_from_gpt_par std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); llama_kv_cache_clear(lctx); + llama_synchronize(lctx); llama_reset_timings(lctx); } @@ -1417,9 +2382,10 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); - fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); + fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false"); + fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false"); fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); + fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false"); fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); @@ -1429,6 +2395,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false"); + fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false"); #ifdef NDEBUG fprintf(stream, "debug: false\n"); @@ -1505,6 +2472,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l } fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); + fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); @@ -1516,9 +2484,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs); fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); - fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false"); - fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false"); - fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false"); + fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false"); fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); @@ -1543,21 +2509,22 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base); fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale); - fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed); + fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed); fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); - const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES); + const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); - fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency()); + fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency()); fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); + fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); } // @@ -1568,17 +2535,17 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d", - view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); + view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); llama_kv_cache_view_cell * c_curr = view.cells; llama_seq_id * cs_curr = view.cells_sequences; - for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { if (i % row_size == 0) { printf("\n%5d: ", i); } int seq_count = 0; - for (int j = 0; j < view.n_max_seq; j++) { + for (int j = 0; j < view.n_seq_max; j++) { if (cs_curr[j] >= 0) { seq_count++; } } putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]); @@ -1591,18 +2558,19 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n", - view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); + view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx); std::unordered_map seqs; llama_kv_cache_view_cell * c_curr = view.cells; llama_seq_id * cs_curr = view.cells_sequences; - for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { - for (int j = 0; j < view.n_max_seq; j++) { + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { + for (int j = 0; j < view.n_seq_max; j++) { if (cs_curr[j] < 0) { continue; } if (seqs.find(cs_curr[j]) == seqs.end()) { if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } - seqs[cs_curr[j]] = seqs.size(); + const size_t sz = seqs.size(); + seqs[cs_curr[j]] = sz; } } if (seqs.size() + 1 >= sizeof(slot_chars)) { break; } @@ -1616,11 +2584,11 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { c_curr = view.cells; cs_curr = view.cells_sequences; - for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) { + for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) { if (i % row_size == 0) { printf("\n%5d: ", i); } - for (int j = 0; j < view.n_max_seq; j++) { + for (int j = 0; j < view.n_seq_max; j++) { if (cs_curr[j] >= 0) { const auto & it = seqs.find(cs_curr[j]); putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+'); @@ -1633,3 +2601,188 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { printf("\n=== Done dumping\n"); } + +void llama_embd_normalize(const float * inp, float * out, int n) { + double sum = 0.0; + for (int i = 0; i < n; i++) { + sum += inp[i] * inp[i]; + } + sum = sqrt(sum); + + const float norm = sum > 0.0 ? 1.0f / sum : 0.0f; + + for (int i = 0; i < n; i++) { + out[i] = inp[i] * norm; + } +} + +float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){ + double sum = 0.0; + double sum1 = 0.0; + double sum2 = 0.0; + + for (int i = 0; i < n; i++) { + sum += embd1[i] * embd2[i]; + sum1 += embd1[i] * embd1[i]; + sum2 += embd2[i] * embd2[i]; + } + + return sum / (sqrt(sum1) * sqrt(sum2)); +} + +// +// Control vector utils +// + +static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) { + int32_t n_tensors; + + size_t n_bytes = 0; + + uint32_t max_direction_layer = 0; + + llama_control_vector_data result = { -1, {} }; + + // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer + { + struct ggml_init_params meta_params = { + /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), + /* .mem_buffer = */ nullptr, + /* .no_alloc = */ true, + }; + ggml_context * meta_ctx = ggml_init(meta_params); + struct gguf_init_params meta_gguf_params = { + /* .no_alloc = */ true, + /* .ctx = */ &meta_ctx, + }; + struct gguf_context * meta_ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); + if (!meta_ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + return result; + } + + n_tensors = gguf_get_n_tensors(meta_ctx_gguf); + for (int i = 0; i < n_tensors; i++) { + std::string name = gguf_get_tensor_name(meta_ctx_gguf, i); + + // split on '.' + size_t dotpos = name.find('.'); + if (dotpos != std::string::npos && name.substr(0, dotpos) == "direction") { + try { + uint32_t layer = std::stoi(name.substr(dotpos + 1)); + if (layer == 0) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + if (layer > max_direction_layer) { + max_direction_layer = layer; + } + } catch (...) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + } + + struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str()); + if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) { + fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + if (result.n_embd == -1) { + result.n_embd = ggml_nelements(tensor_meta); + } else if (ggml_nelements(tensor_meta) != result.n_embd) { + fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str()); + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + return result; + } + n_bytes += ggml_nbytes(tensor_meta); + } + ggml_free(meta_ctx); + gguf_free(meta_ctx_gguf); + } + + if (n_tensors == 0) { + fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); + return result; + } + + // load and scale tensors into final control vector context + struct ggml_init_params ggml_params = { + /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes, + /* .mem_buffer = */ nullptr, + /* .no_alloc = */ false, + }; + struct ggml_context * ctx = ggml_init(ggml_params); + + struct gguf_init_params params = { + /*.no_alloc = */ false, + /*.ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load control vector from %s\n", __func__, load_info.fname.c_str()); + ggml_free(ctx); + return result; + } + + // do not store data for layer 0 (it's not used) + result.data.resize(result.n_embd * max_direction_layer); + + for (uint32_t il = 1; il <= max_direction_layer; il++) { + const std::string name = "direction." + std::to_string(il); + const ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); + + float * dst = result.data.data() + result.n_embd * (il - 1); + + if (tensor) { + const float * src = (const float *) tensor->data; + for (int j = 0; j < result.n_embd; j++) { + dst[j] = src[j] * load_info.strength; + } + } else { + for (int j = 0; j < result.n_embd; j++) { + dst[j] = 0.0f; + } + } + } + + return result; +} + +llama_control_vector_data llama_control_vector_load(const std::vector & load_infos) { + llama_control_vector_data result = { -1, {} }; + + for (const auto & info : load_infos) { + auto cur = llama_control_vector_load_one(info); + + if (cur.n_embd == -1) { + return result; + } + if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) { + fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str()); + return result; + } + + if (result.n_embd == -1) { + result = std::move(cur); + } else { + for (size_t i = 0; i < cur.data.size(); i++) { + result.data[i] += cur.data[i]; + } + } + } + + if (result.n_embd == -1) { + fprintf(stderr, "%s: no vectors passed\n", __func__); + } + + return result; +} diff --git a/common/common.h b/common/common.h index e2bbfc258..99ee90bc3 100644 --- a/common/common.h +++ b/common/common.h @@ -37,56 +37,73 @@ extern char const *LLAMA_COMMIT; extern char const *LLAMA_COMPILER; extern char const *LLAMA_BUILD_TARGET; +struct llama_control_vector_load_info; + +int32_t get_num_physical_cores(); + // // CLI argument parsing // -int32_t get_num_physical_cores(); struct gpt_params { - uint32_t seed = -1; // RNG seed + uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed - int32_t n_threads = get_num_physical_cores(); - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 512; // context size - int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 8; // number of tokens to draft during speculative decoding - int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) - int32_t n_parallel = 1; // number of parallel sequences to decode - int32_t n_sequences = 1; // number of sequences to decode - float p_accept = 0.5f; // speculative decoding accept probability - float p_split = 0.1f; // speculative decoding split probability - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs - int32_t n_beams = 0; // if non-zero then use beam search of given width. - int32_t grp_attn_n = 1; // group-attention factor - int32_t grp_attn_w = 512; // group-attention width - float rope_freq_base = 0.0f; // RoPE base frequency - float rope_freq_scale = 0.0f; // RoPE frequency scaling factor - float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor - float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor - float yarn_beta_fast = 32.0f; // YaRN low correction dim - float yarn_beta_slow = 1.0f; // YaRN high correction dim - int32_t yarn_orig_ctx = 0; // YaRN original context length - int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment - // pinging @cebtenzzre + int32_t n_threads = get_num_physical_cores(); + int32_t n_threads_draft = -1; + int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) + int32_t n_threads_batch_draft = -1; + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 512; // context size + int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_draft = 5; // number of tokens to draft during speculative decoding + int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) + int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_sequences = 1; // number of sequences to decode + float p_split = 0.1f; // speculative decoding split probability + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + int32_t n_beams = 0; // if non-zero then use beam search of given width. + int32_t grp_attn_n = 1; // group-attention factor + int32_t grp_attn_w = 512; // group-attention width + int32_t n_print = -1; // print token count every n tokens (-1 = disabled) + float rope_freq_base = 0.0f; // RoPE base frequency + float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor + float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor + float yarn_beta_fast = 32.0f; // YaRN low correction dim + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length + float defrag_thold = -1.0f; // KV cache defragmentation threshold + + ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; + + llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; + llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings // // sampling parameters struct llama_sampling_params sparams; - std::string model = "models/7B/ggml-model-f16.gguf"; // model path - std::string model_draft = ""; // draft model for speculative decoding - std::string model_alias = "unknown"; // model alias - std::string prompt = ""; - std::string prompt_file = ""; // store the external prompt file name - std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state - std::string input_prefix = ""; // string to prefix user inputs with - std::string input_suffix = ""; // string to suffix user inputs with + std::string model = "models/7B/ggml-model-f16.gguf"; // model path + std::string model_draft = ""; // draft model for speculative decoding + std::string model_alias = "unknown"; // model alias + std::string model_url = ""; // model url to download + std::string hf_repo = ""; // HF repo + std::string hf_file = ""; // HF file + std::string prompt = ""; + std::string prompt_file = ""; // store the external prompt file name + std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state + std::string input_prefix = ""; // string to prefix user inputs with + std::string input_suffix = ""; // string to suffix user inputs with std::vector antiprompt; // string upon seeing which more user input is prompted - std::string logdir = ""; // directory in which to save YAML log files + std::string logdir = ""; // directory in which to save YAML log files + std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding + std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding + std::string logits_file = ""; // file for saving *all* logits std::vector kv_overrides; @@ -94,6 +111,11 @@ struct gpt_params { std::vector> lora_adapter; // lora adapter path with user defined scale std::string lora_base = ""; // base model path for the lora adapter + std::vector control_vectors; // control vector with user defined scale + + int32_t control_vector_layer_start = -1; // layer range for control vector + int32_t control_vector_layer_end = -1; // layer range for control vector + int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line // (which is more convenient to use for plotting) @@ -101,7 +123,14 @@ struct gpt_params { bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score - bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS + bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt + size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed + + bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt + size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed + + bool kl_divergence = false; // compute KL-divergence + bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode @@ -114,7 +143,7 @@ struct gpt_params { bool interactive_first = false; // wait for user input immediately bool multiline_input = false; // reverse the usage of `\` bool simple_io = false; // improves compatibility with subprocesses and limited consoles - bool cont_batching = false; // insert new sequences for decoding on-the-fly + bool cont_batching = true; // insert new sequences for decoding on-the-fly bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool ignore_eos = false; // ignore generated EOS tokens @@ -122,8 +151,8 @@ struct gpt_params { bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory - bool numa = false; // attempt optimizations that help on some NUMA systems bool verbose_prompt = false; // print prompt tokens before generation + bool display_prompt = true; // print prompt before generation bool infill = false; // use infill mode bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool no_kv_offload = false; // disable KV offloading @@ -142,6 +171,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params); void gpt_print_usage(int argc, char ** argv, const gpt_params & params); +bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); + std::string get_system_info(const gpt_params & params); std::string gpt_random_prompt(std::mt19937 & rng); @@ -149,10 +180,13 @@ std::string gpt_random_prompt(std::mt19937 & rng); void process_escapes(std::string& input); // -// String parsing +// String utils // -std::string parse_samplers_input(std::string input); +std::vector sampler_types_from_names(const std::vector & names, bool allow_alt_names); +std::vector sampler_types_from_chars(const std::string & names_string); +std::vector string_split(std::string input, char separator); +std::string sampler_type_to_name_string(llama_sampler_type sampler_type); // // Model utils @@ -164,6 +198,9 @@ std::tuple llama_init_from_gpt_par struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); +struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params); +struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params); + // Batch utils void llama_batch_clear(struct llama_batch & batch); @@ -243,3 +280,38 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); // Dump the KV cache view showing individual sequences in each cell (long output). void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); +// +// Embedding utils +// + +void llama_embd_normalize(const float * inp, float * out, int n); + +float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n); + +// +// Control vector utils +// + +struct llama_control_vector_data { + int n_embd; + + // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd + std::vector data; +}; + +struct llama_control_vector_load_info { + float strength; + + std::string fname; +}; + +// Load control vectors, scale each by strength, and add them together. +// On error, returns {-1, empty} +llama_control_vector_data llama_control_vector_load(const std::vector & load_infos); + +// +// Split utils +// +static const char * const LLM_KV_SPLIT_NO = "split.no"; +static const char * const LLM_KV_SPLIT_COUNT = "split.count"; +static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index bf89a96f3..2a1301569 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -278,6 +278,22 @@ namespace grammar_parser { while (*pos) { pos = parse_rule(state, pos); } + // Validate the state to ensure that all rules are defined + for (const auto & rule : state.rules) { + for (const auto & elem : rule) { + if (elem.type == LLAMA_GRETYPE_RULE_REF) { + // Ensure that the rule at that location exists + if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) { + // Get the name of the rule that is missing + for (const auto & kv : state.symbol_ids) { + if (kv.second == elem.value) { + throw std::runtime_error("Undefined rule identifier '" + kv.first + "'"); + } + } + } + } + } + } return state; } catch (const std::exception & err) { fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what()); diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp new file mode 100644 index 000000000..0e4680346 --- /dev/null +++ b/common/json-schema-to-grammar.cpp @@ -0,0 +1,721 @@ +#include "json-schema-to-grammar.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using json = nlohmann::ordered_json; + +const std::string SPACE_RULE = "\" \"?"; + +std::unordered_map PRIMITIVE_RULES = { + {"boolean", "(\"true\" | \"false\") space"}, + {"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"}, + {"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"}, + {"value", "object | array | string | number | boolean"}, + {"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"}, + {"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"}, + {"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " + "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " + "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " + "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " + "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"}, + {"string", " \"\\\"\" (\n" + " [^\"\\\\] |\n" + " \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n" + " )* \"\\\"\" space"}, + {"null", "\"null\" space"} +}; +std::vector OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"}; + +std::unordered_map DATE_RULES = { + {"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"}, + {"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"}, + {"date-time", "date \"T\" time"}, + {"date-string", "\"\\\"\" date \"\\\"\" space"}, + {"time-string", "\"\\\"\" time \"\\\"\" space"}, + {"date-time-string", "\"\\\"\" date-time \"\\\"\" space"} +}; + +static bool is_reserved_name(const std::string & name) { + static std::unordered_set RESERVED_NAMES; + if (RESERVED_NAMES.empty()) { + RESERVED_NAMES.insert("root"); + for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first); + for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first); + } + return RESERVED_NAMES.find(name) != RESERVED_NAMES.end(); +} + +std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+"); +std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]"); +std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]"); +std::unordered_map GRAMMAR_LITERAL_ESCAPES = { + {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"} +}; + +std::unordered_set NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'}; +std::unordered_set ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'}; + +template +std::string join(Iterator begin, Iterator end, const std::string & separator) { + std::ostringstream result; + if (begin != end) { + result << *begin; + for (Iterator it = begin + 1; it != end; ++it) { + result << separator << *it; + } + } + return result.str(); +} + +static std::vector split(const std::string & str, const std::string & delimiter) { + std::vector tokens; + size_t start = 0; + size_t end = str.find(delimiter); + + while (end != std::string::npos) { + tokens.push_back(str.substr(start, end - start)); + start = end + delimiter.length(); + end = str.find(delimiter, start); + } + + tokens.push_back(str.substr(start)); + + return tokens; +} + +static std::string repeat(const std::string & str, size_t n) { + if (n == 0) { + return ""; + } + + std::string result; + result.reserve(str.length() * n); + + for (size_t i = 0; i < n; ++i) { + result += str; + } + + return result; +} + +static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function & replacement) { + std::smatch match; + std::string result; + + std::string::const_iterator searchStart(input.cbegin()); + std::string::const_iterator searchEnd(input.cend()); + + while (std::regex_search(searchStart, searchEnd, match, regex)) { + result.append(searchStart, searchStart + match.position()); + result.append(replacement(match)); + searchStart = match.suffix().first; + } + + result.append(searchStart, searchEnd); + + return result; +} + +static std::string format_literal(const std::string & literal) { + std::string escaped = replacePattern(literal, GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) { + char c = match.str()[0]; + return GRAMMAR_LITERAL_ESCAPES.at(c); + }); + return "\"" + escaped + "\""; +} + + +class SchemaConverter { +private: + std::function _fetch_json; + bool _dotall; + std::map _rules; + std::unordered_map _refs; + std::unordered_set _refs_being_resolved; + std::vector _errors; + std::vector _warnings; + + std::string _add_rule(const std::string & name, const std::string & rule) { + std::string esc_name = regex_replace(name, INVALID_RULE_CHARS_RE, "-"); + if (_rules.find(esc_name) == _rules.end() || _rules[esc_name] == rule) { + _rules[esc_name] = rule; + return esc_name; + } else { + int i = 0; + while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) { + i++; + } + std::string key = esc_name + std::to_string(i); + _rules[key] = rule; + return key; + } + } + + std::string _generate_union_rule(const std::string & name, const std::vector & alt_schemas) { + std::vector rules; + for (size_t i = 0; i < alt_schemas.size(); i++) { + rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i))); + } + return join(rules.begin(), rules.end(), " | "); + } + + std::string _visit_pattern(const std::string & pattern, const std::string & name) { + if (!(pattern.front() == '^' && pattern.back() == '$')) { + _errors.push_back("Pattern must start with '^' and end with '$'"); + return ""; + } + std::string sub_pattern = pattern.substr(1, pattern.length() - 2); + std::unordered_map sub_rule_ids; + + size_t i = 0; + size_t length = sub_pattern.length(); + + using literal_or_rule = std::pair; + auto to_rule = [&](const literal_or_rule & ls) { + auto is_literal = ls.second; + auto s = ls.first; + return is_literal ? "\"" + s + "\"" : s; + }; + std::function transform = [&]() -> literal_or_rule { + size_t start = i; + std::vector seq; + + auto get_dot = [&]() { + std::string rule; + if (_dotall) { + rule = "[\\U00000000-\\U0010FFFF]"; + } else { + rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]"; + } + return _add_rule("dot", rule); + }; + + // Joins the sequence, merging consecutive literals together. + auto join_seq = [&]() { + std::vector ret; + + std::string literal; + auto flush_literal = [&]() { + if (literal.empty()) { + return false; + } + ret.push_back(std::make_pair(literal, true)); + literal.clear(); + return true; + }; + + for (const auto & item : seq) { + auto is_literal = item.second; + if (is_literal) { + literal += item.first; + } else { + flush_literal(); + ret.push_back(item); + } + } + flush_literal(); + + std::vector results; + for (const auto & item : ret) { + results.push_back(to_rule(item)); + } + return std::make_pair(join(results.begin(), results.end(), " "), false); + }; + + while (i < length) { + char c = sub_pattern[i]; + if (c == '.') { + seq.push_back(std::make_pair(get_dot(), false)); + i++; + } else if (c == '(') { + i++; + if (i < length) { + if (sub_pattern[i] == '?') { + _warnings.push_back("Unsupported pattern syntax"); + } + } + seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false)); + } else if (c == ')') { + i++; + if (start > 0 && sub_pattern[start - 1] != '(') { + _errors.push_back("Unbalanced parentheses"); + } + return join_seq(); + } else if (c == '[') { + std::string square_brackets = std::string(1, c); + i++; + while (i < length && sub_pattern[i] != ']') { + if (sub_pattern[i] == '\\') { + square_brackets += sub_pattern.substr(i, 2); + i += 2; + } else { + square_brackets += sub_pattern[i]; + i++; + } + } + if (i >= length) { + _errors.push_back("Unbalanced square brackets"); + } + square_brackets += ']'; + i++; + seq.push_back(std::make_pair(square_brackets, false)); + } else if (c == '|') { + seq.push_back(std::make_pair("|", false)); + i++; + } else if (c == '*' || c == '+' || c == '?') { + seq.back() = std::make_pair(to_rule(seq.back()) + c, false); + i++; + } else if (c == '{') { + std::string curly_brackets = std::string(1, c); + i++; + while (i < length && sub_pattern[i] != '}') { + curly_brackets += sub_pattern[i]; + i++; + } + if (i >= length) { + _errors.push_back("Unbalanced curly brackets"); + } + curly_brackets += '}'; + i++; + auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ","); + int min_times = 0; + int max_times = std::numeric_limits::max(); + try { + if (nums.size() == 1) { + min_times = max_times = std::stoi(nums[0]); + } else if (nums.size() != 2) { + _errors.push_back("Wrong number of values in curly brackets"); + } else { + if (!nums[0].empty()) { + min_times = std::stoi(nums[0]); + } + if (!nums[1].empty()) { + max_times = std::stoi(nums[1]); + } + } + } catch (const std::invalid_argument & e) { + _errors.push_back("Invalid number in curly brackets"); + return std::make_pair("", false); + } + auto &last = seq.back(); + auto &sub = last.first; + auto sub_is_literal = last.second; + + if (min_times == 0 && max_times == std::numeric_limits::max()) { + sub += "*"; + } else if (min_times == 0 && max_times == 1) { + sub += "?"; + } else if (min_times == 1 && max_times == std::numeric_limits::max()) { + sub += "+"; + } else { + if (!sub_is_literal) { + std::string & sub_id = sub_rule_ids[sub]; + if (sub_id.empty()) { + sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub); + } + sub = sub_id; + } + std::string result; + if (sub_is_literal && min_times > 0) { + result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\""; + } else { + for (int j = 0; j < min_times; j++) { + if (j > 0) { + result += " "; + } + result += sub; + } + } + if (min_times > 0 && min_times < max_times) { + result += " "; + } + if (max_times == std::numeric_limits::max()) { + result += sub + "*"; + } else { + for (int j = min_times; j < max_times; j++) { + if (j > min_times) { + result += " "; + } + result += sub + "?"; + } + } + seq.back().first = result; + seq.back().second = false; + } + } else { + std::string literal; + auto is_non_literal = [&](char c) { + return NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end(); + }; + while (i < length) { + if (sub_pattern[i] == '\\' && i < length - 1) { + char next = sub_pattern[i + 1]; + if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.find(next) != ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.end()) { + i++; + literal += sub_pattern[i]; + i++; + } else { + literal += sub_pattern.substr(i, 2); + i += 2; + } + } else if (sub_pattern[i] == '"') { + literal += "\\\""; + i++; + } else if (!is_non_literal(sub_pattern[i]) && + (i == length - 1 || literal.empty() || sub_pattern[i + 1] == '.' || !is_non_literal(sub_pattern[i + 1]))) { + literal += sub_pattern[i]; + i++; + } else { + break; + } + } + if (!literal.empty()) { + seq.push_back(std::make_pair(literal, true)); + } + } + } + return join_seq(); + }; + return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space"); + } + + std::string _resolve_ref(const std::string & ref) { + std::string ref_name = ref.substr(ref.find_last_of('/') + 1); + if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) { + _refs_being_resolved.insert(ref); + json resolved = _refs[ref]; + ref_name = visit(resolved, ref_name); + _refs_being_resolved.erase(ref); + } + return ref_name; + } + + std::string _build_object_rule( + const std::vector> & properties, + const std::unordered_set & required, + const std::string & name, + const json & additional_properties) + { + std::vector required_props; + std::vector optional_props; + std::unordered_map prop_kv_rule_names; + for (const auto & kv : properties) { + const auto &prop_name = kv.first; + const auto &prop_schema = kv.second; + + std::string prop_rule_name = visit(prop_schema, name + (name.empty() ? "" : "-") + prop_name); + prop_kv_rule_names[prop_name] = _add_rule( + name + (name.empty() ? "" : "-") + prop_name + "-kv", + format_literal(json(prop_name).dump()) + " space \":\" space " + prop_rule_name + ); + if (required.find(prop_name) != required.end()) { + required_props.push_back(prop_name); + } else { + optional_props.push_back(prop_name); + } + } + if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get())) { + std::string sub_name = name + (name.empty() ? "" : "-") + "additional"; + std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value"); + std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule); + prop_kv_rule_names["*"] = kv_rule; + optional_props.push_back("*"); + } + + std::string rule = "\"{\" space "; + for (size_t i = 0; i < required_props.size(); i++) { + if (i > 0) { + rule += " \",\" space "; + } + rule += prop_kv_rule_names[required_props[i]]; + } + + if (!optional_props.empty()) { + rule += " ("; + if (!required_props.empty()) { + rule += " \",\" space ( "; + } + + std::function &, bool)> get_recursive_refs = [&](const std::vector & ks, bool first_is_optional) { + std::string res; + if (ks.empty()) { + return res; + } + std::string k = ks[0]; + std::string kv_rule_name = prop_kv_rule_names[k]; + if (k == "*") { + res = _add_rule( + name + (name.empty() ? "" : "-") + "additional-kvs", + kv_rule_name + " ( \",\" space " + kv_rule_name + " )*" + ); + } else if (first_is_optional) { + res = "( \",\" space " + kv_rule_name + " )?"; + } else { + res = kv_rule_name; + } + if (ks.size() > 1) { + res += " " + _add_rule( + name + (name.empty() ? "" : "-") + k + "-rest", + get_recursive_refs(std::vector(ks.begin() + 1, ks.end()), true) + ); + } + return res; + }; + + for (size_t i = 0; i < optional_props.size(); i++) { + if (i > 0) { + rule += " | "; + } + rule += get_recursive_refs(std::vector(optional_props.begin() + i, optional_props.end()), false); + } + if (!required_props.empty()) { + rule += " )"; + } + rule += " )?"; + } + + rule += " \"}\" space"; + + return rule; + } + +public: + SchemaConverter( + const std::function & fetch_json, + bool dotall) + : _fetch_json(fetch_json), _dotall(dotall) + { + _rules["space"] = SPACE_RULE; + } + + void resolve_refs(json & schema, const std::string & url) { + /* + * Resolves all $ref fields in the given schema, fetching any remote schemas, + * replacing each $ref with absolute reference URL and populates _refs with the + * respective referenced (sub)schema dictionaries. + */ + std::function visit_refs = [&](json & n) { + if (n.is_array()) { + for (auto & x : n) { + visit_refs(x); + } + } else if (n.is_object()) { + if (n.contains("$ref")) { + std::string ref = n["$ref"]; + if (_refs.find(ref) == _refs.end()) { + json target; + if (ref.find("https://") == 0) { + std::string base_url = ref.substr(0, ref.find('#')); + auto it = _refs.find(base_url); + if (it != _refs.end()) { + target = it->second; + } else { + // Fetch the referenced schema and resolve its refs + auto referenced = _fetch_json(ref); + resolve_refs(referenced, base_url); + _refs[base_url] = referenced; + } + if (ref.find('#') == std::string::npos || ref.substr(ref.find('#') + 1).empty()) { + return; + } + } else if (ref.find("#/") == 0) { + target = schema; + n["$ref"] = url + ref; + ref = url + ref; + } else { + _errors.push_back("Unsupported ref: " + ref); + return; + } + std::string pointer = ref.substr(ref.find('#') + 1); + std::vector tokens = split(pointer, "/"); + for (size_t i = 1; i < tokens.size(); ++i) { + std::string sel = tokens[i]; + if (target.is_null() || !target.contains(sel)) { + _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump()); + return; + } + target = target[sel]; + } + _refs[ref] = target; + } + } else { + for (auto & kv : n.items()) { + visit_refs(kv.value()); + } + } + } + }; + + visit_refs(schema); + } + + std::string _generate_constant_rule(const json & value) { + return format_literal(value.dump()); + } + + std::string visit(const json & schema, const std::string & name) { + json schema_type = schema.contains("type") ? schema["type"] : json(); + std::string schema_format = schema.contains("format") ? schema["format"].get() : ""; + std::string rule_name = is_reserved_name(name) ? name + "-" : name.empty() ? "root" : name; + + if (schema.contains("$ref")) { + return _add_rule(rule_name, _resolve_ref(schema["$ref"])); + } else if (schema.contains("oneOf") || schema.contains("anyOf")) { + std::vector alt_schemas = schema.contains("oneOf") ? schema["oneOf"].get>() : schema["anyOf"].get>(); + return _add_rule(rule_name, _generate_union_rule(name, alt_schemas)); + } else if (schema_type.is_array()) { + std::vector schema_types; + for (const auto & t : schema_type) { + schema_types.push_back({{"type", t}}); + } + return _add_rule(rule_name, _generate_union_rule(name, schema_types)); + } else if (schema.contains("const")) { + return _add_rule(rule_name, _generate_constant_rule(schema["const"])); + } else if (schema.contains("enum")) { + std::vector enum_values; + for (const auto & v : schema["enum"]) { + enum_values.push_back(_generate_constant_rule(v)); + } + return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | ")); + } else if ((schema_type.is_null() || schema_type == "object") + && (schema.contains("properties") || + (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) { + std::unordered_set required; + if (schema.contains("required") && schema["required"].is_array()) { + for (const auto & item : schema["required"]) { + if (item.is_string()) { + required.insert(item.get()); + } + } + } + std::vector> properties; + if (schema.contains("properties")) { + for (const auto & prop : schema["properties"].items()) { + properties.emplace_back(prop.key(), prop.value()); + } + } + return _add_rule(rule_name, + _build_object_rule( + properties, required, name, + schema.contains("additionalProperties") ? schema["additionalProperties"] : json())); + } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) { + std::unordered_set required; + std::vector> properties; + std::string hybrid_name = name; + std::function add_component = [&](const json & comp_schema, bool is_required) { + if (comp_schema.contains("$ref")) { + add_component(_refs[comp_schema["$ref"]], is_required); + } else if (comp_schema.contains("properties")) { + for (const auto & prop : comp_schema["properties"].items()) { + properties.emplace_back(prop.key(), prop.value()); + if (is_required) { + required.insert(prop.key()); + } + } + } else { + // todo warning + } + }; + for (auto & t : schema["allOf"]) { + if (t.contains("anyOf")) { + for (auto & tt : t["anyOf"]) { + add_component(tt, false); + } + } else { + add_component(t, true); + } + } + return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json())); + } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) { + json items = schema.contains("items") ? schema["items"] : schema["prefixItems"]; + if (items.is_array()) { + std::string rule = "\"[\" space "; + for (size_t i = 0; i < items.size(); i++) { + if (i > 0) { + rule += " \",\" space "; + } + rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i)); + } + rule += " \"]\" space"; + return _add_rule(rule_name, rule); + } else { + std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item"); + std::string list_item_operator = "( \",\" space " + item_rule_name + " )"; + std::string successive_items; + int min_items = schema.contains("minItems") ? schema["minItems"].get() : 0; + json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json(); + int max_items = max_items_json.is_number_integer() ? max_items_json.get() : -1; + if (min_items > 0) { + successive_items += repeat(list_item_operator, min_items - 1); + min_items--; + } + if (max_items >= 0 && max_items > min_items) { + successive_items += repeat(list_item_operator + "?", max_items - min_items - 1); + } else { + successive_items += list_item_operator + "*"; + } + std::string rule; + if (min_items == 0) { + rule = "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space"; + } else { + rule = "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space"; + } + return _add_rule(rule_name, rule); + } + } else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) { + return _visit_pattern(schema["pattern"], rule_name); + } else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) { + return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid")); + } else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) { + for (const auto & kv : DATE_RULES) { + _add_rule(kv.first, kv.second); + } + return schema_format + "-string"; + } else if (schema.empty() || schema_type == "object") { + for (const auto & n : OBJECT_RULE_NAMES) { + _add_rule(n, PRIMITIVE_RULES.at(n)); + } + return _add_rule(rule_name, "object"); + } else { + if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get()) == PRIMITIVE_RULES.end()) { + _errors.push_back("Unrecognized schema: " + schema.dump()); + return ""; + } + // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero + return _add_rule(rule_name == "root" ? "root" : schema_type.get(), PRIMITIVE_RULES.at(schema_type.get())); + } + } + + void check_errors() { + if (!_errors.empty()) { + throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n")); + } + if (!_warnings.empty()) { + fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str()); + } + } + + std::string format_grammar() { + std::stringstream ss; + for (const auto & kv : _rules) { + ss << kv.first << " ::= " << kv.second << std::endl; + } + return ss.str(); + } +}; + +std::string json_schema_to_grammar(const json & schema) { + SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false); + auto copy = schema; + converter.resolve_refs(copy, "input"); + converter.visit(copy, ""); + converter.check_errors(); + return converter.format_grammar(); +} diff --git a/common/json-schema-to-grammar.h b/common/json-schema-to-grammar.h new file mode 100644 index 000000000..e1abed303 --- /dev/null +++ b/common/json-schema-to-grammar.h @@ -0,0 +1,4 @@ +#pragma once +#include "json.hpp" + +std::string json_schema_to_grammar(const nlohmann::ordered_json& schema); diff --git a/examples/server/json.hpp b/common/json.hpp similarity index 93% rename from examples/server/json.hpp rename to common/json.hpp index ea945f346..a858728c4 100644 --- a/examples/server/json.hpp +++ b/common/json.hpp @@ -1,9 +1,9 @@ // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT /****************************************************************************\ @@ -27,7 +27,6 @@ #endif // JSON_NO_IO #include // random_access_iterator_tag #include // unique_ptr -#include // accumulate #include // string, stoi, to_string #include // declval, forward, move, pair, swap #include // vector @@ -35,10 +34,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -48,10 +47,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -60,7 +59,7 @@ #ifndef JSON_SKIP_LIBRARY_VERSION_CHECK #if defined(NLOHMANN_JSON_VERSION_MAJOR) && defined(NLOHMANN_JSON_VERSION_MINOR) && defined(NLOHMANN_JSON_VERSION_PATCH) - #if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 2 + #if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 3 #warning "Already included a different version of the library!" #endif #endif @@ -68,7 +67,7 @@ #define NLOHMANN_JSON_VERSION_MAJOR 3 // NOLINT(modernize-macro-to-enum) #define NLOHMANN_JSON_VERSION_MINOR 11 // NOLINT(modernize-macro-to-enum) -#define NLOHMANN_JSON_VERSION_PATCH 2 // NOLINT(modernize-macro-to-enum) +#define NLOHMANN_JSON_VERSION_PATCH 3 // NOLINT(modernize-macro-to-enum) #ifndef JSON_DIAGNOSTICS #define JSON_DIAGNOSTICS 0 @@ -150,10 +149,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -173,16 +172,19 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT #include // nullptr_t #include // exception +#if JSON_DIAGNOSTICS + #include // accumulate +#endif #include // runtime_error #include // to_string #include // vector @@ -190,10 +192,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -206,10 +208,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -218,10 +220,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -231,10 +233,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -318,10 +320,10 @@ NLOHMANN_JSON_NAMESPACE_END // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-FileCopyrightText: 2016-2021 Evan Nemerson // SPDX-License-Identifier: MIT @@ -2483,6 +2485,14 @@ JSON_HEDLEY_DIAGNOSTIC_POP #endif #endif +#ifndef JSON_HAS_STATIC_RTTI + #if !defined(_HAS_STATIC_RTTI) || _HAS_STATIC_RTTI != 0 + #define JSON_HAS_STATIC_RTTI 1 + #else + #define JSON_HAS_STATIC_RTTI 0 + #endif +#endif + #ifdef JSON_HAS_CPP_17 #define JSON_INLINE_VARIABLE inline #else @@ -2590,12 +2600,13 @@ JSON_HEDLEY_DIAGNOSTIC_POP class NumberUnsignedType, class NumberFloatType, \ template class AllocatorType, \ template class JSONSerializer, \ - class BinaryType> + class BinaryType, \ + class CustomBaseClass> #define NLOHMANN_BASIC_JSON_TPL \ basic_json + AllocatorType, JSONSerializer, BinaryType, CustomBaseClass> // Macros to simplify conversion from/to types @@ -2745,7 +2756,10 @@ JSON_HEDLEY_DIAGNOSTIC_POP #define NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(Type, ...) \ friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ - friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { Type nlohmann_json_default_obj; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } + friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } + +#define NLOHMANN_DEFINE_TYPE_INTRUSIVE_ONLY_SERIALIZE(Type, ...) \ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } /*! @brief macro @@ -2756,10 +2770,12 @@ JSON_HEDLEY_DIAGNOSTIC_POP inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } +#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_ONLY_SERIALIZE(Type, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } + #define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_WITH_DEFAULT(Type, ...) \ inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ - inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { Type nlohmann_json_default_obj; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } - + inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } // inspired from https://stackoverflow.com/a/26745591 // allows to call any std function as if (e.g. with begin): @@ -2923,10 +2939,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -2998,10 +3014,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -3040,10 +3056,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-FileCopyrightText: 2018 The Abseil Authors // SPDX-License-Identifier: MIT @@ -3214,10 +3230,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -3226,14 +3242,15 @@ NLOHMANN_JSON_NAMESPACE_END #include // false_type, is_constructible, is_integral, is_same, true_type #include // declval #include // tuple +#include // char_traits // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -3298,10 +3315,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -3318,10 +3335,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -3342,10 +3359,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT #ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_ @@ -3389,7 +3406,8 @@ NLOHMANN_JSON_NAMESPACE_END template class AllocatorType = std::allocator, template class JSONSerializer = adl_serializer, - class BinaryType = std::vector> + class BinaryType = std::vector, // cppcheck-suppress syntaxError + class CustomBaseClass = void> class basic_json; /// @brief JSON Pointer defines a string syntax for identifying a specific value within a JSON document @@ -3577,6 +3595,63 @@ struct actual_object_comparator template using actual_object_comparator_t = typename actual_object_comparator::type; +///////////////// +// char_traits // +///////////////// + +// Primary template of char_traits calls std char_traits +template +struct char_traits : std::char_traits +{}; + +// Explicitly define char traits for unsigned char since it is not standard +template<> +struct char_traits : std::char_traits +{ + using char_type = unsigned char; + using int_type = uint64_t; + + // Redefine to_int_type function + static int_type to_int_type(char_type c) noexcept + { + return static_cast(c); + } + + static char_type to_char_type(int_type i) noexcept + { + return static_cast(i); + } + + static constexpr int_type eof() noexcept + { + return static_cast(EOF); + } +}; + +// Explicitly define char traits for signed char since it is not standard +template<> +struct char_traits : std::char_traits +{ + using char_type = signed char; + using int_type = uint64_t; + + // Redefine to_int_type function + static int_type to_int_type(char_type c) noexcept + { + return static_cast(c); + } + + static char_type to_char_type(int_type i) noexcept + { + return static_cast(i); + } + + static constexpr int_type eof() noexcept + { + return static_cast(EOF); + } +}; + /////////////////// // is_ functions // /////////////////// @@ -3613,7 +3688,6 @@ template struct is_default_constructible> : conjunction...> {}; - template struct is_constructible : std::is_constructible {}; @@ -3629,7 +3703,6 @@ struct is_constructible> : is_default_constructible struct is_constructible> : is_default_constructible> {}; - template struct is_iterator_traits : std::false_type {}; @@ -4039,7 +4112,6 @@ struct value_in_range_of_impl2 } }; - template struct value_in_range_of_impl2 { @@ -4138,10 +4210,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -4165,28 +4237,28 @@ inline std::size_t concat_length() } template -inline std::size_t concat_length(const char* cstr, Args&& ... rest); +inline std::size_t concat_length(const char* cstr, const Args& ... rest); template -inline std::size_t concat_length(const StringType& str, Args&& ... rest); +inline std::size_t concat_length(const StringType& str, const Args& ... rest); template -inline std::size_t concat_length(const char /*c*/, Args&& ... rest) +inline std::size_t concat_length(const char /*c*/, const Args& ... rest) { - return 1 + concat_length(std::forward(rest)...); + return 1 + concat_length(rest...); } template -inline std::size_t concat_length(const char* cstr, Args&& ... rest) +inline std::size_t concat_length(const char* cstr, const Args& ... rest) { // cppcheck-suppress ignoredReturnValue - return ::strlen(cstr) + concat_length(std::forward(rest)...); + return ::strlen(cstr) + concat_length(rest...); } template -inline std::size_t concat_length(const StringType& str, Args&& ... rest) +inline std::size_t concat_length(const StringType& str, const Args& ... rest) { - return str.size() + concat_length(std::forward(rest)...); + return str.size() + concat_length(rest...); } template @@ -4277,7 +4349,7 @@ template inline OutStringType concat(Args && ... args) { OutStringType str; - str.reserve(concat_length(std::forward(args)...)); + str.reserve(concat_length(args...)); concat_into(str, std::forward(args)...); return str; } @@ -4286,7 +4358,6 @@ inline OutStringType concat(Args && ... args) NLOHMANN_JSON_NAMESPACE_END - NLOHMANN_JSON_NAMESPACE_BEGIN namespace detail { @@ -4334,9 +4405,9 @@ class exception : public std::exception { case value_t::array: { - for (std::size_t i = 0; i < current->m_parent->m_value.array->size(); ++i) + for (std::size_t i = 0; i < current->m_parent->m_data.m_value.array->size(); ++i) { - if (¤t->m_parent->m_value.array->operator[](i) == current) + if (¤t->m_parent->m_data.m_value.array->operator[](i) == current) { tokens.emplace_back(std::to_string(i)); break; @@ -4347,7 +4418,7 @@ class exception : public std::exception case value_t::object: { - for (const auto& element : *current->m_parent->m_value.object) + for (const auto& element : *current->m_parent->m_data.m_value.object) { if (&element.second == current) { @@ -4410,17 +4481,17 @@ class parse_error : public exception template::value, int> = 0> static parse_error create(int id_, const position_t& pos, const std::string& what_arg, BasicJsonContext context) { - std::string w = concat(exception::name("parse_error", id_), "parse error", - position_string(pos), ": ", exception::diagnostics(context), what_arg); + const std::string w = concat(exception::name("parse_error", id_), "parse error", + position_string(pos), ": ", exception::diagnostics(context), what_arg); return {id_, pos.chars_read_total, w.c_str()}; } template::value, int> = 0> static parse_error create(int id_, std::size_t byte_, const std::string& what_arg, BasicJsonContext context) { - std::string w = concat(exception::name("parse_error", id_), "parse error", - (byte_ != 0 ? (concat(" at byte ", std::to_string(byte_))) : ""), - ": ", exception::diagnostics(context), what_arg); + const std::string w = concat(exception::name("parse_error", id_), "parse error", + (byte_ != 0 ? (concat(" at byte ", std::to_string(byte_))) : ""), + ": ", exception::diagnostics(context), what_arg); return {id_, byte_, w.c_str()}; } @@ -4454,7 +4525,7 @@ class invalid_iterator : public exception template::value, int> = 0> static invalid_iterator create(int id_, const std::string& what_arg, BasicJsonContext context) { - std::string w = concat(exception::name("invalid_iterator", id_), exception::diagnostics(context), what_arg); + const std::string w = concat(exception::name("invalid_iterator", id_), exception::diagnostics(context), what_arg); return {id_, w.c_str()}; } @@ -4472,7 +4543,7 @@ class type_error : public exception template::value, int> = 0> static type_error create(int id_, const std::string& what_arg, BasicJsonContext context) { - std::string w = concat(exception::name("type_error", id_), exception::diagnostics(context), what_arg); + const std::string w = concat(exception::name("type_error", id_), exception::diagnostics(context), what_arg); return {id_, w.c_str()}; } @@ -4489,7 +4560,7 @@ class out_of_range : public exception template::value, int> = 0> static out_of_range create(int id_, const std::string& what_arg, BasicJsonContext context) { - std::string w = concat(exception::name("out_of_range", id_), exception::diagnostics(context), what_arg); + const std::string w = concat(exception::name("out_of_range", id_), exception::diagnostics(context), what_arg); return {id_, w.c_str()}; } @@ -4506,7 +4577,7 @@ class other_error : public exception template::value, int> = 0> static other_error create(int id_, const std::string& what_arg, BasicJsonContext context) { - std::string w = concat(exception::name("other_error", id_), exception::diagnostics(context), what_arg); + const std::string w = concat(exception::name("other_error", id_), exception::diagnostics(context), what_arg); return {id_, w.c_str()}; } @@ -4525,10 +4596,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -4549,10 +4620,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -5055,10 +5126,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -5075,10 +5146,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -5147,10 +5218,10 @@ template class iteration_proxy_value // older GCCs are a bit fussy and require explicit noexcept specifiers on defaulted functions iteration_proxy_value(iteration_proxy_value&&) noexcept(std::is_nothrow_move_constructible::value - && std::is_nothrow_move_constructible::value) = default; + && std::is_nothrow_move_constructible::value) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor,cppcoreguidelines-noexcept-move-operations) iteration_proxy_value& operator=(iteration_proxy_value&&) noexcept(std::is_nothrow_move_assignable::value - && std::is_nothrow_move_assignable::value) = default; + && std::is_nothrow_move_assignable::value) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor,cppcoreguidelines-noexcept-move-operations) ~iteration_proxy_value() = default; /// dereference operator (needed for range-based for) @@ -5297,11 +5368,11 @@ namespace std #pragma clang diagnostic ignored "-Wmismatched-tags" #endif template -class tuple_size<::nlohmann::detail::iteration_proxy_value> +class tuple_size<::nlohmann::detail::iteration_proxy_value> // NOLINT(cert-dcl58-cpp) : public std::integral_constant {}; template -class tuple_element> +class tuple_element> // NOLINT(cert-dcl58-cpp) { public: using type = decltype( @@ -5340,7 +5411,7 @@ namespace detail /* * Note all external_constructor<>::construct functions need to call - * j.m_value.destroy(j.m_type) to avoid a memory leak in case j contains an + * j.m_data.m_value.destroy(j.m_data.m_type) to avoid a memory leak in case j contains an * allocated value (e.g., a string). See bug issue * https://github.com/nlohmann/json/issues/2865 for more information. */ @@ -5353,9 +5424,9 @@ struct external_constructor template static void construct(BasicJsonType& j, typename BasicJsonType::boolean_t b) noexcept { - j.m_value.destroy(j.m_type); - j.m_type = value_t::boolean; - j.m_value = b; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::boolean; + j.m_data.m_value = b; j.assert_invariant(); } }; @@ -5366,18 +5437,18 @@ struct external_constructor template static void construct(BasicJsonType& j, const typename BasicJsonType::string_t& s) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::string; - j.m_value = s; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::string; + j.m_data.m_value = s; j.assert_invariant(); } template static void construct(BasicJsonType& j, typename BasicJsonType::string_t&& s) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::string; - j.m_value = std::move(s); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::string; + j.m_data.m_value = std::move(s); j.assert_invariant(); } @@ -5386,9 +5457,9 @@ struct external_constructor int > = 0 > static void construct(BasicJsonType& j, const CompatibleStringType& str) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::string; - j.m_value.string = j.template create(str); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::string; + j.m_data.m_value.string = j.template create(str); j.assert_invariant(); } }; @@ -5399,18 +5470,18 @@ struct external_constructor template static void construct(BasicJsonType& j, const typename BasicJsonType::binary_t& b) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::binary; - j.m_value = typename BasicJsonType::binary_t(b); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::binary; + j.m_data.m_value = typename BasicJsonType::binary_t(b); j.assert_invariant(); } template static void construct(BasicJsonType& j, typename BasicJsonType::binary_t&& b) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::binary; - j.m_value = typename BasicJsonType::binary_t(std::move(b)); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::binary; + j.m_data.m_value = typename BasicJsonType::binary_t(std::move(b)); j.assert_invariant(); } }; @@ -5421,9 +5492,9 @@ struct external_constructor template static void construct(BasicJsonType& j, typename BasicJsonType::number_float_t val) noexcept { - j.m_value.destroy(j.m_type); - j.m_type = value_t::number_float; - j.m_value = val; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::number_float; + j.m_data.m_value = val; j.assert_invariant(); } }; @@ -5434,9 +5505,9 @@ struct external_constructor template static void construct(BasicJsonType& j, typename BasicJsonType::number_unsigned_t val) noexcept { - j.m_value.destroy(j.m_type); - j.m_type = value_t::number_unsigned; - j.m_value = val; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::number_unsigned; + j.m_data.m_value = val; j.assert_invariant(); } }; @@ -5447,9 +5518,9 @@ struct external_constructor template static void construct(BasicJsonType& j, typename BasicJsonType::number_integer_t val) noexcept { - j.m_value.destroy(j.m_type); - j.m_type = value_t::number_integer; - j.m_value = val; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::number_integer; + j.m_data.m_value = val; j.assert_invariant(); } }; @@ -5460,9 +5531,9 @@ struct external_constructor template static void construct(BasicJsonType& j, const typename BasicJsonType::array_t& arr) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::array; - j.m_value = arr; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::array; + j.m_data.m_value = arr; j.set_parents(); j.assert_invariant(); } @@ -5470,9 +5541,9 @@ struct external_constructor template static void construct(BasicJsonType& j, typename BasicJsonType::array_t&& arr) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::array; - j.m_value = std::move(arr); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::array; + j.m_data.m_value = std::move(arr); j.set_parents(); j.assert_invariant(); } @@ -5485,9 +5556,9 @@ struct external_constructor using std::begin; using std::end; - j.m_value.destroy(j.m_type); - j.m_type = value_t::array; - j.m_value.array = j.template create(begin(arr), end(arr)); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::array; + j.m_data.m_value.array = j.template create(begin(arr), end(arr)); j.set_parents(); j.assert_invariant(); } @@ -5495,14 +5566,14 @@ struct external_constructor template static void construct(BasicJsonType& j, const std::vector& arr) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::array; - j.m_value = value_t::array; - j.m_value.array->reserve(arr.size()); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::array; + j.m_data.m_value = value_t::array; + j.m_data.m_value.array->reserve(arr.size()); for (const bool x : arr) { - j.m_value.array->push_back(x); - j.set_parent(j.m_value.array->back()); + j.m_data.m_value.array->push_back(x); + j.set_parent(j.m_data.m_value.array->back()); } j.assert_invariant(); } @@ -5511,13 +5582,13 @@ struct external_constructor enable_if_t::value, int> = 0> static void construct(BasicJsonType& j, const std::valarray& arr) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::array; - j.m_value = value_t::array; - j.m_value.array->resize(arr.size()); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::array; + j.m_data.m_value = value_t::array; + j.m_data.m_value.array->resize(arr.size()); if (arr.size() > 0) { - std::copy(std::begin(arr), std::end(arr), j.m_value.array->begin()); + std::copy(std::begin(arr), std::end(arr), j.m_data.m_value.array->begin()); } j.set_parents(); j.assert_invariant(); @@ -5530,9 +5601,9 @@ struct external_constructor template static void construct(BasicJsonType& j, const typename BasicJsonType::object_t& obj) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::object; - j.m_value = obj; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::object; + j.m_data.m_value = obj; j.set_parents(); j.assert_invariant(); } @@ -5540,9 +5611,9 @@ struct external_constructor template static void construct(BasicJsonType& j, typename BasicJsonType::object_t&& obj) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::object; - j.m_value = std::move(obj); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::object; + j.m_data.m_value = std::move(obj); j.set_parents(); j.assert_invariant(); } @@ -5554,9 +5625,9 @@ struct external_constructor using std::begin; using std::end; - j.m_value.destroy(j.m_type); - j.m_type = value_t::object; - j.m_value.object = j.template create(begin(obj), end(obj)); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::object; + j.m_data.m_value.object = j.template create(begin(obj), end(obj)); j.set_parents(); j.assert_invariant(); } @@ -5626,7 +5697,8 @@ template::type; - external_constructor::construct(j, static_cast(e)); + static constexpr value_t integral_value_t = std::is_unsigned::value ? value_t::number_unsigned : value_t::number_integer; + external_constructor::construct(j, static_cast(e)); } #endif // JSON_DISABLE_ENUM_SERIALIZATION @@ -5796,10 +5868,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -5908,10 +5980,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -6041,10 +6113,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -6067,10 +6139,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -6094,6 +6166,8 @@ NLOHMANN_JSON_NAMESPACE_END // #include +// #include + NLOHMANN_JSON_NAMESPACE_BEGIN namespace detail @@ -6140,7 +6214,6 @@ class file_input_adapter std::FILE* m_file; }; - /*! Input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at beginning of input. Does not support changing the underlying std::streambuf @@ -6214,16 +6287,16 @@ class iterator_input_adapter : current(std::move(first)), end(std::move(last)) {} - typename std::char_traits::int_type get_character() + typename char_traits::int_type get_character() { if (JSON_HEDLEY_LIKELY(current != end)) { - auto result = std::char_traits::to_int_type(*current); + auto result = char_traits::to_int_type(*current); std::advance(current, 1); return result; } - return std::char_traits::eof(); + return char_traits::eof(); } private: @@ -6239,7 +6312,6 @@ class iterator_input_adapter } }; - template struct wide_string_input_helper; @@ -6363,7 +6435,7 @@ struct wide_string_input_helper } }; -// Wraps another input apdater to convert wide character types into individual bytes. +// Wraps another input adapter to convert wide character types into individual bytes. template class wide_string_input_adapter { @@ -6408,7 +6480,6 @@ class wide_string_input_adapter std::size_t utf8_bytes_filled = 0; }; - template struct iterator_input_adapter_factory { @@ -6565,10 +6636,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -6710,7 +6781,6 @@ struct json_sax virtual ~json_sax() = default; }; - namespace detail { /*! @@ -6812,7 +6882,7 @@ class json_sax_dom_parser JSON_ASSERT(ref_stack.back()->is_object()); // add null at given key and store the reference for later - object_element = &(ref_stack.back()->m_value.object->operator[](val)); + object_element = &(ref_stack.back()->m_data.m_value.object->operator[](val)); return true; } @@ -6887,8 +6957,8 @@ class json_sax_dom_parser if (ref_stack.back()->is_array()) { - ref_stack.back()->m_value.array->emplace_back(std::forward(v)); - return &(ref_stack.back()->m_value.array->back()); + ref_stack.back()->m_data.m_value.array->emplace_back(std::forward(v)); + return &(ref_stack.back()->m_data.m_value.array->back()); } JSON_ASSERT(ref_stack.back()->is_object()); @@ -7007,7 +7077,7 @@ class json_sax_dom_callback_parser // add discarded value at given key and store the reference for later if (keep && ref_stack.back()) { - object_element = &(ref_stack.back()->m_value.object->operator[](val) = discarded); + object_element = &(ref_stack.back()->m_data.m_value.object->operator[](val) = discarded); } return true; @@ -7092,7 +7162,7 @@ class json_sax_dom_callback_parser // remove discarded value if (!keep && !ref_stack.empty() && ref_stack.back()->is_array()) { - ref_stack.back()->m_value.array->pop_back(); + ref_stack.back()->m_data.m_value.array->pop_back(); } return true; @@ -7159,7 +7229,7 @@ class json_sax_dom_callback_parser if (ref_stack.empty()) { root = std::move(value); - return {true, &root}; + return {true, & root}; } // skip this value if we already decided to skip the parent @@ -7175,8 +7245,8 @@ class json_sax_dom_callback_parser // array if (ref_stack.back()->is_array()) { - ref_stack.back()->m_value.array->emplace_back(std::move(value)); - return {true, &(ref_stack.back()->m_value.array->back())}; + ref_stack.back()->m_data.m_value.array->emplace_back(std::move(value)); + return {true, & (ref_stack.back()->m_data.m_value.array->back())}; } // object @@ -7201,9 +7271,9 @@ class json_sax_dom_callback_parser /// stack to model hierarchy of values std::vector ref_stack {}; /// stack to manage which values to keep - std::vector keep_stack {}; + std::vector keep_stack {}; // NOLINT(readability-redundant-member-init) /// stack to manage which object keys to keep - std::vector key_keep_stack {}; + std::vector key_keep_stack {}; // NOLINT(readability-redundant-member-init) /// helper to hold the reference for the next object element BasicJsonType* object_element = nullptr; /// whether a syntax error occurred @@ -7298,10 +7368,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -7322,6 +7392,8 @@ NLOHMANN_JSON_NAMESPACE_END // #include +// #include + NLOHMANN_JSON_NAMESPACE_BEGIN namespace detail @@ -7416,7 +7488,7 @@ class lexer : public lexer_base using number_float_t = typename BasicJsonType::number_float_t; using string_t = typename BasicJsonType::string_t; using char_type = typename InputAdapterType::char_type; - using char_int_type = typename std::char_traits::int_type; + using char_int_type = typename char_traits::int_type; public: using token_type = typename lexer_base::token_type; @@ -7523,7 +7595,7 @@ class lexer : public lexer_base for (auto range = ranges.begin(); range != ranges.end(); ++range) { get(); - if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) + if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) // NOLINT(bugprone-inc-dec-in-conditions) { add(current); } @@ -7566,7 +7638,7 @@ class lexer : public lexer_base switch (get()) { // end of file while parsing string - case std::char_traits::eof(): + case char_traits::eof(): { error_message = "invalid string: missing closing quote"; return token_type::parse_error; @@ -8155,7 +8227,7 @@ class lexer : public lexer_base { case '\n': case '\r': - case std::char_traits::eof(): + case char_traits::eof(): case '\0': return true; @@ -8172,7 +8244,7 @@ class lexer : public lexer_base { switch (get()) { - case std::char_traits::eof(): + case char_traits::eof(): case '\0': { error_message = "invalid comment; missing closing '*/'"; @@ -8601,10 +8673,10 @@ scan_number_done: token_type scan_literal(const char_type* literal_text, const std::size_t length, token_type return_type) { - JSON_ASSERT(std::char_traits::to_char_type(current) == literal_text[0]); + JSON_ASSERT(char_traits::to_char_type(current) == literal_text[0]); for (std::size_t i = 1; i < length; ++i) { - if (JSON_HEDLEY_UNLIKELY(std::char_traits::to_char_type(get()) != literal_text[i])) + if (JSON_HEDLEY_UNLIKELY(char_traits::to_char_type(get()) != literal_text[i])) { error_message = "invalid literal"; return token_type::parse_error; @@ -8622,7 +8694,7 @@ scan_number_done: { token_buffer.clear(); token_string.clear(); - token_string.push_back(std::char_traits::to_char_type(current)); + token_string.push_back(char_traits::to_char_type(current)); } /* @@ -8630,7 +8702,7 @@ scan_number_done: This function provides the interface to the used input adapter. It does not throw in case the input reached EOF, but returns a - `std::char_traits::eof()` in that case. Stores the scanned characters + `char_traits::eof()` in that case. Stores the scanned characters for use in error messages. @return character read from the input @@ -8650,9 +8722,9 @@ scan_number_done: current = ia.get_character(); } - if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + if (JSON_HEDLEY_LIKELY(current != char_traits::eof())) { - token_string.push_back(std::char_traits::to_char_type(current)); + token_string.push_back(char_traits::to_char_type(current)); } if (current == '\n') @@ -8691,7 +8763,7 @@ scan_number_done: --position.chars_read_current_line; } - if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + if (JSON_HEDLEY_LIKELY(current != char_traits::eof())) { JSON_ASSERT(!token_string.empty()); token_string.pop_back(); @@ -8885,7 +8957,7 @@ scan_number_done: // end of input (the null byte is needed when parsing from // string literals) case '\0': - case std::char_traits::eof(): + case char_traits::eof(): return token_type::end_of_input; // error @@ -8903,7 +8975,7 @@ scan_number_done: const bool ignore_comments = false; /// the current character - char_int_type current = std::char_traits::eof(); + char_int_type current = char_traits::eof(); /// whether the next get() call should just return current bool next_unget = false; @@ -8937,10 +9009,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -9129,7 +9201,6 @@ static inline bool little_endianness(int num = 1) noexcept return *reinterpret_cast(&num) == 1; } - /////////////////// // binary reader // /////////////////// @@ -9147,7 +9218,7 @@ class binary_reader using binary_t = typename BasicJsonType::binary_t; using json_sax_t = SAX; using char_type = typename InputAdapterType::char_type; - using char_int_type = typename std::char_traits::int_type; + using char_int_type = typename char_traits::int_type; public: /*! @@ -9220,7 +9291,7 @@ class binary_reader get(); } - if (JSON_HEDLEY_UNLIKELY(current != std::char_traits::eof())) + if (JSON_HEDLEY_UNLIKELY(current != char_traits::eof())) { return sax->parse_error(chars_read, get_token_string(), parse_error::create(110, chars_read, exception_message(input_format, concat("expected end of input; last byte: 0x", get_token_string()), "value"), nullptr)); @@ -9303,7 +9374,7 @@ class binary_reader exception_message(input_format_t::bson, concat("string length must be at least 1, is ", std::to_string(len)), "string"), nullptr)); } - return get_string(input_format_t::bson, len - static_cast(1), result) && get() != std::char_traits::eof(); + return get_string(input_format_t::bson, len - static_cast(1), result) && get() != char_traits::eof(); } /*! @@ -9404,7 +9475,7 @@ class binary_reader { std::array cr{{}}; static_cast((std::snprintf)(cr.data(), cr.size(), "%.2hhX", static_cast(element_type))); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg) - std::string cr_str{cr.data()}; + const std::string cr_str{cr.data()}; return sax->parse_error(element_type_parse_position, cr_str, parse_error::create(114, element_type_parse_position, concat("Unsupported BSON record type 0x", cr_str), nullptr)); } @@ -9497,7 +9568,7 @@ class binary_reader switch (get_char ? get() : current) { // EOF - case std::char_traits::eof(): + case char_traits::eof(): return unexpect_eof(input_format_t::cbor, "value"); // Integer 0x00..0x17 (0..23) @@ -10272,7 +10343,7 @@ class binary_reader switch (get()) { // EOF - case std::char_traits::eof(): + case char_traits::eof(): return unexpect_eof(input_format_t::msgpack, "value"); // positive fixint @@ -11339,7 +11410,7 @@ class binary_reader exception_message(input_format, concat("expected '#' after type information; last byte: 0x", last_token), "size"), nullptr)); } - bool is_error = get_ubjson_size_value(result.first, is_ndarray); + const bool is_error = get_ubjson_size_value(result.first, is_ndarray); if (input_format == input_format_t::bjdata && is_ndarray) { if (inside_ndarray) @@ -11354,7 +11425,7 @@ class binary_reader if (current == '#') { - bool is_error = get_ubjson_size_value(result.first, is_ndarray); + const bool is_error = get_ubjson_size_value(result.first, is_ndarray); if (input_format == input_format_t::bjdata && is_ndarray) { return sax->parse_error(chars_read, get_token_string(), parse_error::create(112, chars_read, @@ -11374,7 +11445,7 @@ class binary_reader { switch (prefix) { - case std::char_traits::eof(): // EOF + case char_traits::eof(): // EOF return unexpect_eof(input_format, "value"); case 'T': // true @@ -11819,7 +11890,7 @@ class binary_reader This function provides the interface to the used input adapter. It does not throw in case the input reached EOF, but returns a -'ve valued - `std::char_traits::eof()` in that case. + `char_traits::eof()` in that case. @return character read from the input */ @@ -11961,7 +12032,7 @@ class binary_reader JSON_HEDLEY_NON_NULL(3) bool unexpect_eof(const input_format_t format, const char* context) const { - if (JSON_HEDLEY_UNLIKELY(current == std::char_traits::eof())) + if (JSON_HEDLEY_UNLIKELY(current == char_traits::eof())) { return sax->parse_error(chars_read, "", parse_error::create(110, chars_read, exception_message(format, "unexpected end of input", context), nullptr)); @@ -12028,7 +12099,7 @@ class binary_reader InputAdapterType ia; /// the current character - char_int_type current = std::char_traits::eof(); + char_int_type current = char_traits::eof(); /// the number of characters read std::size_t chars_read = 0; @@ -12090,10 +12161,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -12439,13 +12510,25 @@ class parser m_lexer.get_token_string(), parse_error::create(101, m_lexer.get_position(), exception_message(token_type::uninitialized, "value"), nullptr)); } + case token_type::end_of_input: + { + if (JSON_HEDLEY_UNLIKELY(m_lexer.get_position().chars_read_total == 1)) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + "attempting to parse an empty input; check that your input string or stream contains the expected JSON", nullptr)); + } + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), exception_message(token_type::literal_or_value, "value"), nullptr)); + } case token_type::uninitialized: case token_type::end_array: case token_type::end_object: case token_type::name_separator: case token_type::value_separator: - case token_type::end_of_input: case token_type::literal_or_value: default: // the last token was unexpected { @@ -12607,10 +12690,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -12620,10 +12703,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -12779,10 +12862,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -12887,7 +12970,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { @@ -12984,17 +13067,17 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { - m_it.object_iterator = m_object->m_value.object->begin(); + m_it.object_iterator = m_object->m_data.m_value.object->begin(); break; } case value_t::array: { - m_it.array_iterator = m_object->m_value.array->begin(); + m_it.array_iterator = m_object->m_data.m_value.array->begin(); break; } @@ -13028,17 +13111,17 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { - m_it.object_iterator = m_object->m_value.object->end(); + m_it.object_iterator = m_object->m_data.m_value.object->end(); break; } case value_t::array: { - m_it.array_iterator = m_object->m_value.array->end(); + m_it.array_iterator = m_object->m_data.m_value.array->end(); break; } @@ -13067,17 +13150,17 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { - JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end()); + JSON_ASSERT(m_it.object_iterator != m_object->m_data.m_value.object->end()); return m_it.object_iterator->second; } case value_t::array: { - JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end()); + JSON_ASSERT(m_it.array_iterator != m_object->m_data.m_value.array->end()); return *m_it.array_iterator; } @@ -13111,17 +13194,17 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { - JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end()); + JSON_ASSERT(m_it.object_iterator != m_object->m_data.m_value.object->end()); return &(m_it.object_iterator->second); } case value_t::array: { - JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end()); + JSON_ASSERT(m_it.array_iterator != m_object->m_data.m_value.array->end()); return &*m_it.array_iterator; } @@ -13164,7 +13247,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { @@ -13215,7 +13298,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { @@ -13262,7 +13345,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: return (m_it.object_iterator == other.m_it.object_iterator); @@ -13307,7 +13390,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: JSON_THROW(invalid_iterator::create(213, "cannot compare order of object iterators", m_object)); @@ -13363,7 +13446,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", m_object)); @@ -13442,7 +13525,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", m_object)); @@ -13471,7 +13554,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: JSON_THROW(invalid_iterator::create(208, "cannot use operator[] for object iterators", m_object)); @@ -13541,10 +13624,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -13673,13 +13756,55 @@ NLOHMANN_JSON_NAMESPACE_END // #include +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // conditional, is_same + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +/*! +@brief Default base class of the @ref basic_json class. + +So that the correct implementations of the copy / move ctors / assign operators +of @ref basic_json do not require complex case distinctions +(no base class / custom base class used as customization point), +@ref basic_json always has a base class. +By default, this class is used because it is empty and thus has no effect +on the behavior of @ref basic_json. +*/ +struct json_default_base {}; + +template +using json_base_class = typename std::conditional < + std::is_same::value, + json_default_base, + T + >::type; + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -13911,7 +14036,7 @@ class json_pointer const char* p = s.c_str(); char* p_end = nullptr; errno = 0; // strtoull doesn't reset errno - unsigned long long res = std::strtoull(p, &p_end, 10); // NOLINT(runtime/int) + const unsigned long long res = std::strtoull(p, &p_end, 10); // NOLINT(runtime/int) if (p == p_end // invalid input or empty string || errno == ERANGE // out of range || JSON_HEDLEY_UNLIKELY(static_cast(p_end - p) != s.size())) // incomplete read @@ -14067,7 +14192,7 @@ class json_pointer if (reference_token == "-") { // explicitly treat "-" as index beyond the end - ptr = &ptr->operator[](ptr->m_value.array->size()); + ptr = &ptr->operator[](ptr->m_data.m_value.array->size()); } else { @@ -14119,7 +14244,7 @@ class json_pointer { // "-" always fails the range check JSON_THROW(detail::out_of_range::create(402, detail::concat( - "array index '-' (", std::to_string(ptr->m_value.array->size()), + "array index '-' (", std::to_string(ptr->m_data.m_value.array->size()), ") is out of range"), ptr)); } @@ -14176,7 +14301,7 @@ class json_pointer if (JSON_HEDLEY_UNLIKELY(reference_token == "-")) { // "-" cannot be used for const access - JSON_THROW(detail::out_of_range::create(402, detail::concat("array index '-' (", std::to_string(ptr->m_value.array->size()), ") is out of range"), ptr)); + JSON_THROW(detail::out_of_range::create(402, detail::concat("array index '-' (", std::to_string(ptr->m_data.m_value.array->size()), ") is out of range"), ptr)); } // use unchecked array access @@ -14226,7 +14351,7 @@ class json_pointer { // "-" always fails the range check JSON_THROW(detail::out_of_range::create(402, detail::concat( - "array index '-' (", std::to_string(ptr->m_value.array->size()), + "array index '-' (", std::to_string(ptr->m_data.m_value.array->size()), ") is out of range"), ptr)); } @@ -14421,7 +14546,7 @@ class json_pointer { case detail::value_t::array: { - if (value.m_value.array->empty()) + if (value.m_data.m_value.array->empty()) { // flatten empty array as null result[reference_string] = nullptr; @@ -14429,10 +14554,10 @@ class json_pointer else { // iterate array and use index as reference string - for (std::size_t i = 0; i < value.m_value.array->size(); ++i) + for (std::size_t i = 0; i < value.m_data.m_value.array->size(); ++i) { flatten(detail::concat(reference_string, '/', std::to_string(i)), - value.m_value.array->operator[](i), result); + value.m_data.m_value.array->operator[](i), result); } } break; @@ -14440,7 +14565,7 @@ class json_pointer case detail::value_t::object: { - if (value.m_value.object->empty()) + if (value.m_data.m_value.object->empty()) { // flatten empty object as null result[reference_string] = nullptr; @@ -14448,7 +14573,7 @@ class json_pointer else { // iterate object and use keys as reference string - for (const auto& element : *value.m_value.object) + for (const auto& element : *value.m_data.m_value.object) { flatten(detail::concat(reference_string, '/', detail::escape(element.first)), element.second, result); } @@ -14495,7 +14620,7 @@ class json_pointer BasicJsonType result; // iterate the JSON object values - for (const auto& element : *value.m_value.object) + for (const auto& element : *value.m_data.m_value.object) { if (JSON_HEDLEY_UNLIKELY(!element.second.is_primitive())) { @@ -14671,10 +14796,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -14763,10 +14888,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -14789,10 +14914,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -14978,7 +15103,7 @@ class binary_writer { case value_t::object: { - write_bson_object(*j.m_value.object); + write_bson_object(*j.m_data.m_value.object); break; } @@ -15013,7 +15138,7 @@ class binary_writer case value_t::boolean: { - oa->write_character(j.m_value.boolean + oa->write_character(j.m_data.m_value.boolean ? to_char_type(0xF5) : to_char_type(0xF4)); break; @@ -15021,42 +15146,42 @@ class binary_writer case value_t::number_integer: { - if (j.m_value.number_integer >= 0) + if (j.m_data.m_value.number_integer >= 0) { // CBOR does not differentiate between positive signed // integers and unsigned integers. Therefore, we used the // code from the value_t::number_unsigned case here. - if (j.m_value.number_integer <= 0x17) + if (j.m_data.m_value.number_integer <= 0x17) { - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { oa->write_character(to_char_type(0x18)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { oa->write_character(to_char_type(0x19)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { oa->write_character(to_char_type(0x1A)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } else { oa->write_character(to_char_type(0x1B)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } } else { // The conversions below encode the sign in the first // byte, and the value is converted to a positive number. - const auto positive_number = -1 - j.m_value.number_integer; - if (j.m_value.number_integer >= -24) + const auto positive_number = -1 - j.m_data.m_value.number_integer; + if (j.m_data.m_value.number_integer >= -24) { write_number(static_cast(0x20 + positive_number)); } @@ -15086,52 +15211,52 @@ class binary_writer case value_t::number_unsigned: { - if (j.m_value.number_unsigned <= 0x17) + if (j.m_data.m_value.number_unsigned <= 0x17) { - write_number(static_cast(j.m_value.number_unsigned)); + write_number(static_cast(j.m_data.m_value.number_unsigned)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { oa->write_character(to_char_type(0x18)); - write_number(static_cast(j.m_value.number_unsigned)); + write_number(static_cast(j.m_data.m_value.number_unsigned)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { oa->write_character(to_char_type(0x19)); - write_number(static_cast(j.m_value.number_unsigned)); + write_number(static_cast(j.m_data.m_value.number_unsigned)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { oa->write_character(to_char_type(0x1A)); - write_number(static_cast(j.m_value.number_unsigned)); + write_number(static_cast(j.m_data.m_value.number_unsigned)); } else { oa->write_character(to_char_type(0x1B)); - write_number(static_cast(j.m_value.number_unsigned)); + write_number(static_cast(j.m_data.m_value.number_unsigned)); } break; } case value_t::number_float: { - if (std::isnan(j.m_value.number_float)) + if (std::isnan(j.m_data.m_value.number_float)) { // NaN is 0xf97e00 in CBOR oa->write_character(to_char_type(0xF9)); oa->write_character(to_char_type(0x7E)); oa->write_character(to_char_type(0x00)); } - else if (std::isinf(j.m_value.number_float)) + else if (std::isinf(j.m_data.m_value.number_float)) { // Infinity is 0xf97c00, -Infinity is 0xf9fc00 oa->write_character(to_char_type(0xf9)); - oa->write_character(j.m_value.number_float > 0 ? to_char_type(0x7C) : to_char_type(0xFC)); + oa->write_character(j.m_data.m_value.number_float > 0 ? to_char_type(0x7C) : to_char_type(0xFC)); oa->write_character(to_char_type(0x00)); } else { - write_compact_float(j.m_value.number_float, detail::input_format_t::cbor); + write_compact_float(j.m_data.m_value.number_float, detail::input_format_t::cbor); } break; } @@ -15139,7 +15264,7 @@ class binary_writer case value_t::string: { // step 1: write control byte and the string length - const auto N = j.m_value.string->size(); + const auto N = j.m_data.m_value.string->size(); if (N <= 0x17) { write_number(static_cast(0x60 + N)); @@ -15169,15 +15294,15 @@ class binary_writer // step 2: write the string oa->write_characters( - reinterpret_cast(j.m_value.string->c_str()), - j.m_value.string->size()); + reinterpret_cast(j.m_data.m_value.string->c_str()), + j.m_data.m_value.string->size()); break; } case value_t::array: { // step 1: write control byte and the array size - const auto N = j.m_value.array->size(); + const auto N = j.m_data.m_value.array->size(); if (N <= 0x17) { write_number(static_cast(0x80 + N)); @@ -15206,7 +15331,7 @@ class binary_writer // LCOV_EXCL_STOP // step 2: write each element - for (const auto& el : *j.m_value.array) + for (const auto& el : *j.m_data.m_value.array) { write_cbor(el); } @@ -15215,32 +15340,32 @@ class binary_writer case value_t::binary: { - if (j.m_value.binary->has_subtype()) + if (j.m_data.m_value.binary->has_subtype()) { - if (j.m_value.binary->subtype() <= (std::numeric_limits::max)()) + if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits::max)()) { write_number(static_cast(0xd8)); - write_number(static_cast(j.m_value.binary->subtype())); + write_number(static_cast(j.m_data.m_value.binary->subtype())); } - else if (j.m_value.binary->subtype() <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits::max)()) { write_number(static_cast(0xd9)); - write_number(static_cast(j.m_value.binary->subtype())); + write_number(static_cast(j.m_data.m_value.binary->subtype())); } - else if (j.m_value.binary->subtype() <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits::max)()) { write_number(static_cast(0xda)); - write_number(static_cast(j.m_value.binary->subtype())); + write_number(static_cast(j.m_data.m_value.binary->subtype())); } - else if (j.m_value.binary->subtype() <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits::max)()) { write_number(static_cast(0xdb)); - write_number(static_cast(j.m_value.binary->subtype())); + write_number(static_cast(j.m_data.m_value.binary->subtype())); } } // step 1: write control byte and the binary array size - const auto N = j.m_value.binary->size(); + const auto N = j.m_data.m_value.binary->size(); if (N <= 0x17) { write_number(static_cast(0x40 + N)); @@ -15270,7 +15395,7 @@ class binary_writer // step 2: write each element oa->write_characters( - reinterpret_cast(j.m_value.binary->data()), + reinterpret_cast(j.m_data.m_value.binary->data()), N); break; @@ -15279,7 +15404,7 @@ class binary_writer case value_t::object: { // step 1: write control byte and the object size - const auto N = j.m_value.object->size(); + const auto N = j.m_data.m_value.object->size(); if (N <= 0x17) { write_number(static_cast(0xA0 + N)); @@ -15308,7 +15433,7 @@ class binary_writer // LCOV_EXCL_STOP // step 2: write each element - for (const auto& el : *j.m_value.object) + for (const auto& el : *j.m_data.m_value.object) { write_cbor(el.first); write_cbor(el.second); @@ -15337,7 +15462,7 @@ class binary_writer case value_t::boolean: // true and false { - oa->write_character(j.m_value.boolean + oa->write_character(j.m_data.m_value.boolean ? to_char_type(0xC3) : to_char_type(0xC2)); break; @@ -15345,75 +15470,75 @@ class binary_writer case value_t::number_integer: { - if (j.m_value.number_integer >= 0) + if (j.m_data.m_value.number_integer >= 0) { // MessagePack does not differentiate between positive // signed integers and unsigned integers. Therefore, we used // the code from the value_t::number_unsigned case here. - if (j.m_value.number_unsigned < 128) + if (j.m_data.m_value.number_unsigned < 128) { // positive fixnum - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 8 oa->write_character(to_char_type(0xCC)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 16 oa->write_character(to_char_type(0xCD)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 32 oa->write_character(to_char_type(0xCE)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 64 oa->write_character(to_char_type(0xCF)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } } else { - if (j.m_value.number_integer >= -32) + if (j.m_data.m_value.number_integer >= -32) { // negative fixnum - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer >= (std::numeric_limits::min)() && - j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { // int 8 oa->write_character(to_char_type(0xD0)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer >= (std::numeric_limits::min)() && - j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { // int 16 oa->write_character(to_char_type(0xD1)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer >= (std::numeric_limits::min)() && - j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { // int 32 oa->write_character(to_char_type(0xD2)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer >= (std::numeric_limits::min)() && - j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { // int 64 oa->write_character(to_char_type(0xD3)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } } break; @@ -15421,48 +15546,48 @@ class binary_writer case value_t::number_unsigned: { - if (j.m_value.number_unsigned < 128) + if (j.m_data.m_value.number_unsigned < 128) { // positive fixnum - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 8 oa->write_character(to_char_type(0xCC)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 16 oa->write_character(to_char_type(0xCD)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 32 oa->write_character(to_char_type(0xCE)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 64 oa->write_character(to_char_type(0xCF)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } break; } case value_t::number_float: { - write_compact_float(j.m_value.number_float, detail::input_format_t::msgpack); + write_compact_float(j.m_data.m_value.number_float, detail::input_format_t::msgpack); break; } case value_t::string: { // step 1: write control byte and the string length - const auto N = j.m_value.string->size(); + const auto N = j.m_data.m_value.string->size(); if (N <= 31) { // fixstr @@ -15489,15 +15614,15 @@ class binary_writer // step 2: write the string oa->write_characters( - reinterpret_cast(j.m_value.string->c_str()), - j.m_value.string->size()); + reinterpret_cast(j.m_data.m_value.string->c_str()), + j.m_data.m_value.string->size()); break; } case value_t::array: { // step 1: write control byte and the array size - const auto N = j.m_value.array->size(); + const auto N = j.m_data.m_value.array->size(); if (N <= 15) { // fixarray @@ -15517,7 +15642,7 @@ class binary_writer } // step 2: write each element - for (const auto& el : *j.m_value.array) + for (const auto& el : *j.m_data.m_value.array) { write_msgpack(el); } @@ -15528,10 +15653,10 @@ class binary_writer { // step 0: determine if the binary type has a set subtype to // determine whether or not to use the ext or fixext types - const bool use_ext = j.m_value.binary->has_subtype(); + const bool use_ext = j.m_data.m_value.binary->has_subtype(); // step 1: write control byte and the byte string length - const auto N = j.m_value.binary->size(); + const auto N = j.m_data.m_value.binary->size(); if (N <= (std::numeric_limits::max)()) { std::uint8_t output_type{}; @@ -15576,18 +15701,18 @@ class binary_writer } else if (N <= (std::numeric_limits::max)()) { - std::uint8_t output_type = use_ext - ? 0xC8 // ext 16 - : 0xC5; // bin 16 + const std::uint8_t output_type = use_ext + ? 0xC8 // ext 16 + : 0xC5; // bin 16 oa->write_character(to_char_type(output_type)); write_number(static_cast(N)); } else if (N <= (std::numeric_limits::max)()) { - std::uint8_t output_type = use_ext - ? 0xC9 // ext 32 - : 0xC6; // bin 32 + const std::uint8_t output_type = use_ext + ? 0xC9 // ext 32 + : 0xC6; // bin 32 oa->write_character(to_char_type(output_type)); write_number(static_cast(N)); @@ -15596,12 +15721,12 @@ class binary_writer // step 1.5: if this is an ext type, write the subtype if (use_ext) { - write_number(static_cast(j.m_value.binary->subtype())); + write_number(static_cast(j.m_data.m_value.binary->subtype())); } // step 2: write the byte string oa->write_characters( - reinterpret_cast(j.m_value.binary->data()), + reinterpret_cast(j.m_data.m_value.binary->data()), N); break; @@ -15610,7 +15735,7 @@ class binary_writer case value_t::object: { // step 1: write control byte and the object size - const auto N = j.m_value.object->size(); + const auto N = j.m_data.m_value.object->size(); if (N <= 15) { // fixmap @@ -15630,7 +15755,7 @@ class binary_writer } // step 2: write each element - for (const auto& el : *j.m_value.object) + for (const auto& el : *j.m_data.m_value.object) { write_msgpack(el.first); write_msgpack(el.second); @@ -15670,7 +15795,7 @@ class binary_writer { if (add_prefix) { - oa->write_character(j.m_value.boolean + oa->write_character(j.m_data.m_value.boolean ? to_char_type('T') : to_char_type('F')); } @@ -15679,19 +15804,19 @@ class binary_writer case value_t::number_integer: { - write_number_with_ubjson_prefix(j.m_value.number_integer, add_prefix, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.number_integer, add_prefix, use_bjdata); break; } case value_t::number_unsigned: { - write_number_with_ubjson_prefix(j.m_value.number_unsigned, add_prefix, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.number_unsigned, add_prefix, use_bjdata); break; } case value_t::number_float: { - write_number_with_ubjson_prefix(j.m_value.number_float, add_prefix, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.number_float, add_prefix, use_bjdata); break; } @@ -15701,10 +15826,10 @@ class binary_writer { oa->write_character(to_char_type('S')); } - write_number_with_ubjson_prefix(j.m_value.string->size(), true, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.string->size(), true, use_bjdata); oa->write_characters( - reinterpret_cast(j.m_value.string->c_str()), - j.m_value.string->size()); + reinterpret_cast(j.m_data.m_value.string->c_str()), + j.m_data.m_value.string->size()); break; } @@ -15716,7 +15841,7 @@ class binary_writer } bool prefix_required = true; - if (use_type && !j.m_value.array->empty()) + if (use_type && !j.m_data.m_value.array->empty()) { JSON_ASSERT(use_count); const CharType first_prefix = ubjson_prefix(j.front(), use_bjdata); @@ -15739,10 +15864,10 @@ class binary_writer if (use_count) { oa->write_character(to_char_type('#')); - write_number_with_ubjson_prefix(j.m_value.array->size(), true, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.array->size(), true, use_bjdata); } - for (const auto& el : *j.m_value.array) + for (const auto& el : *j.m_data.m_value.array) { write_ubjson(el, use_count, use_type, prefix_required, use_bjdata); } @@ -15762,7 +15887,7 @@ class binary_writer oa->write_character(to_char_type('[')); } - if (use_type && !j.m_value.binary->empty()) + if (use_type && !j.m_data.m_value.binary->empty()) { JSON_ASSERT(use_count); oa->write_character(to_char_type('$')); @@ -15772,21 +15897,21 @@ class binary_writer if (use_count) { oa->write_character(to_char_type('#')); - write_number_with_ubjson_prefix(j.m_value.binary->size(), true, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.binary->size(), true, use_bjdata); } if (use_type) { oa->write_characters( - reinterpret_cast(j.m_value.binary->data()), - j.m_value.binary->size()); + reinterpret_cast(j.m_data.m_value.binary->data()), + j.m_data.m_value.binary->size()); } else { - for (size_t i = 0; i < j.m_value.binary->size(); ++i) + for (size_t i = 0; i < j.m_data.m_value.binary->size(); ++i) { oa->write_character(to_char_type('U')); - oa->write_character(j.m_value.binary->data()[i]); + oa->write_character(j.m_data.m_value.binary->data()[i]); } } @@ -15800,9 +15925,9 @@ class binary_writer case value_t::object: { - if (use_bjdata && j.m_value.object->size() == 3 && j.m_value.object->find("_ArrayType_") != j.m_value.object->end() && j.m_value.object->find("_ArraySize_") != j.m_value.object->end() && j.m_value.object->find("_ArrayData_") != j.m_value.object->end()) + if (use_bjdata && j.m_data.m_value.object->size() == 3 && j.m_data.m_value.object->find("_ArrayType_") != j.m_data.m_value.object->end() && j.m_data.m_value.object->find("_ArraySize_") != j.m_data.m_value.object->end() && j.m_data.m_value.object->find("_ArrayData_") != j.m_data.m_value.object->end()) { - if (!write_bjdata_ndarray(*j.m_value.object, use_count, use_type)) // decode bjdata ndarray in the JData format (https://github.com/NeuroJSON/jdata) + if (!write_bjdata_ndarray(*j.m_data.m_value.object, use_count, use_type)) // decode bjdata ndarray in the JData format (https://github.com/NeuroJSON/jdata) { break; } @@ -15814,7 +15939,7 @@ class binary_writer } bool prefix_required = true; - if (use_type && !j.m_value.object->empty()) + if (use_type && !j.m_data.m_value.object->empty()) { JSON_ASSERT(use_count); const CharType first_prefix = ubjson_prefix(j.front(), use_bjdata); @@ -15837,10 +15962,10 @@ class binary_writer if (use_count) { oa->write_character(to_char_type('#')); - write_number_with_ubjson_prefix(j.m_value.object->size(), true, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.object->size(), true, use_bjdata); } - for (const auto& el : *j.m_value.object) + for (const auto& el : *j.m_data.m_value.object) { write_number_with_ubjson_prefix(el.first.size(), true, use_bjdata); oa->write_characters( @@ -15990,19 +16115,19 @@ class binary_writer void write_bson_unsigned(const string_t& name, const BasicJsonType& j) { - if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { write_bson_entry_header(name, 0x10 /* int32 */); - write_number(static_cast(j.m_value.number_unsigned), true); + write_number(static_cast(j.m_data.m_value.number_unsigned), true); } - else if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + else if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { write_bson_entry_header(name, 0x12 /* int64 */); - write_number(static_cast(j.m_value.number_unsigned), true); + write_number(static_cast(j.m_data.m_value.number_unsigned), true); } else { - JSON_THROW(out_of_range::create(407, concat("integer number ", std::to_string(j.m_value.number_unsigned), " cannot be represented by BSON as it does not fit int64"), &j)); + JSON_THROW(out_of_range::create(407, concat("integer number ", std::to_string(j.m_data.m_value.number_unsigned), " cannot be represented by BSON as it does not fit int64"), &j)); } } @@ -16083,13 +16208,13 @@ class binary_writer switch (j.type()) { case value_t::object: - return header_size + calc_bson_object_size(*j.m_value.object); + return header_size + calc_bson_object_size(*j.m_data.m_value.object); case value_t::array: - return header_size + calc_bson_array_size(*j.m_value.array); + return header_size + calc_bson_array_size(*j.m_data.m_value.array); case value_t::binary: - return header_size + calc_bson_binary_size(*j.m_value.binary); + return header_size + calc_bson_binary_size(*j.m_data.m_value.binary); case value_t::boolean: return header_size + 1ul; @@ -16098,13 +16223,13 @@ class binary_writer return header_size + 8ul; case value_t::number_integer: - return header_size + calc_bson_integer_size(j.m_value.number_integer); + return header_size + calc_bson_integer_size(j.m_data.m_value.number_integer); case value_t::number_unsigned: - return header_size + calc_bson_unsigned_size(j.m_value.number_unsigned); + return header_size + calc_bson_unsigned_size(j.m_data.m_value.number_unsigned); case value_t::string: - return header_size + calc_bson_string_size(*j.m_value.string); + return header_size + calc_bson_string_size(*j.m_data.m_value.string); case value_t::null: return header_size + 0ul; @@ -16130,28 +16255,28 @@ class binary_writer switch (j.type()) { case value_t::object: - return write_bson_object_entry(name, *j.m_value.object); + return write_bson_object_entry(name, *j.m_data.m_value.object); case value_t::array: - return write_bson_array(name, *j.m_value.array); + return write_bson_array(name, *j.m_data.m_value.array); case value_t::binary: - return write_bson_binary(name, *j.m_value.binary); + return write_bson_binary(name, *j.m_data.m_value.binary); case value_t::boolean: - return write_bson_boolean(name, j.m_value.boolean); + return write_bson_boolean(name, j.m_data.m_value.boolean); case value_t::number_float: - return write_bson_double(name, j.m_value.number_float); + return write_bson_double(name, j.m_data.m_value.number_float); case value_t::number_integer: - return write_bson_integer(name, j.m_value.number_integer); + return write_bson_integer(name, j.m_data.m_value.number_integer); case value_t::number_unsigned: return write_bson_unsigned(name, j); case value_t::string: - return write_bson_string(name, *j.m_value.string); + return write_bson_string(name, *j.m_data.m_value.string); case value_t::null: return write_bson_null(name); @@ -16173,8 +16298,8 @@ class binary_writer */ static std::size_t calc_bson_object_size(const typename BasicJsonType::object_t& value) { - std::size_t document_size = std::accumulate(value.begin(), value.end(), static_cast(0), - [](size_t result, const typename BasicJsonType::object_t::value_type & el) + const std::size_t document_size = std::accumulate(value.begin(), value.end(), static_cast(0), + [](size_t result, const typename BasicJsonType::object_t::value_type & el) { return result += calc_bson_element_size(el.first, el.second); }); @@ -16424,35 +16549,35 @@ class binary_writer return 'Z'; case value_t::boolean: - return j.m_value.boolean ? 'T' : 'F'; + return j.m_data.m_value.boolean ? 'T' : 'F'; case value_t::number_integer: { - if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + if ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { return 'i'; } - if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + if ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { return 'U'; } - if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + if ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { return 'I'; } - if (use_bjdata && ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)())) + if (use_bjdata && ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)())) { return 'u'; } - if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + if ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { return 'l'; } - if (use_bjdata && ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)())) + if (use_bjdata && ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)())) { return 'm'; } - if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + if ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { return 'L'; } @@ -16462,35 +16587,35 @@ class binary_writer case value_t::number_unsigned: { - if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'i'; } - if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'U'; } - if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'I'; } - if (use_bjdata && j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (use_bjdata && j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'u'; } - if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'l'; } - if (use_bjdata && j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (use_bjdata && j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'm'; } - if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'L'; } - if (use_bjdata && j.m_value.number_unsigned <= (std::numeric_limits::max)()) + if (use_bjdata && j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { return 'M'; } @@ -16499,7 +16624,7 @@ class binary_writer } case value_t::number_float: - return get_ubjson_float_prefix(j.m_value.number_float); + return get_ubjson_float_prefix(j.m_data.m_value.number_float); case value_t::string: return 'S'; @@ -16548,7 +16673,7 @@ class binary_writer std::size_t len = (value.at(key).empty() ? 0 : 1); for (const auto& el : value.at(key)) { - len *= static_cast(el.m_value.number_unsigned); + len *= static_cast(el.m_data.m_value.number_unsigned); } key = "_ArrayData_"; @@ -16570,70 +16695,70 @@ class binary_writer { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_unsigned), true); + write_number(static_cast(el.m_data.m_value.number_unsigned), true); } } else if (dtype == 'i') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_integer), true); + write_number(static_cast(el.m_data.m_value.number_integer), true); } } else if (dtype == 'u') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_unsigned), true); + write_number(static_cast(el.m_data.m_value.number_unsigned), true); } } else if (dtype == 'I') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_integer), true); + write_number(static_cast(el.m_data.m_value.number_integer), true); } } else if (dtype == 'm') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_unsigned), true); + write_number(static_cast(el.m_data.m_value.number_unsigned), true); } } else if (dtype == 'l') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_integer), true); + write_number(static_cast(el.m_data.m_value.number_integer), true); } } else if (dtype == 'M') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_unsigned), true); + write_number(static_cast(el.m_data.m_value.number_unsigned), true); } } else if (dtype == 'L') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_integer), true); + write_number(static_cast(el.m_data.m_value.number_integer), true); } } else if (dtype == 'd') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_float), true); + write_number(static_cast(el.m_data.m_value.number_float), true); } } else if (dtype == 'D') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_float), true); + write_number(static_cast(el.m_data.m_value.number_float), true); } } return false; @@ -16757,11 +16882,11 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // // SPDX-FileCopyrightText: 2008-2009 Björn Hoehrmann -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -16782,11 +16907,11 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // // SPDX-FileCopyrightText: 2009 Florian Loitsch -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -17692,7 +17817,7 @@ void grisu2(char* buf, int& len, int& decimal_exponent, FloatType value) // NB: If the neighbors are computed for single-precision numbers, there is a single float // (7.0385307e-26f) which can't be recovered using strtod. The resulting double precision // value is off by 1 ulp. -#if 0 +#if 0 // NOLINT(readability-avoid-unconditional-preprocessor-if) const boundaries w = compute_boundaries(static_cast(value)); #else const boundaries w = compute_boundaries(value); @@ -17994,11 +18119,11 @@ class serializer const unsigned int indent_step, const unsigned int current_indent = 0) { - switch (val.m_type) + switch (val.m_data.m_type) { case value_t::object: { - if (val.m_value.object->empty()) + if (val.m_data.m_value.object->empty()) { o->write_characters("{}", 2); return; @@ -18016,8 +18141,8 @@ class serializer } // first n-1 elements - auto i = val.m_value.object->cbegin(); - for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i) + auto i = val.m_data.m_value.object->cbegin(); + for (std::size_t cnt = 0; cnt < val.m_data.m_value.object->size() - 1; ++cnt, ++i) { o->write_characters(indent_string.c_str(), new_indent); o->write_character('\"'); @@ -18028,8 +18153,8 @@ class serializer } // last element - JSON_ASSERT(i != val.m_value.object->cend()); - JSON_ASSERT(std::next(i) == val.m_value.object->cend()); + JSON_ASSERT(i != val.m_data.m_value.object->cend()); + JSON_ASSERT(std::next(i) == val.m_data.m_value.object->cend()); o->write_characters(indent_string.c_str(), new_indent); o->write_character('\"'); dump_escaped(i->first, ensure_ascii); @@ -18045,8 +18170,8 @@ class serializer o->write_character('{'); // first n-1 elements - auto i = val.m_value.object->cbegin(); - for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i) + auto i = val.m_data.m_value.object->cbegin(); + for (std::size_t cnt = 0; cnt < val.m_data.m_value.object->size() - 1; ++cnt, ++i) { o->write_character('\"'); dump_escaped(i->first, ensure_ascii); @@ -18056,8 +18181,8 @@ class serializer } // last element - JSON_ASSERT(i != val.m_value.object->cend()); - JSON_ASSERT(std::next(i) == val.m_value.object->cend()); + JSON_ASSERT(i != val.m_data.m_value.object->cend()); + JSON_ASSERT(std::next(i) == val.m_data.m_value.object->cend()); o->write_character('\"'); dump_escaped(i->first, ensure_ascii); o->write_characters("\":", 2); @@ -18071,7 +18196,7 @@ class serializer case value_t::array: { - if (val.m_value.array->empty()) + if (val.m_data.m_value.array->empty()) { o->write_characters("[]", 2); return; @@ -18089,8 +18214,8 @@ class serializer } // first n-1 elements - for (auto i = val.m_value.array->cbegin(); - i != val.m_value.array->cend() - 1; ++i) + for (auto i = val.m_data.m_value.array->cbegin(); + i != val.m_data.m_value.array->cend() - 1; ++i) { o->write_characters(indent_string.c_str(), new_indent); dump(*i, true, ensure_ascii, indent_step, new_indent); @@ -18098,9 +18223,9 @@ class serializer } // last element - JSON_ASSERT(!val.m_value.array->empty()); + JSON_ASSERT(!val.m_data.m_value.array->empty()); o->write_characters(indent_string.c_str(), new_indent); - dump(val.m_value.array->back(), true, ensure_ascii, indent_step, new_indent); + dump(val.m_data.m_value.array->back(), true, ensure_ascii, indent_step, new_indent); o->write_character('\n'); o->write_characters(indent_string.c_str(), current_indent); @@ -18111,16 +18236,16 @@ class serializer o->write_character('['); // first n-1 elements - for (auto i = val.m_value.array->cbegin(); - i != val.m_value.array->cend() - 1; ++i) + for (auto i = val.m_data.m_value.array->cbegin(); + i != val.m_data.m_value.array->cend() - 1; ++i) { dump(*i, false, ensure_ascii, indent_step, current_indent); o->write_character(','); } // last element - JSON_ASSERT(!val.m_value.array->empty()); - dump(val.m_value.array->back(), false, ensure_ascii, indent_step, current_indent); + JSON_ASSERT(!val.m_data.m_value.array->empty()); + dump(val.m_data.m_value.array->back(), false, ensure_ascii, indent_step, current_indent); o->write_character(']'); } @@ -18131,7 +18256,7 @@ class serializer case value_t::string: { o->write_character('\"'); - dump_escaped(*val.m_value.string, ensure_ascii); + dump_escaped(*val.m_data.m_value.string, ensure_ascii); o->write_character('\"'); return; } @@ -18153,24 +18278,24 @@ class serializer o->write_characters("\"bytes\": [", 10); - if (!val.m_value.binary->empty()) + if (!val.m_data.m_value.binary->empty()) { - for (auto i = val.m_value.binary->cbegin(); - i != val.m_value.binary->cend() - 1; ++i) + for (auto i = val.m_data.m_value.binary->cbegin(); + i != val.m_data.m_value.binary->cend() - 1; ++i) { dump_integer(*i); o->write_characters(", ", 2); } - dump_integer(val.m_value.binary->back()); + dump_integer(val.m_data.m_value.binary->back()); } o->write_characters("],\n", 3); o->write_characters(indent_string.c_str(), new_indent); o->write_characters("\"subtype\": ", 11); - if (val.m_value.binary->has_subtype()) + if (val.m_data.m_value.binary->has_subtype()) { - dump_integer(val.m_value.binary->subtype()); + dump_integer(val.m_data.m_value.binary->subtype()); } else { @@ -18184,21 +18309,21 @@ class serializer { o->write_characters("{\"bytes\":[", 10); - if (!val.m_value.binary->empty()) + if (!val.m_data.m_value.binary->empty()) { - for (auto i = val.m_value.binary->cbegin(); - i != val.m_value.binary->cend() - 1; ++i) + for (auto i = val.m_data.m_value.binary->cbegin(); + i != val.m_data.m_value.binary->cend() - 1; ++i) { dump_integer(*i); o->write_character(','); } - dump_integer(val.m_value.binary->back()); + dump_integer(val.m_data.m_value.binary->back()); } o->write_characters("],\"subtype\":", 12); - if (val.m_value.binary->has_subtype()) + if (val.m_data.m_value.binary->has_subtype()) { - dump_integer(val.m_value.binary->subtype()); + dump_integer(val.m_data.m_value.binary->subtype()); o->write_character('}'); } else @@ -18211,7 +18336,7 @@ class serializer case value_t::boolean: { - if (val.m_value.boolean) + if (val.m_data.m_value.boolean) { o->write_characters("true", 4); } @@ -18224,19 +18349,19 @@ class serializer case value_t::number_integer: { - dump_integer(val.m_value.number_integer); + dump_integer(val.m_data.m_value.number_integer); return; } case value_t::number_unsigned: { - dump_integer(val.m_value.number_unsigned); + dump_integer(val.m_data.m_value.number_unsigned); return; } case value_t::number_float: { - dump_float(val.m_value.number_float); + dump_float(val.m_data.m_value.number_float); return; } @@ -18810,8 +18935,8 @@ class serializer ? (byte & 0x3fu) | (codep << 6u) : (0xFFu >> type) & (byte); - std::size_t index = 256u + static_cast(state) * 16u + static_cast(type); - JSON_ASSERT(index < 400); + const std::size_t index = 256u + static_cast(state) * 16u + static_cast(type); + JSON_ASSERT(index < utf8d.size()); state = utf8d[index]; return state; } @@ -18878,10 +19003,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -18998,7 +19123,7 @@ template , template::value, int> = 0> - T & at(KeyType && key) + T & at(KeyType && key) // NOLINT(cppcoreguidelines-missing-std-forward) { for (auto it = this->begin(); it != this->end(); ++it) { @@ -19026,7 +19151,7 @@ template , template::value, int> = 0> - const T & at(KeyType && key) const + const T & at(KeyType && key) const // NOLINT(cppcoreguidelines-missing-std-forward) { for (auto it = this->begin(); it != this->end(); ++it) { @@ -19060,7 +19185,7 @@ template , template::value, int> = 0> - size_type erase(KeyType && key) + size_type erase(KeyType && key) // NOLINT(cppcoreguidelines-missing-std-forward) { for (auto it = this->begin(); it != this->end(); ++it) { @@ -19151,7 +19276,7 @@ template , template::value, int> = 0> - size_type count(KeyType && key) const + size_type count(KeyType && key) const // NOLINT(cppcoreguidelines-missing-std-forward) { for (auto it = this->begin(); it != this->end(); ++it) { @@ -19177,7 +19302,7 @@ template , template::value, int> = 0> - iterator find(KeyType && key) + iterator find(KeyType && key) // NOLINT(cppcoreguidelines-missing-std-forward) { for (auto it = this->begin(); it != this->end(); ++it) { @@ -19240,7 +19365,9 @@ NLOHMANN_JSON_NAMESPACE_END #if defined(JSON_HAS_CPP_17) - #include + #if JSON_HAS_STATIC_RTTI + #include + #endif #include #endif @@ -19271,6 +19398,7 @@ The invariants are checked by member function assert_invariant(). */ NLOHMANN_BASIC_JSON_TPL_DECLARATION class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-special-member-functions) + : public ::nlohmann::detail::json_base_class { private: template friend struct detail::external_constructor; @@ -19297,6 +19425,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// workaround type for MSVC using basic_json_t = NLOHMANN_BASIC_JSON_TPL; + using json_base_class_t = ::nlohmann::detail::json_base_class; JSON_PRIVATE_UNLESS_TESTED: // convenience aliases for types residing in namespace detail; @@ -19368,7 +19497,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - ///////////////////// // container types // ///////////////////// @@ -19410,7 +19538,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - /// @brief returns the allocator associated with the container /// @sa https://json.nlohmann.me/api/basic_json/get_allocator/ static allocator_type get_allocator() @@ -19425,7 +19552,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec { basic_json result; - result["copyright"] = "(C) 2013-2022 Niels Lohmann"; + result["copyright"] = "(C) 2013-2023 Niels Lohmann"; result["name"] = "JSON for Modern C++"; result["url"] = "https://github.com/nlohmann/json"; result["version"]["string"] = @@ -19473,7 +19600,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec result["compiler"] = {{"family", "unknown"}, {"version", "unknown"}}; #endif - #if defined(_MSVC_LANG) result["compiler"]["c++"] = std::to_string(_MSVC_LANG); #elif defined(__cplusplus) @@ -19484,7 +19610,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec return result; } - /////////////////////////// // JSON value data types // /////////////////////////// @@ -19692,7 +19817,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec object = nullptr; // silence warning, see #821 if (JSON_HEDLEY_UNLIKELY(t == value_t::null)) { - JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.11.2", nullptr)); // LCOV_EXCL_LINE + JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.11.3", nullptr)); // LCOV_EXCL_LINE } break; } @@ -19731,6 +19856,16 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec void destroy(value_t t) { + if ( + (t == value_t::object && object == nullptr) || + (t == value_t::array && array == nullptr) || + (t == value_t::string && string == nullptr) || + (t == value_t::binary && binary == nullptr) + ) + { + //not initialized (e.g. due to exception in the ctor) + return; + } if (t == value_t::array || t == value_t::object) { // flatten the current json_value to a heap-allocated stack @@ -19761,18 +19896,18 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // its children to the stack to be processed later if (current_item.is_array()) { - std::move(current_item.m_value.array->begin(), current_item.m_value.array->end(), std::back_inserter(stack)); + std::move(current_item.m_data.m_value.array->begin(), current_item.m_data.m_value.array->end(), std::back_inserter(stack)); - current_item.m_value.array->clear(); + current_item.m_data.m_value.array->clear(); } else if (current_item.is_object()) { - for (auto&& it : *current_item.m_value.object) + for (auto&& it : *current_item.m_data.m_value.object) { stack.push_back(std::move(it.second)); } - current_item.m_value.object->clear(); + current_item.m_data.m_value.object->clear(); } // it's now safe that current_item get destructed @@ -19849,10 +19984,10 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec */ void assert_invariant(bool check_parents = true) const noexcept { - JSON_ASSERT(m_type != value_t::object || m_value.object != nullptr); - JSON_ASSERT(m_type != value_t::array || m_value.array != nullptr); - JSON_ASSERT(m_type != value_t::string || m_value.string != nullptr); - JSON_ASSERT(m_type != value_t::binary || m_value.binary != nullptr); + JSON_ASSERT(m_data.m_type != value_t::object || m_data.m_value.object != nullptr); + JSON_ASSERT(m_data.m_type != value_t::array || m_data.m_value.array != nullptr); + JSON_ASSERT(m_data.m_type != value_t::string || m_data.m_value.string != nullptr); + JSON_ASSERT(m_data.m_type != value_t::binary || m_data.m_value.binary != nullptr); #if JSON_DIAGNOSTICS JSON_TRY @@ -19871,11 +20006,11 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec void set_parents() { #if JSON_DIAGNOSTICS - switch (m_type) + switch (m_data.m_type) { case value_t::array: { - for (auto& element : *m_value.array) + for (auto& element : *m_data.m_value.array) { element.m_parent = this; } @@ -19884,7 +20019,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec case value_t::object: { - for (auto& element : *m_value.object) + for (auto& element : *m_data.m_value.object) { element.second.m_parent = this; } @@ -19925,7 +20060,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec { // see https://github.com/nlohmann/json/issues/2838 JSON_ASSERT(type() == value_t::array); - if (JSON_HEDLEY_UNLIKELY(m_value.array->capacity() != old_capacity)) + if (JSON_HEDLEY_UNLIKELY(m_data.m_value.array->capacity() != old_capacity)) { // capacity has changed: update all parents set_parents(); @@ -19981,7 +20116,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief create an empty value with a given type /// @sa https://json.nlohmann.me/api/basic_json/basic_json/ basic_json(const value_t v) - : m_type(v), m_value(v) + : m_data(v) { assert_invariant(); } @@ -20055,12 +20190,12 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec *this = nullptr; break; case value_t::discarded: - m_type = value_t::discarded; + m_data.m_type = value_t::discarded; break; default: // LCOV_EXCL_LINE JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE } - JSON_ASSERT(m_type == val.type()); + JSON_ASSERT(m_data.m_type == val.type()); set_parents(); assert_invariant(); } @@ -20076,7 +20211,10 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec bool is_an_object = std::all_of(init.begin(), init.end(), [](const detail::json_ref& element_ref) { - return element_ref->is_array() && element_ref->size() == 2 && (*element_ref)[0].is_string(); + // The cast is to ensure op[size_type] is called, bearing in mind size_type may not be int; + // (many string types can be constructed from 0 via its null-pointer guise, so we get a + // broken call to op[key_type], the wrong semantics and a 4804 warning on Windows) + return element_ref->is_array() && element_ref->size() == 2 && (*element_ref)[static_cast(0)].is_string(); }); // adjust type if type deduction is not wanted @@ -20098,22 +20236,22 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_an_object) { // the initializer list is a list of pairs -> create object - m_type = value_t::object; - m_value = value_t::object; + m_data.m_type = value_t::object; + m_data.m_value = value_t::object; for (auto& element_ref : init) { auto element = element_ref.moved_or_copied(); - m_value.object->emplace( - std::move(*((*element.m_value.array)[0].m_value.string)), - std::move((*element.m_value.array)[1])); + m_data.m_value.object->emplace( + std::move(*((*element.m_data.m_value.array)[0].m_data.m_value.string)), + std::move((*element.m_data.m_value.array)[1])); } } else { // the initializer list describes an array -> create array - m_type = value_t::array; - m_value.array = create(init.begin(), init.end()); + m_data.m_type = value_t::array; + m_data.m_value.array = create(init.begin(), init.end()); } set_parents(); @@ -20126,8 +20264,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec static basic_json binary(const typename binary_t::container_type& init) { auto res = basic_json(); - res.m_type = value_t::binary; - res.m_value = init; + res.m_data.m_type = value_t::binary; + res.m_data.m_value = init; return res; } @@ -20137,8 +20275,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec static basic_json binary(const typename binary_t::container_type& init, typename binary_t::subtype_type subtype) { auto res = basic_json(); - res.m_type = value_t::binary; - res.m_value = binary_t(init, subtype); + res.m_data.m_type = value_t::binary; + res.m_data.m_value = binary_t(init, subtype); return res; } @@ -20148,8 +20286,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec static basic_json binary(typename binary_t::container_type&& init) { auto res = basic_json(); - res.m_type = value_t::binary; - res.m_value = std::move(init); + res.m_data.m_type = value_t::binary; + res.m_data.m_value = std::move(init); return res; } @@ -20159,8 +20297,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec static basic_json binary(typename binary_t::container_type&& init, typename binary_t::subtype_type subtype) { auto res = basic_json(); - res.m_type = value_t::binary; - res.m_value = binary_t(std::move(init), subtype); + res.m_data.m_type = value_t::binary; + res.m_data.m_value = binary_t(std::move(init), subtype); return res; } @@ -20182,10 +20320,9 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief construct an array with count copies of given value /// @sa https://json.nlohmann.me/api/basic_json/basic_json/ - basic_json(size_type cnt, const basic_json& val) - : m_type(value_t::array) + basic_json(size_type cnt, const basic_json& val): + m_data{cnt, val} { - m_value.array = create(cnt, val); set_parents(); assert_invariant(); } @@ -20207,10 +20344,10 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec } // copy type from first iterator - m_type = first.m_object->m_type; + m_data.m_type = first.m_object->m_data.m_type; // check if iterator range is complete for primitive values - switch (m_type) + switch (m_data.m_type) { case value_t::boolean: case value_t::number_float: @@ -20235,55 +20372,55 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec break; } - switch (m_type) + switch (m_data.m_type) { case value_t::number_integer: { - m_value.number_integer = first.m_object->m_value.number_integer; + m_data.m_value.number_integer = first.m_object->m_data.m_value.number_integer; break; } case value_t::number_unsigned: { - m_value.number_unsigned = first.m_object->m_value.number_unsigned; + m_data.m_value.number_unsigned = first.m_object->m_data.m_value.number_unsigned; break; } case value_t::number_float: { - m_value.number_float = first.m_object->m_value.number_float; + m_data.m_value.number_float = first.m_object->m_data.m_value.number_float; break; } case value_t::boolean: { - m_value.boolean = first.m_object->m_value.boolean; + m_data.m_value.boolean = first.m_object->m_data.m_value.boolean; break; } case value_t::string: { - m_value = *first.m_object->m_value.string; + m_data.m_value = *first.m_object->m_data.m_value.string; break; } case value_t::object: { - m_value.object = create(first.m_it.object_iterator, - last.m_it.object_iterator); + m_data.m_value.object = create(first.m_it.object_iterator, + last.m_it.object_iterator); break; } case value_t::array: { - m_value.array = create(first.m_it.array_iterator, - last.m_it.array_iterator); + m_data.m_value.array = create(first.m_it.array_iterator, + last.m_it.array_iterator); break; } case value_t::binary: { - m_value = *first.m_object->m_value.binary; + m_data.m_value = *first.m_object->m_data.m_value.binary; break; } @@ -20297,7 +20434,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec assert_invariant(); } - /////////////////////////////////////// // other constructors and destructor // /////////////////////////////////////// @@ -20310,58 +20446,59 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief copy constructor /// @sa https://json.nlohmann.me/api/basic_json/basic_json/ basic_json(const basic_json& other) - : m_type(other.m_type) + : json_base_class_t(other) { + m_data.m_type = other.m_data.m_type; // check of passed value is valid other.assert_invariant(); - switch (m_type) + switch (m_data.m_type) { case value_t::object: { - m_value = *other.m_value.object; + m_data.m_value = *other.m_data.m_value.object; break; } case value_t::array: { - m_value = *other.m_value.array; + m_data.m_value = *other.m_data.m_value.array; break; } case value_t::string: { - m_value = *other.m_value.string; + m_data.m_value = *other.m_data.m_value.string; break; } case value_t::boolean: { - m_value = other.m_value.boolean; + m_data.m_value = other.m_data.m_value.boolean; break; } case value_t::number_integer: { - m_value = other.m_value.number_integer; + m_data.m_value = other.m_data.m_value.number_integer; break; } case value_t::number_unsigned: { - m_value = other.m_value.number_unsigned; + m_data.m_value = other.m_data.m_value.number_unsigned; break; } case value_t::number_float: { - m_value = other.m_value.number_float; + m_data.m_value = other.m_data.m_value.number_float; break; } case value_t::binary: { - m_value = *other.m_value.binary; + m_data.m_value = *other.m_data.m_value.binary; break; } @@ -20378,15 +20515,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief move constructor /// @sa https://json.nlohmann.me/api/basic_json/basic_json/ basic_json(basic_json&& other) noexcept - : m_type(std::move(other.m_type)), - m_value(std::move(other.m_value)) + : json_base_class_t(std::forward(other)), + m_data(std::move(other.m_data)) { // check that passed value is valid other.assert_invariant(false); // invalidate payload - other.m_type = value_t::null; - other.m_value = {}; + other.m_data.m_type = value_t::null; + other.m_data.m_value = {}; set_parents(); assert_invariant(); @@ -20398,15 +20535,17 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec std::is_nothrow_move_constructible::value&& std::is_nothrow_move_assignable::value&& std::is_nothrow_move_constructible::value&& - std::is_nothrow_move_assignable::value + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_assignable::value ) { // check that passed value is valid other.assert_invariant(); using std::swap; - swap(m_type, other.m_type); - swap(m_value, other.m_value); + swap(m_data.m_type, other.m_data.m_type); + swap(m_data.m_value, other.m_data.m_value); + json_base_class_t::operator=(std::move(other)); set_parents(); assert_invariant(); @@ -20418,7 +20557,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec ~basic_json() noexcept { assert_invariant(false); - m_value.destroy(m_type); } /// @} @@ -20458,7 +20596,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/type/ constexpr value_t type() const noexcept { - return m_type; + return m_data.m_type; } /// @brief return whether type is primitive @@ -20479,14 +20617,14 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/is_null/ constexpr bool is_null() const noexcept { - return m_type == value_t::null; + return m_data.m_type == value_t::null; } /// @brief return whether value is a boolean /// @sa https://json.nlohmann.me/api/basic_json/is_boolean/ constexpr bool is_boolean() const noexcept { - return m_type == value_t::boolean; + return m_data.m_type == value_t::boolean; } /// @brief return whether value is a number @@ -20500,63 +20638,63 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/is_number_integer/ constexpr bool is_number_integer() const noexcept { - return m_type == value_t::number_integer || m_type == value_t::number_unsigned; + return m_data.m_type == value_t::number_integer || m_data.m_type == value_t::number_unsigned; } /// @brief return whether value is an unsigned integer number /// @sa https://json.nlohmann.me/api/basic_json/is_number_unsigned/ constexpr bool is_number_unsigned() const noexcept { - return m_type == value_t::number_unsigned; + return m_data.m_type == value_t::number_unsigned; } /// @brief return whether value is a floating-point number /// @sa https://json.nlohmann.me/api/basic_json/is_number_float/ constexpr bool is_number_float() const noexcept { - return m_type == value_t::number_float; + return m_data.m_type == value_t::number_float; } /// @brief return whether value is an object /// @sa https://json.nlohmann.me/api/basic_json/is_object/ constexpr bool is_object() const noexcept { - return m_type == value_t::object; + return m_data.m_type == value_t::object; } /// @brief return whether value is an array /// @sa https://json.nlohmann.me/api/basic_json/is_array/ constexpr bool is_array() const noexcept { - return m_type == value_t::array; + return m_data.m_type == value_t::array; } /// @brief return whether value is a string /// @sa https://json.nlohmann.me/api/basic_json/is_string/ constexpr bool is_string() const noexcept { - return m_type == value_t::string; + return m_data.m_type == value_t::string; } /// @brief return whether value is a binary array /// @sa https://json.nlohmann.me/api/basic_json/is_binary/ constexpr bool is_binary() const noexcept { - return m_type == value_t::binary; + return m_data.m_type == value_t::binary; } /// @brief return whether value is discarded /// @sa https://json.nlohmann.me/api/basic_json/is_discarded/ constexpr bool is_discarded() const noexcept { - return m_type == value_t::discarded; + return m_data.m_type == value_t::discarded; } /// @brief return the type of the JSON value (implicit) /// @sa https://json.nlohmann.me/api/basic_json/operator_value_t/ constexpr operator value_t() const noexcept { - return m_type; + return m_data.m_type; } /// @} @@ -20571,7 +20709,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec { if (JSON_HEDLEY_LIKELY(is_boolean())) { - return m_value.boolean; + return m_data.m_value.boolean; } JSON_THROW(type_error::create(302, detail::concat("type must be boolean, but is ", type_name()), this)); @@ -20580,97 +20718,97 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// get a pointer to the value (object) object_t* get_impl_ptr(object_t* /*unused*/) noexcept { - return is_object() ? m_value.object : nullptr; + return is_object() ? m_data.m_value.object : nullptr; } /// get a pointer to the value (object) constexpr const object_t* get_impl_ptr(const object_t* /*unused*/) const noexcept { - return is_object() ? m_value.object : nullptr; + return is_object() ? m_data.m_value.object : nullptr; } /// get a pointer to the value (array) array_t* get_impl_ptr(array_t* /*unused*/) noexcept { - return is_array() ? m_value.array : nullptr; + return is_array() ? m_data.m_value.array : nullptr; } /// get a pointer to the value (array) constexpr const array_t* get_impl_ptr(const array_t* /*unused*/) const noexcept { - return is_array() ? m_value.array : nullptr; + return is_array() ? m_data.m_value.array : nullptr; } /// get a pointer to the value (string) string_t* get_impl_ptr(string_t* /*unused*/) noexcept { - return is_string() ? m_value.string : nullptr; + return is_string() ? m_data.m_value.string : nullptr; } /// get a pointer to the value (string) constexpr const string_t* get_impl_ptr(const string_t* /*unused*/) const noexcept { - return is_string() ? m_value.string : nullptr; + return is_string() ? m_data.m_value.string : nullptr; } /// get a pointer to the value (boolean) boolean_t* get_impl_ptr(boolean_t* /*unused*/) noexcept { - return is_boolean() ? &m_value.boolean : nullptr; + return is_boolean() ? &m_data.m_value.boolean : nullptr; } /// get a pointer to the value (boolean) constexpr const boolean_t* get_impl_ptr(const boolean_t* /*unused*/) const noexcept { - return is_boolean() ? &m_value.boolean : nullptr; + return is_boolean() ? &m_data.m_value.boolean : nullptr; } /// get a pointer to the value (integer number) number_integer_t* get_impl_ptr(number_integer_t* /*unused*/) noexcept { - return is_number_integer() ? &m_value.number_integer : nullptr; + return is_number_integer() ? &m_data.m_value.number_integer : nullptr; } /// get a pointer to the value (integer number) constexpr const number_integer_t* get_impl_ptr(const number_integer_t* /*unused*/) const noexcept { - return is_number_integer() ? &m_value.number_integer : nullptr; + return is_number_integer() ? &m_data.m_value.number_integer : nullptr; } /// get a pointer to the value (unsigned number) number_unsigned_t* get_impl_ptr(number_unsigned_t* /*unused*/) noexcept { - return is_number_unsigned() ? &m_value.number_unsigned : nullptr; + return is_number_unsigned() ? &m_data.m_value.number_unsigned : nullptr; } /// get a pointer to the value (unsigned number) constexpr const number_unsigned_t* get_impl_ptr(const number_unsigned_t* /*unused*/) const noexcept { - return is_number_unsigned() ? &m_value.number_unsigned : nullptr; + return is_number_unsigned() ? &m_data.m_value.number_unsigned : nullptr; } /// get a pointer to the value (floating-point number) number_float_t* get_impl_ptr(number_float_t* /*unused*/) noexcept { - return is_number_float() ? &m_value.number_float : nullptr; + return is_number_float() ? &m_data.m_value.number_float : nullptr; } /// get a pointer to the value (floating-point number) constexpr const number_float_t* get_impl_ptr(const number_float_t* /*unused*/) const noexcept { - return is_number_float() ? &m_value.number_float : nullptr; + return is_number_float() ? &m_data.m_value.number_float : nullptr; } /// get a pointer to the value (binary) binary_t* get_impl_ptr(binary_t* /*unused*/) noexcept { - return is_binary() ? m_value.binary : nullptr; + return is_binary() ? m_data.m_value.binary : nullptr; } /// get a pointer to the value (binary) constexpr const binary_t* get_impl_ptr(const binary_t* /*unused*/) const noexcept { - return is_binary() ? m_value.binary : nullptr; + return is_binary() ? m_data.m_value.binary : nullptr; } /*! @@ -21053,7 +21191,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec #if defined(JSON_HAS_CPP_17) && (defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1910 && _MSC_VER <= 1914)) detail::negation>, #endif -#if defined(JSON_HAS_CPP_17) +#if defined(JSON_HAS_CPP_17) && JSON_HAS_STATIC_RTTI detail::negation>, #endif detail::is_detected_lazy @@ -21090,7 +21228,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - //////////////////// // element access // //////////////////// @@ -21108,7 +21245,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec { JSON_TRY { - return set_parent(m_value.array->at(idx)); + return set_parent(m_data.m_value.array->at(idx)); } JSON_CATCH (std::out_of_range&) { @@ -21131,7 +21268,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec { JSON_TRY { - return m_value.array->at(idx); + return m_data.m_value.array->at(idx); } JSON_CATCH (std::out_of_range&) { @@ -21155,8 +21292,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this)); } - auto it = m_value.object->find(key); - if (it == m_value.object->end()) + auto it = m_data.m_value.object->find(key); + if (it == m_data.m_value.object->end()) { JSON_THROW(out_of_range::create(403, detail::concat("key '", key, "' not found"), this)); } @@ -21175,8 +21312,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this)); } - auto it = m_value.object->find(std::forward(key)); - if (it == m_value.object->end()) + auto it = m_data.m_value.object->find(std::forward(key)); + if (it == m_data.m_value.object->end()) { JSON_THROW(out_of_range::create(403, detail::concat("key '", string_t(std::forward(key)), "' not found"), this)); } @@ -21193,8 +21330,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this)); } - auto it = m_value.object->find(key); - if (it == m_value.object->end()) + auto it = m_data.m_value.object->find(key); + if (it == m_data.m_value.object->end()) { JSON_THROW(out_of_range::create(403, detail::concat("key '", key, "' not found"), this)); } @@ -21213,8 +21350,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this)); } - auto it = m_value.object->find(std::forward(key)); - if (it == m_value.object->end()) + auto it = m_data.m_value.object->find(std::forward(key)); + if (it == m_data.m_value.object->end()) { JSON_THROW(out_of_range::create(403, detail::concat("key '", string_t(std::forward(key)), "' not found"), this)); } @@ -21228,8 +21365,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // implicitly convert null value to an empty array if (is_null()) { - m_type = value_t::array; - m_value.array = create(); + m_data.m_type = value_t::array; + m_data.m_value.array = create(); assert_invariant(); } @@ -21237,17 +21374,17 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (JSON_HEDLEY_LIKELY(is_array())) { // fill up array with null values if given idx is outside range - if (idx >= m_value.array->size()) + if (idx >= m_data.m_value.array->size()) { #if JSON_DIAGNOSTICS // remember array size & capacity before resizing - const auto old_size = m_value.array->size(); - const auto old_capacity = m_value.array->capacity(); + const auto old_size = m_data.m_value.array->size(); + const auto old_capacity = m_data.m_value.array->capacity(); #endif - m_value.array->resize(idx + 1); + m_data.m_value.array->resize(idx + 1); #if JSON_DIAGNOSTICS - if (JSON_HEDLEY_UNLIKELY(m_value.array->capacity() != old_capacity)) + if (JSON_HEDLEY_UNLIKELY(m_data.m_value.array->capacity() != old_capacity)) { // capacity has changed: update all parents set_parents(); @@ -21261,7 +21398,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec assert_invariant(); } - return m_value.array->operator[](idx); + return m_data.m_value.array->operator[](idx); } JSON_THROW(type_error::create(305, detail::concat("cannot use operator[] with a numeric argument with ", type_name()), this)); @@ -21274,7 +21411,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // const operator[] only works for arrays if (JSON_HEDLEY_LIKELY(is_array())) { - return m_value.array->operator[](idx); + return m_data.m_value.array->operator[](idx); } JSON_THROW(type_error::create(305, detail::concat("cannot use operator[] with a numeric argument with ", type_name()), this)); @@ -21287,15 +21424,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // implicitly convert null value to an empty object if (is_null()) { - m_type = value_t::object; - m_value.object = create(); + m_data.m_type = value_t::object; + m_data.m_value.object = create(); assert_invariant(); } // operator[] only works for objects if (JSON_HEDLEY_LIKELY(is_object())) { - auto result = m_value.object->emplace(std::move(key), nullptr); + auto result = m_data.m_value.object->emplace(std::move(key), nullptr); return set_parent(result.first->second); } @@ -21309,8 +21446,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // const operator[] only works for objects if (JSON_HEDLEY_LIKELY(is_object())) { - auto it = m_value.object->find(key); - JSON_ASSERT(it != m_value.object->end()); + auto it = m_data.m_value.object->find(key); + JSON_ASSERT(it != m_data.m_value.object->end()); return it->second; } @@ -21340,15 +21477,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // implicitly convert null value to an empty object if (is_null()) { - m_type = value_t::object; - m_value.object = create(); + m_data.m_type = value_t::object; + m_data.m_value.object = create(); assert_invariant(); } // operator[] only works for objects if (JSON_HEDLEY_LIKELY(is_object())) { - auto result = m_value.object->emplace(std::forward(key), nullptr); + auto result = m_data.m_value.object->emplace(std::forward(key), nullptr); return set_parent(result.first->second); } @@ -21364,8 +21501,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // const operator[] only works for objects if (JSON_HEDLEY_LIKELY(is_object())) { - auto it = m_value.object->find(std::forward(key)); - JSON_ASSERT(it != m_value.object->end()); + auto it = m_data.m_value.object->find(std::forward(key)); + JSON_ASSERT(it != m_data.m_value.object->end()); return it->second; } @@ -21602,7 +21739,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec IteratorType result = end(); - switch (m_type) + switch (m_data.m_type) { case value_t::boolean: case value_t::number_float: @@ -21619,32 +21756,32 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_string()) { AllocatorType alloc; - std::allocator_traits::destroy(alloc, m_value.string); - std::allocator_traits::deallocate(alloc, m_value.string, 1); - m_value.string = nullptr; + std::allocator_traits::destroy(alloc, m_data.m_value.string); + std::allocator_traits::deallocate(alloc, m_data.m_value.string, 1); + m_data.m_value.string = nullptr; } else if (is_binary()) { AllocatorType alloc; - std::allocator_traits::destroy(alloc, m_value.binary); - std::allocator_traits::deallocate(alloc, m_value.binary, 1); - m_value.binary = nullptr; + std::allocator_traits::destroy(alloc, m_data.m_value.binary); + std::allocator_traits::deallocate(alloc, m_data.m_value.binary, 1); + m_data.m_value.binary = nullptr; } - m_type = value_t::null; + m_data.m_type = value_t::null; assert_invariant(); break; } case value_t::object: { - result.m_it.object_iterator = m_value.object->erase(pos.m_it.object_iterator); + result.m_it.object_iterator = m_data.m_value.object->erase(pos.m_it.object_iterator); break; } case value_t::array: { - result.m_it.array_iterator = m_value.array->erase(pos.m_it.array_iterator); + result.m_it.array_iterator = m_data.m_value.array->erase(pos.m_it.array_iterator); break; } @@ -21672,7 +21809,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec IteratorType result = end(); - switch (m_type) + switch (m_data.m_type) { case value_t::boolean: case value_t::number_float: @@ -21690,33 +21827,33 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_string()) { AllocatorType alloc; - std::allocator_traits::destroy(alloc, m_value.string); - std::allocator_traits::deallocate(alloc, m_value.string, 1); - m_value.string = nullptr; + std::allocator_traits::destroy(alloc, m_data.m_value.string); + std::allocator_traits::deallocate(alloc, m_data.m_value.string, 1); + m_data.m_value.string = nullptr; } else if (is_binary()) { AllocatorType alloc; - std::allocator_traits::destroy(alloc, m_value.binary); - std::allocator_traits::deallocate(alloc, m_value.binary, 1); - m_value.binary = nullptr; + std::allocator_traits::destroy(alloc, m_data.m_value.binary); + std::allocator_traits::deallocate(alloc, m_data.m_value.binary, 1); + m_data.m_value.binary = nullptr; } - m_type = value_t::null; + m_data.m_type = value_t::null; assert_invariant(); break; } case value_t::object: { - result.m_it.object_iterator = m_value.object->erase(first.m_it.object_iterator, + result.m_it.object_iterator = m_data.m_value.object->erase(first.m_it.object_iterator, last.m_it.object_iterator); break; } case value_t::array: { - result.m_it.array_iterator = m_value.array->erase(first.m_it.array_iterator, + result.m_it.array_iterator = m_data.m_value.array->erase(first.m_it.array_iterator, last.m_it.array_iterator); break; } @@ -21741,7 +21878,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(type_error::create(307, detail::concat("cannot use erase() with ", type_name()), this)); } - return m_value.object->erase(std::forward(key)); + return m_data.m_value.object->erase(std::forward(key)); } template < typename KeyType, detail::enable_if_t < @@ -21754,10 +21891,10 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(type_error::create(307, detail::concat("cannot use erase() with ", type_name()), this)); } - const auto it = m_value.object->find(std::forward(key)); - if (it != m_value.object->end()) + const auto it = m_data.m_value.object->find(std::forward(key)); + if (it != m_data.m_value.object->end()) { - m_value.object->erase(it); + m_data.m_value.object->erase(it); return 1; } return 0; @@ -21795,7 +21932,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(out_of_range::create(401, detail::concat("array index ", std::to_string(idx), " is out of range"), this)); } - m_value.array->erase(m_value.array->begin() + static_cast(idx)); + m_data.m_value.array->erase(m_data.m_value.array->begin() + static_cast(idx)); } else { @@ -21805,7 +21942,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - //////////// // lookup // //////////// @@ -21821,7 +21957,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_object()) { - result.m_it.object_iterator = m_value.object->find(key); + result.m_it.object_iterator = m_data.m_value.object->find(key); } return result; @@ -21835,7 +21971,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_object()) { - result.m_it.object_iterator = m_value.object->find(key); + result.m_it.object_iterator = m_data.m_value.object->find(key); } return result; @@ -21851,7 +21987,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_object()) { - result.m_it.object_iterator = m_value.object->find(std::forward(key)); + result.m_it.object_iterator = m_data.m_value.object->find(std::forward(key)); } return result; @@ -21867,7 +22003,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_object()) { - result.m_it.object_iterator = m_value.object->find(std::forward(key)); + result.m_it.object_iterator = m_data.m_value.object->find(std::forward(key)); } return result; @@ -21878,7 +22014,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec size_type count(const typename object_t::key_type& key) const { // return 0 for all nonobject types - return is_object() ? m_value.object->count(key) : 0; + return is_object() ? m_data.m_value.object->count(key) : 0; } /// @brief returns the number of occurrences of a key in a JSON object @@ -21888,14 +22024,14 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec size_type count(KeyType && key) const { // return 0 for all nonobject types - return is_object() ? m_value.object->count(std::forward(key)) : 0; + return is_object() ? m_data.m_value.object->count(std::forward(key)) : 0; } /// @brief check the existence of an element in a JSON object /// @sa https://json.nlohmann.me/api/basic_json/contains/ bool contains(const typename object_t::key_type& key) const { - return is_object() && m_value.object->find(key) != m_value.object->end(); + return is_object() && m_data.m_value.object->find(key) != m_data.m_value.object->end(); } /// @brief check the existence of an element in a JSON object @@ -21904,7 +22040,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec detail::is_usable_as_basic_json_key_type::value, int> = 0> bool contains(KeyType && key) const { - return is_object() && m_value.object->find(std::forward(key)) != m_value.object->end(); + return is_object() && m_data.m_value.object->find(std::forward(key)) != m_data.m_value.object->end(); } /// @brief check the existence of an element in a JSON object given a JSON pointer @@ -21923,7 +22059,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - /////////////// // iterators // /////////////// @@ -22062,7 +22197,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - ////////////// // capacity // ////////////// @@ -22074,7 +22208,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/empty/ bool empty() const noexcept { - switch (m_type) + switch (m_data.m_type) { case value_t::null: { @@ -22085,13 +22219,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec case value_t::array: { // delegate call to array_t::empty() - return m_value.array->empty(); + return m_data.m_value.array->empty(); } case value_t::object: { // delegate call to object_t::empty() - return m_value.object->empty(); + return m_data.m_value.object->empty(); } case value_t::string: @@ -22113,7 +22247,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/size/ size_type size() const noexcept { - switch (m_type) + switch (m_data.m_type) { case value_t::null: { @@ -22124,13 +22258,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec case value_t::array: { // delegate call to array_t::size() - return m_value.array->size(); + return m_data.m_value.array->size(); } case value_t::object: { // delegate call to object_t::size() - return m_value.object->size(); + return m_data.m_value.object->size(); } case value_t::string: @@ -22152,18 +22286,18 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/max_size/ size_type max_size() const noexcept { - switch (m_type) + switch (m_data.m_type) { case value_t::array: { // delegate call to array_t::max_size() - return m_value.array->max_size(); + return m_data.m_value.array->max_size(); } case value_t::object: { // delegate call to object_t::max_size() - return m_value.object->max_size(); + return m_data.m_value.object->max_size(); } case value_t::null: @@ -22184,7 +22318,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - /////////////// // modifiers // /////////////// @@ -22196,53 +22329,53 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/clear/ void clear() noexcept { - switch (m_type) + switch (m_data.m_type) { case value_t::number_integer: { - m_value.number_integer = 0; + m_data.m_value.number_integer = 0; break; } case value_t::number_unsigned: { - m_value.number_unsigned = 0; + m_data.m_value.number_unsigned = 0; break; } case value_t::number_float: { - m_value.number_float = 0.0; + m_data.m_value.number_float = 0.0; break; } case value_t::boolean: { - m_value.boolean = false; + m_data.m_value.boolean = false; break; } case value_t::string: { - m_value.string->clear(); + m_data.m_value.string->clear(); break; } case value_t::binary: { - m_value.binary->clear(); + m_data.m_value.binary->clear(); break; } case value_t::array: { - m_value.array->clear(); + m_data.m_value.array->clear(); break; } case value_t::object: { - m_value.object->clear(); + m_data.m_value.object->clear(); break; } @@ -22266,15 +22399,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // transform null object into an array if (is_null()) { - m_type = value_t::array; - m_value = value_t::array; + m_data.m_type = value_t::array; + m_data.m_value = value_t::array; assert_invariant(); } // add element to array (move semantics) - const auto old_capacity = m_value.array->capacity(); - m_value.array->push_back(std::move(val)); - set_parent(m_value.array->back(), old_capacity); + const auto old_capacity = m_data.m_value.array->capacity(); + m_data.m_value.array->push_back(std::move(val)); + set_parent(m_data.m_value.array->back(), old_capacity); // if val is moved from, basic_json move constructor marks it null, so we do not call the destructor } @@ -22299,15 +22432,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // transform null object into an array if (is_null()) { - m_type = value_t::array; - m_value = value_t::array; + m_data.m_type = value_t::array; + m_data.m_value = value_t::array; assert_invariant(); } // add element to array - const auto old_capacity = m_value.array->capacity(); - m_value.array->push_back(val); - set_parent(m_value.array->back(), old_capacity); + const auto old_capacity = m_data.m_value.array->capacity(); + m_data.m_value.array->push_back(val); + set_parent(m_data.m_value.array->back(), old_capacity); } /// @brief add an object to an array @@ -22331,13 +22464,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // transform null object into an object if (is_null()) { - m_type = value_t::object; - m_value = value_t::object; + m_data.m_type = value_t::object; + m_data.m_value = value_t::object; assert_invariant(); } // add element to object - auto res = m_value.object->insert(val); + auto res = m_data.m_value.object->insert(val); set_parent(res.first->second); } @@ -22387,15 +22520,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // transform null object into an array if (is_null()) { - m_type = value_t::array; - m_value = value_t::array; + m_data.m_type = value_t::array; + m_data.m_value = value_t::array; assert_invariant(); } // add element to array (perfect forwarding) - const auto old_capacity = m_value.array->capacity(); - m_value.array->emplace_back(std::forward(args)...); - return set_parent(m_value.array->back(), old_capacity); + const auto old_capacity = m_data.m_value.array->capacity(); + m_data.m_value.array->emplace_back(std::forward(args)...); + return set_parent(m_data.m_value.array->back(), old_capacity); } /// @brief add an object to an object if key does not exist @@ -22412,13 +22545,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // transform null object into an object if (is_null()) { - m_type = value_t::object; - m_value = value_t::object; + m_data.m_type = value_t::object; + m_data.m_value = value_t::object; assert_invariant(); } // add element to array (perfect forwarding) - auto res = m_value.object->emplace(std::forward(args)...); + auto res = m_data.m_value.object->emplace(std::forward(args)...); set_parent(res.first->second); // create result iterator and set iterator to the result of emplace @@ -22436,14 +22569,14 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec iterator insert_iterator(const_iterator pos, Args&& ... args) { iterator result(this); - JSON_ASSERT(m_value.array != nullptr); + JSON_ASSERT(m_data.m_value.array != nullptr); - auto insert_pos = std::distance(m_value.array->begin(), pos.m_it.array_iterator); - m_value.array->insert(pos.m_it.array_iterator, std::forward(args)...); - result.m_it.array_iterator = m_value.array->begin() + insert_pos; + auto insert_pos = std::distance(m_data.m_value.array->begin(), pos.m_it.array_iterator); + m_data.m_value.array->insert(pos.m_it.array_iterator, std::forward(args)...); + result.m_it.array_iterator = m_data.m_value.array->begin() + insert_pos; // This could have been written as: - // result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, cnt, val); + // result.m_it.array_iterator = m_data.m_value.array->insert(pos.m_it.array_iterator, cnt, val); // but the return value of insert is missing in GCC 4.8, so it is written this way instead. set_parents(); @@ -22570,7 +22703,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects", this)); } - m_value.object->insert(first.m_it.object_iterator, last.m_it.object_iterator); + m_data.m_value.object->insert(first.m_it.object_iterator, last.m_it.object_iterator); } /// @brief updates a JSON object from another object, overwriting existing keys @@ -22587,8 +22720,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // implicitly convert null value to an empty object if (is_null()) { - m_type = value_t::object; - m_value.object = create(); + m_data.m_type = value_t::object; + m_data.m_value.object = create(); assert_invariant(); } @@ -22613,16 +22746,16 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec { if (merge_objects && it.value().is_object()) { - auto it2 = m_value.object->find(it.key()); - if (it2 != m_value.object->end()) + auto it2 = m_data.m_value.object->find(it.key()); + if (it2 != m_data.m_value.object->end()) { it2->second.update(it.value(), true); continue; } } - m_value.object->operator[](it.key()) = it.value(); + m_data.m_value.object->operator[](it.key()) = it.value(); #if JSON_DIAGNOSTICS - m_value.object->operator[](it.key()).m_parent = this; + m_data.m_value.object->operator[](it.key()).m_parent = this; #endif } } @@ -22632,12 +22765,12 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec void swap(reference other) noexcept ( std::is_nothrow_move_constructible::value&& std::is_nothrow_move_assignable::value&& - std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_constructible::value&& // NOLINT(cppcoreguidelines-noexcept-swap,performance-noexcept-swap) std::is_nothrow_move_assignable::value ) { - std::swap(m_type, other.m_type); - std::swap(m_value, other.m_value); + std::swap(m_data.m_type, other.m_data.m_type); + std::swap(m_data.m_value, other.m_data.m_value); set_parents(); other.set_parents(); @@ -22649,7 +22782,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec friend void swap(reference left, reference right) noexcept ( std::is_nothrow_move_constructible::value&& std::is_nothrow_move_assignable::value&& - std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_constructible::value&& // NOLINT(cppcoreguidelines-noexcept-swap,performance-noexcept-swap) std::is_nothrow_move_assignable::value ) { @@ -22658,13 +22791,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief exchanges the values /// @sa https://json.nlohmann.me/api/basic_json/swap/ - void swap(array_t& other) // NOLINT(bugprone-exception-escape) + void swap(array_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap) { // swap only works for arrays if (JSON_HEDLEY_LIKELY(is_array())) { using std::swap; - swap(*(m_value.array), other); + swap(*(m_data.m_value.array), other); } else { @@ -22674,13 +22807,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief exchanges the values /// @sa https://json.nlohmann.me/api/basic_json/swap/ - void swap(object_t& other) // NOLINT(bugprone-exception-escape) + void swap(object_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap) { // swap only works for objects if (JSON_HEDLEY_LIKELY(is_object())) { using std::swap; - swap(*(m_value.object), other); + swap(*(m_data.m_value.object), other); } else { @@ -22690,13 +22823,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief exchanges the values /// @sa https://json.nlohmann.me/api/basic_json/swap/ - void swap(string_t& other) // NOLINT(bugprone-exception-escape) + void swap(string_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap) { // swap only works for strings if (JSON_HEDLEY_LIKELY(is_string())) { using std::swap; - swap(*(m_value.string), other); + swap(*(m_data.m_value.string), other); } else { @@ -22706,13 +22839,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief exchanges the values /// @sa https://json.nlohmann.me/api/basic_json/swap/ - void swap(binary_t& other) // NOLINT(bugprone-exception-escape) + void swap(binary_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap) { // swap only works for strings if (JSON_HEDLEY_LIKELY(is_binary())) { using std::swap; - swap(*(m_value.binary), other); + swap(*(m_data.m_value.binary), other); } else { @@ -22728,7 +22861,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (JSON_HEDLEY_LIKELY(is_binary())) { using std::swap; - swap(*(m_value.binary), other); + swap(*(m_data.m_value.binary), other); } else { @@ -22756,31 +22889,31 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec switch (lhs_type) \ { \ case value_t::array: \ - return (*lhs.m_value.array) op (*rhs.m_value.array); \ + return (*lhs.m_data.m_value.array) op (*rhs.m_data.m_value.array); \ \ case value_t::object: \ - return (*lhs.m_value.object) op (*rhs.m_value.object); \ + return (*lhs.m_data.m_value.object) op (*rhs.m_data.m_value.object); \ \ case value_t::null: \ return (null_result); \ \ case value_t::string: \ - return (*lhs.m_value.string) op (*rhs.m_value.string); \ + return (*lhs.m_data.m_value.string) op (*rhs.m_data.m_value.string); \ \ case value_t::boolean: \ - return (lhs.m_value.boolean) op (rhs.m_value.boolean); \ + return (lhs.m_data.m_value.boolean) op (rhs.m_data.m_value.boolean); \ \ case value_t::number_integer: \ - return (lhs.m_value.number_integer) op (rhs.m_value.number_integer); \ + return (lhs.m_data.m_value.number_integer) op (rhs.m_data.m_value.number_integer); \ \ case value_t::number_unsigned: \ - return (lhs.m_value.number_unsigned) op (rhs.m_value.number_unsigned); \ + return (lhs.m_data.m_value.number_unsigned) op (rhs.m_data.m_value.number_unsigned); \ \ case value_t::number_float: \ - return (lhs.m_value.number_float) op (rhs.m_value.number_float); \ + return (lhs.m_data.m_value.number_float) op (rhs.m_data.m_value.number_float); \ \ case value_t::binary: \ - return (*lhs.m_value.binary) op (*rhs.m_value.binary); \ + return (*lhs.m_data.m_value.binary) op (*rhs.m_data.m_value.binary); \ \ case value_t::discarded: \ default: \ @@ -22789,27 +22922,27 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec } \ else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float) \ { \ - return static_cast(lhs.m_value.number_integer) op rhs.m_value.number_float; \ + return static_cast(lhs.m_data.m_value.number_integer) op rhs.m_data.m_value.number_float; \ } \ else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer) \ { \ - return lhs.m_value.number_float op static_cast(rhs.m_value.number_integer); \ + return lhs.m_data.m_value.number_float op static_cast(rhs.m_data.m_value.number_integer); \ } \ else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float) \ { \ - return static_cast(lhs.m_value.number_unsigned) op rhs.m_value.number_float; \ + return static_cast(lhs.m_data.m_value.number_unsigned) op rhs.m_data.m_value.number_float; \ } \ else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned) \ { \ - return lhs.m_value.number_float op static_cast(rhs.m_value.number_unsigned); \ + return lhs.m_data.m_value.number_float op static_cast(rhs.m_data.m_value.number_unsigned); \ } \ else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer) \ { \ - return static_cast(lhs.m_value.number_unsigned) op rhs.m_value.number_integer; \ + return static_cast(lhs.m_data.m_value.number_unsigned) op rhs.m_data.m_value.number_integer; \ } \ else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned) \ { \ - return lhs.m_value.number_integer op static_cast(rhs.m_value.number_unsigned); \ + return lhs.m_data.m_value.number_integer op static_cast(rhs.m_data.m_value.number_unsigned); \ } \ else if(compares_unordered(lhs, rhs))\ {\ @@ -22826,8 +22959,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // an operation is computed as an odd number of inverses of others static bool compares_unordered(const_reference lhs, const_reference rhs, bool inverse = false) noexcept { - if ((lhs.is_number_float() && std::isnan(lhs.m_value.number_float) && rhs.is_number()) - || (rhs.is_number_float() && std::isnan(rhs.m_value.number_float) && lhs.is_number())) + if ((lhs.is_number_float() && std::isnan(lhs.m_data.m_value.number_float) && rhs.is_number()) + || (rhs.is_number_float() && std::isnan(rhs.m_data.m_value.number_float) && lhs.is_number())) { return true; } @@ -23171,7 +23304,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec #endif // JSON_NO_IO /// @} - ///////////////////// // deserialization // ///////////////////// @@ -23328,7 +23460,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_HEDLEY_RETURNS_NON_NULL const char* type_name() const noexcept { - switch (m_type) + switch (m_data.m_type) { case value_t::null: return "null"; @@ -23352,17 +23484,43 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec } } - JSON_PRIVATE_UNLESS_TESTED: ////////////////////// // member variables // ////////////////////// - /// the type of the current element - value_t m_type = value_t::null; + struct data + { + /// the type of the current element + value_t m_type = value_t::null; - /// the value of the current element - json_value m_value = {}; + /// the value of the current element + json_value m_value = {}; + + data(const value_t v) + : m_type(v), m_value(v) + { + } + + data(size_type cnt, const basic_json& val) + : m_type(value_t::array) + { + m_value.array = create(cnt, val); + } + + data() noexcept = default; + data(data&&) noexcept = default; + data(const data&) noexcept = delete; + data& operator=(data&&) noexcept = delete; + data& operator=(const data&) noexcept = delete; + + ~data() noexcept + { + m_value.destroy(m_type); + } + }; + + data m_data = {}; #if JSON_DIAGNOSTICS /// a pointer to a parent value (for debugging purposes) @@ -23543,7 +23701,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec return from_cbor(ptr, ptr + len, strict, allow_exceptions, tag_handler); } - JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len)) static basic_json from_cbor(detail::span_input_adapter&& i, @@ -23667,7 +23824,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec return res ? result : basic_json(value_t::discarded); } - /// @brief create a JSON value from an input in BJData format /// @sa https://json.nlohmann.me/api/basic_json/from_bjdata/ template @@ -23890,7 +24046,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec } // make sure the top element of the pointer exists - json_pointer top_pointer = ptr.top(); + json_pointer const top_pointer = ptr.top(); if (top_pointer != ptr) { result.at(top_pointer); @@ -23902,7 +24058,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // parent must exist when performing patch add per RFC6902 specs basic_json& parent = result.at(ptr); - switch (parent.m_type) + switch (parent.m_data.m_type) { case value_t::null: case value_t::object: @@ -23948,7 +24104,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec }; // wrapper for "remove" operation; remove value at ptr - const auto operation_remove = [this, &result](json_pointer & ptr) + const auto operation_remove = [this, & result](json_pointer & ptr) { // get reference to parent of JSON pointer ptr const auto last_path = ptr.back(); @@ -23991,13 +24147,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec bool string_type) -> basic_json & { // find value - auto it = val.m_value.object->find(member); + auto it = val.m_data.m_value.object->find(member); // context-sensitive error message - const auto error_msg = (op == "op") ? "operation" : detail::concat("operation '", op, '\''); + const auto error_msg = (op == "op") ? "operation" : detail::concat("operation '", op, '\''); // NOLINT(bugprone-unused-local-non-trivial-variable) // check if desired value is present - if (JSON_HEDLEY_UNLIKELY(it == val.m_value.object->end())) + if (JSON_HEDLEY_UNLIKELY(it == val.m_data.m_value.object->end())) { // NOLINTNEXTLINE(performance-inefficient-string-concatenation) JSON_THROW(parse_error::create(105, 0, detail::concat(error_msg, " must have member '", member, "'"), &val)); @@ -24052,7 +24208,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec json_pointer from_ptr(from_path); // the "from" location must exist - use at() - basic_json v = result.at(from_ptr); + basic_json const v = result.at(from_ptr); // The move operation is functionally identical to a // "remove" operation on the "from" location, followed @@ -24069,7 +24225,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec const json_pointer from_ptr(from_path); // the "from" location must exist - use at() - basic_json v = result.at(from_ptr); + basic_json const v = result.at(from_ptr); // The copy is functionally identical to an "add" // operation at the target location using the value @@ -24311,7 +24467,11 @@ inline namespace json_literals /// @brief user-defined string literal for JSON values /// @sa https://json.nlohmann.me/api/basic_json/operator_literal_json/ JSON_HEDLEY_NON_NULL(1) -inline nlohmann::json operator "" _json(const char* s, std::size_t n) +#if !defined(JSON_HEDLEY_GCC_VERSION) || JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) + inline nlohmann::json operator ""_json(const char* s, std::size_t n) +#else + inline nlohmann::json operator "" _json(const char* s, std::size_t n) +#endif { return nlohmann::json::parse(s, s + n); } @@ -24319,7 +24479,11 @@ inline nlohmann::json operator "" _json(const char* s, std::size_t n) /// @brief user-defined string literal for JSON pointer /// @sa https://json.nlohmann.me/api/basic_json/operator_literal_json_pointer/ JSON_HEDLEY_NON_NULL(1) -inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n) +#if !defined(JSON_HEDLEY_GCC_VERSION) || JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) + inline nlohmann::json::json_pointer operator ""_json_pointer(const char* s, std::size_t n) +#else + inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n) +#endif { return nlohmann::json::json_pointer(std::string(s, n)); } @@ -24338,7 +24502,7 @@ namespace std // NOLINT(cert-dcl58-cpp) /// @brief hash value for JSON objects /// @sa https://json.nlohmann.me/api/basic_json/std_hash/ NLOHMANN_BASIC_JSON_TPL_DECLARATION -struct hash +struct hash // NOLINT(cert-dcl58-cpp) { std::size_t operator()(const nlohmann::NLOHMANN_BASIC_JSON_TPL& j) const { @@ -24371,8 +24535,8 @@ struct less< ::nlohmann::detail::value_t> // do not remove the space after '<', /// @brief exchanges the values of two JSON objects /// @sa https://json.nlohmann.me/api/basic_json/std_swap/ NLOHMANN_BASIC_JSON_TPL_DECLARATION -inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC_JSON_TPL& j2) noexcept( // NOLINT(readability-inconsistent-declaration-parameter-name) - is_nothrow_move_constructible::value&& // NOLINT(misc-redundant-expression) +inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC_JSON_TPL& j2) noexcept( // NOLINT(readability-inconsistent-declaration-parameter-name, cert-dcl58-cpp) + is_nothrow_move_constructible::value&& // NOLINT(misc-redundant-expression,cppcoreguidelines-noexcept-swap,performance-noexcept-swap) is_nothrow_move_assignable::value) { j1.swap(j2); @@ -24383,17 +24547,22 @@ inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC } // namespace std #if JSON_USE_GLOBAL_UDLS - using nlohmann::literals::json_literals::operator "" _json; // NOLINT(misc-unused-using-decls,google-global-names-in-headers) - using nlohmann::literals::json_literals::operator "" _json_pointer; //NOLINT(misc-unused-using-decls,google-global-names-in-headers) + #if !defined(JSON_HEDLEY_GCC_VERSION) || JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) + using nlohmann::literals::json_literals::operator ""_json; // NOLINT(misc-unused-using-decls,google-global-names-in-headers) + using nlohmann::literals::json_literals::operator ""_json_pointer; //NOLINT(misc-unused-using-decls,google-global-names-in-headers) + #else + using nlohmann::literals::json_literals::operator "" _json; // NOLINT(misc-unused-using-decls,google-global-names-in-headers) + using nlohmann::literals::json_literals::operator "" _json_pointer; //NOLINT(misc-unused-using-decls,google-global-names-in-headers) + #endif #endif // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -24428,16 +24597,17 @@ inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM #undef JSON_HAS_THREE_WAY_COMPARISON #undef JSON_HAS_RANGES + #undef JSON_HAS_STATIC_RTTI #undef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON #endif // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT diff --git a/common/log.h b/common/log.h index e4e1b9f4f..e4edcac7d 100644 --- a/common/log.h +++ b/common/log.h @@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: // INTERNAL, DO NOT USE // USE LOG() INSTEAD // -#ifndef _MSC_VER +#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER) #define LOG_IMPL(str, ...) \ do { \ if (LOG_TARGET != nullptr) \ @@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: // INTERNAL, DO NOT USE // USE LOG_TEE() INSTEAD // -#ifndef _MSC_VER +#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER) #define LOG_TEE_IMPL(str, ...) \ do { \ if (LOG_TARGET != nullptr) \ @@ -297,7 +297,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: #ifndef _MSC_VER #define LOG(...) LOG_IMPL(__VA_ARGS__, "") #else - #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "") + #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "") #endif // Main TEE macro. @@ -311,7 +311,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: #ifndef _MSC_VER #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") #else - #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "") + #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "") #endif // LOG macro variants with auto endline. @@ -319,8 +319,8 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") #else - #define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n") - #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n") + #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n") + #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n") #endif // INTERNAL, DO NOT USE @@ -566,6 +566,7 @@ inline void log_print_usage() printf(" --log-new Create a separate new log file on start. " "Each log file will have unique name: \"..log\"\n"); printf(" --log-append Don't truncate the old log file.\n"); + printf("\n"); } #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp new file mode 100644 index 000000000..3ca112ef1 --- /dev/null +++ b/common/ngram-cache.cpp @@ -0,0 +1,282 @@ +#include "ngram-cache.h" +#include "common.h" +#include "log.h" + +#include +#include + +void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, + std::vector & inp, int nnew, bool print_progress) { + const int64_t t_start_ms = ggml_time_ms(); + const int64_t inp_size = inp.size(); + + const int64_t n_todo = inp_size * (ngram_max - ngram_min + 1); + int64_t n_done = 0; + + for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) { + const int64_t i_start = std::max(inp_size - nnew, ngram_size); + for (int64_t i = i_start; i < inp_size; ++i) { + const int64_t ngram_start = i - ngram_size; + llama_ngram ngram(&inp[ngram_start], ngram_size); + const llama_token token = inp[i]; + + llama_ngram_cache::iterator part_it = ngram_cache.find(ngram); + if (part_it == ngram_cache.end()) { + llama_ngram_cache_part part; + part.emplace(token, 1); + ngram_cache.emplace(ngram, part); + } else { + llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token); + if (token_count_it == part_it->second.end()) { + part_it->second.emplace(token, 1); + } else { + token_count_it->second++; + } + } + ++n_done; + + if (print_progress && n_done % 10000000 == 0) { + const int64_t t_now_ms = ggml_time_ms(); + const int64_t eta_ms = (inp_size*(ngram_max-ngram_min+1) - n_done) * (t_now_ms - t_start_ms) / n_done; + const int64_t eta_min = eta_ms / (60*1000); + const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000; + + fprintf(stderr, "%s: %" PRId64 "/%" PRId64 " done, ETA: %02" PRId64 ":%02" PRId64 "\n", __func__, n_done, n_todo, eta_min, eta_s); + } + } + } +} + +// Helper function to get a token from the combined, speculative sequence of inp and draft. +static llama_token get_token(const std::vector & inp, const std::vector & draft, const size_t i) { + return i < inp.size() ? inp[i] : draft[1 + i - inp.size()]; +} + +// If sample size or percentage are below these thresholds the draft is aborted early: +constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2, 2, 1, 1}; +constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50}; +constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2}; +constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66}; + +// Helper function that tries to draft a token from only the static ngram cache: +static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) { + llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); + if (part_static_it == nc_static.end()) { + return -1; + } + const llama_ngram_cache_part part_static = part_static_it->second; + + int max_count_static = 0; + int sum_count_static = 0; + llama_token max_token = -1; + + for (std::pair token_count_static : part_static) { + const llama_token token = token_count_static.first; + const int32_t count_static = token_count_static.second; + + if (count_static > max_count_static) { + max_token = token; + max_count_static = count_static; + } + sum_count_static += count_static; + } + + if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) { + return -1; + } + if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) { + return -1; + } + return max_token; +} + +// Try to draft a token from primary cache (context/dynamic), validate with static cache: +static llama_token try_draft( + llama_ngram_cache & nc_primary, const std::vector & ngrams_primary, llama_ngram_cache_part & part_static, + const int * min_sample_size, const int * min_percent) { + + llama_token drafted_token = -1; + + for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) { + const llama_ngram ngram_primary = ngrams_primary[i]; + + llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); + if (part_primary_it == nc_primary.end()) { + continue; + } + const llama_ngram_cache_part part_primary = part_primary_it->second; + + int max_count_primary = 0; + int max_count_static = 0; + int sum_count_primary = 0; + llama_token max_token = -1; + + for (std::pair token_count_primary : part_primary) { + const llama_token token = token_count_primary.first; + + llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token); + + const int32_t count_primary = token_count_primary.second; + const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1; + + if (count_primary*count_static > max_count_primary*max_count_static) { + max_token = token; + max_count_primary = count_primary; + max_count_static = count_static; + } + sum_count_primary += count_primary; + } + + if (sum_count_primary < min_sample_size[i]) { + continue; + } + if (100*max_count_primary < min_percent[i]*sum_count_primary) { + continue;; + } + drafted_token = max_token; + } + + return drafted_token; +} + +void llama_ngram_cache_draft( + std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, + llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static +) { + GGML_ASSERT(draft.size() == 1); + const int inp_size = inp.size(); + + if (inp_size < LLAMA_NGRAM_STATIC) { + return; + } + + while ((int) draft.size()-1 < n_draft) { + llama_token drafted_token = -1; + + const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; + llama_ngram ngram_static; + for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) { + ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j); + } + llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); + llama_ngram_cache_part part_static; + if (part_static_it != nc_static.end()) { + part_static = part_static_it->second; + } + + // cd = context + dynamic + std::vector ngrams_cd; + for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) { + const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1; + llama_ngram ngram_cd; + for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) { + ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j); + } + ngrams_cd.push_back(ngram_cd); + } + if (drafted_token == -1) { + drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax); + } + if (drafted_token == -1) { + drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict); + } + if (drafted_token == -1) { + drafted_token = try_draft(nc_static, ngram_static); + } + + if (drafted_token == -1) { + break; + } + + LOG(" - draft candidate: token=%d\n", drafted_token); + draft.push_back(drafted_token); + } +} + +void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) { + std::ofstream file_out(filename, std::ios::binary); + for (std::pair item : ngram_cache) { + const llama_ngram ngram = item.first; + llama_ngram_cache_part token_counts = item.second; + GGML_ASSERT(!token_counts.empty()); + const int32_t ntokens = token_counts.size(); + GGML_ASSERT(ntokens > 0); + + file_out.write(reinterpret_cast(&ngram), sizeof(llama_ngram)); + file_out.write(reinterpret_cast(&ntokens), sizeof(int32_t)); + for (std::pair item2 : token_counts) { + const llama_token token = item2.first; + const int32_t count = item2.second; + GGML_ASSERT(count > 0); + + file_out.write(reinterpret_cast(&token), sizeof(llama_token)); + file_out.write(reinterpret_cast(&count), sizeof(int32_t)); + } + } + +} + +llama_ngram_cache llama_ngram_cache_load(std::string & filename) { + std::ifstream hashmap_file(filename, std::ios::binary); + if (!hashmap_file) { + throw std::ifstream::failure("Unable to open file " + filename); + } + llama_ngram_cache ngram_cache; + + llama_ngram ngram; + int32_t ntokens; + llama_token token; + int32_t count; + + char * ngramc = reinterpret_cast(&ngram); + char * ntokensc = reinterpret_cast(&ntokens); + char * tokenc = reinterpret_cast(&token); + char * countc = reinterpret_cast(&count); + while(hashmap_file.read(ngramc, sizeof(llama_ngram))) { + GGML_ASSERT(!hashmap_file.eof()); + GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t))); + GGML_ASSERT(ntokens > 0); + llama_ngram_cache_part token_counts; + + for (int i = 0; i < ntokens; ++i) { + GGML_ASSERT(!hashmap_file.eof()); + GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token))); + GGML_ASSERT(!hashmap_file.eof()); + GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t))); + GGML_ASSERT(count > 0); + token_counts.emplace(token, count); + } + + ngram_cache.emplace(ngram, token_counts); + } + GGML_ASSERT(hashmap_file.eof()); + + return ngram_cache; +} + +void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) { + for (std::pair ngram_part : ngram_cache_add) { + const llama_ngram ngram = ngram_part.first; + llama_ngram_cache_part part = ngram_part.second; + + llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram); + if (part_merged_it == ngram_cache_target.end()) { + ngram_cache_target.emplace(ngram, part); + continue; + } + + for (std::pair token_count : part) { + const llama_token token = token_count.first; + const int32_t count = token_count.second; + GGML_ASSERT(count > 0); + + llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token); + if (token_count_merged_it == part_merged_it->second.end()) { + part_merged_it->second.emplace(token, count); + continue; + } + + token_count_merged_it->second += count; + } + } +} diff --git a/common/ngram-cache.h b/common/ngram-cache.h new file mode 100644 index 000000000..e4fa4cbd1 --- /dev/null +++ b/common/ngram-cache.h @@ -0,0 +1,94 @@ +#pragma once + +#include "llama.h" + +#include +#include +#include + +#define LLAMA_NGRAM_MIN 1 +#define LLAMA_NGRAM_MAX 4 +#define LLAMA_NGRAM_STATIC 2 + +// Data structures to map n-grams to empirical token probabilities: + +struct llama_ngram { + llama_token tokens[LLAMA_NGRAM_MAX]; + + llama_ngram() { + for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + tokens[i] = -1; + } + } + + llama_ngram(const llama_token * input, const int ngram_size) { + for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + tokens[i] = i < ngram_size ? input[i] : -1; + } + } + + bool operator==(const llama_ngram & other) const { + for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + if (tokens[i] != other.tokens[i]) { + return false; + } + } + return true; + } +}; + +struct llama_ngram_hash_function { + size_t operator()(const llama_ngram & ngram) const { + size_t hash = 0; + for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + hash ^= std::hash{}(ngram.tokens[i]); + } + return hash; + } +}; + +// token -> number of times token has been seen +typedef std::unordered_map llama_ngram_cache_part; + +// n-gram -> empirical distribution of following tokens +typedef std::unordered_map llama_ngram_cache; + + +// Update an ngram cache with tokens. +// ngram_cache: the cache to modify. +// ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data. +// inp_data: the token sequence with which to update ngram_cache. +// nnew: how many new tokens have been appended to inp_data since the last call to this function. +// print_progress: whether to print progress to stderr. +// +// In order to get correct results inp_data can ONLY BE APPENDED TO. +// Changes in the middle need a complete rebuild. +void llama_ngram_cache_update( + llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp_data, int nnew, bool print_progress); + +// Try to draft tokens from ngram caches. +// inp: the tokens generated so far. +// draft: the token sequence to draft. Expected to initially contain the previously sampled token. +// n_draft: maximum number of tokens to add to draft. +// ngram_min/gram_max: the min/max size of the ngrams in nc_context and nc_dynamic. +// nc_context: ngram cache based on current context. +// nc_dynamic: ngram cache based on previous user generations. +// nc_static: ngram cache generated from a large text corpus, used for validation. +void llama_ngram_cache_draft( + std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, + llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static); + +// Save an ngram cache to a file. +// ngram_cache: the ngram cache to save. +// filename: the path under which to save the ngram cache. +void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename); + +// Load an ngram cache saved with llama_ngram_cache_save. +// filename: the path from which to load the ngram cache. +// returns: an ngram cache containing the information saved to filename. +llama_ngram_cache llama_ngram_cache_load(std::string & filename); + +// Merge two ngram caches. +// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add. +// ngram_cache_add: the ngram cache to add to ngram_cache_target. +void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add); diff --git a/common/sampling.cpp b/common/sampling.cpp index 8e45909f1..45d68b26c 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -13,6 +13,14 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_ // will be empty (default) if there are parse errors if (result->parsed_grammar.rules.empty()) { fprintf(stderr, "%s: failed to parse grammar\n", __func__); + delete result; + return nullptr; + } + + // Ensure that there is a "root" node. + if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) { + fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__); + delete result; return nullptr; } @@ -102,15 +110,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) { std::string llama_sampling_order_print(const llama_sampling_params & params) { std::string result = "CFG -> Penalties "; if (params.mirostat == 0) { - for (auto s : params.samplers_sequence) { - switch (s) { - case 'k': result += "-> top_k "; break; - case 'f': result += "-> tfs_z "; break; - case 'y': result += "-> typical_p "; break; - case 'p': result += "-> top_p "; break; - case 'm': result += "-> min_p "; break; - case 't': result += "-> temp "; break; - default : break; + for (auto sampler_type : params.samplers_sequence) { + const auto sampler_type_name = sampler_type_to_name_string(sampler_type); + if (!sampler_type_name.empty()) { + result += "-> " + sampler_type_name + " "; } } } else { @@ -125,25 +128,33 @@ static void sampler_queue( struct llama_context * ctx_main, const llama_sampling_params & params, llama_token_data_array & cur_p, - size_t & min_keep) { - const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - + size_t min_keep) { const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; + const float dynatemp_range = params.dynatemp_range; + const float dynatemp_exponent = params.dynatemp_exponent; + const int32_t top_k = params.top_k; const float top_p = params.top_p; const float min_p = params.min_p; const float tfs_z = params.tfs_z; const float typical_p = params.typical_p; - const std::string & samplers_sequence = params.samplers_sequence; + const std::vector & samplers_sequence = params.samplers_sequence; - for (auto s : samplers_sequence) { - switch (s){ - case 'k': llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; - case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; - case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; - case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; - case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; - case 't': llama_sample_temp (ctx_main, &cur_p, temp); break; + for (auto sampler_type : samplers_sequence) { + switch (sampler_type) { + case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break; + case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break; + case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break; + case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break; + case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break; + case llama_sampler_type::TEMPERATURE: + if (dynatemp_range > 0) { + float dynatemp_min = std::max(0.0f, temp - dynatemp_range); + float dynatemp_max = std::max(0.0f, temp + dynatemp_range); + llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent); + } else { + llama_sample_temp(ctx_main, &cur_p, temp); + } + break; default : break; } } @@ -157,76 +168,20 @@ static llama_token llama_sampling_sample_impl( bool is_resampling) { // Add a parameter to indicate if we are resampling const llama_sampling_params & params = ctx_sampling->params; - const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - const float temp = params.temp; - const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; - const float penalty_repeat = params.penalty_repeat; - const float penalty_freq = params.penalty_freq; - const float penalty_present = params.penalty_present; const int mirostat = params.mirostat; const float mirostat_tau = params.mirostat_tau; const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - - auto & prev = ctx_sampling->prev; - auto & cur = ctx_sampling->cur; + std::vector original_logits; + auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits); + if (!is_resampling) { + GGML_ASSERT(!original_logits.empty()); + } llama_token id = 0; - // Get a pointer to the logits float * logits = llama_get_logits_ith(ctx_main, idx); - // Declare original_logits at the beginning of the function scope - std::vector original_logits; - - if (!is_resampling) { - // Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this. - original_logits = std::vector(logits, logits + llama_n_vocab(llama_get_model(ctx_main))); - } - - // apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - cur.clear(); - - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array cur_p = { cur.data(), cur.size(), false }; - - if (ctx_cfg) { - llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale); - } - - // apply penalties - const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev; - const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n); - if (penalty_tokens_used_size) { - const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))]; - - llama_sample_repetition_penalties(ctx_main, &cur_p, - penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size, - penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present); - - if (!penalize_nl) { - for (size_t idx = 0; idx < cur_p.size; idx++) { - if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) { - cur_p.data[idx].logit = nl_logit; - break; - } - } - } - } - - // If we are in the resampling phase, apply grammar checks before sampling logic - if (is_resampling && ctx_sampling->grammar != NULL) { - llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); - } - if (temp < 0.0) { // greedy sampling, with probs llama_sample_softmax(ctx_main, &cur_p); @@ -244,7 +199,7 @@ static llama_token llama_sampling_sample_impl( id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu); } else { // temperature sampling - size_t min_keep = std::max(1, params.n_probs); + size_t min_keep = std::max(1, params.min_keep); sampler_queue(ctx_main, params, cur_p, min_keep); @@ -261,7 +216,7 @@ static llama_token llama_sampling_sample_impl( // } //} - LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str()); + //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str()); } } @@ -290,6 +245,81 @@ static llama_token llama_sampling_sample_impl( return id; } +static llama_token_data_array llama_sampling_prepare_impl( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + const int idx, + bool apply_grammar, + std::vector * original_logits) { + const llama_sampling_params & params = ctx_sampling->params; + + const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); + + const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; + const float penalty_repeat = params.penalty_repeat; + const float penalty_freq = params.penalty_freq; + const float penalty_present = params.penalty_present; + + const bool penalize_nl = params.penalize_nl; + + auto & prev = ctx_sampling->prev; + auto & cur = ctx_sampling->cur; + + // Get a pointer to the logits + float * logits = llama_get_logits_ith(ctx_main, idx); + + if (apply_grammar && original_logits != NULL) { + // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this. + *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))}; + } + + // apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + if (ctx_cfg) { + float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx); + llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale); + } + + cur.clear(); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array cur_p = { cur.data(), cur.size(), false }; + + // apply penalties + const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev; + const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n); + if (penalty_tokens_used_size) { + const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))]; + + llama_sample_repetition_penalties(ctx_main, &cur_p, + penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size, + penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present); + + if (!penalize_nl) { + for (size_t idx = 0; idx < cur_p.size; idx++) { + if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) { + cur_p.data[idx].logit = nl_logit; + break; + } + } + } + } + + // apply grammar checks before sampling logic + if (apply_grammar && ctx_sampling->grammar != NULL) { + llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); + } + + return cur_p; +} + llama_token llama_sampling_sample( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, @@ -299,6 +329,16 @@ llama_token llama_sampling_sample( return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false); } +llama_token_data_array llama_sampling_prepare( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + const int idx, + bool apply_grammar, + std::vector * original_logits) { + return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits); +} + void llama_sampling_accept( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, diff --git a/common/sampling.h b/common/sampling.h index f16ef97e3..56ed991b8 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -8,25 +8,46 @@ #include #include +// sampler types +enum class llama_sampler_type : char { + TOP_K = 'k', + TOP_P = 'p', + MIN_P = 'm', + TFS_Z = 'f', + TYPICAL_P = 'y', + TEMPERATURE = 't' +}; + // sampling parameters typedef struct llama_sampling_params { int32_t n_prev = 64; // number of previous tokens to remember int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typical_p = 1.00f; // 1.0 = disabled - float temp = 0.80f; // 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.10f; // 1.0 = disabled + float penalty_repeat = 1.00f; // 1.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled float penalty_present = 0.00f; // 0.0 = disabled int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token - std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp + bool penalize_nl = false; // consider newlines as a repeatable token + + std::vector samplers_sequence = { + llama_sampler_type::TOP_K, + llama_sampler_type::TFS_Z, + llama_sampler_type::TYPICAL_P, + llama_sampler_type::TOP_P, + llama_sampler_type::MIN_P, + llama_sampler_type::TEMPERATURE + }; std::string grammar; // optional BNF-like grammar to constrain sampling @@ -110,6 +131,15 @@ llama_token llama_sampling_sample( struct llama_context * ctx_cfg, int idx = 0); +// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters. +llama_token_data_array llama_sampling_prepare( + struct llama_sampling_context * ctx_sampling, + struct llama_context * ctx_main, + struct llama_context * ctx_cfg, + int idx = 0, + bool apply_grammar = true, + std::vector * original_logits = nullptr); + void llama_sampling_accept( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, diff --git a/common/train.cpp b/common/train.cpp index e6f2f7a2f..0dbfd24df 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -31,7 +31,7 @@ struct train_state * init_train_state() { state->opt = new struct ggml_opt_context; state->opt->ctx = NULL; - state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; state->opt->loss_after = 0.0f; @@ -556,7 +556,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g std::string opt_type; GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { - opt->params.type = GGML_OPT_ADAM; + opt->params.type = GGML_OPT_TYPE_ADAM; GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); @@ -568,7 +568,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { - opt->params.type = GGML_OPT_LBFGS; + opt->params.type = GGML_OPT_TYPE_LBFGS; GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); @@ -603,7 +603,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); switch (opt->params.type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); @@ -622,7 +622,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * gguf_add_tensor(fctx, opt->adam.pf); } } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); @@ -1363,12 +1363,12 @@ bool consume_common_train_arg( *invalid_param = true; return true; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - params->n_gpu_layers = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); -#endif + if (llama_supports_gpu_offload()) { + params->n_gpu_layers = std::stoi(argv[i]); + } else { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } } else if (arg == "-h" || arg == "--help") { params->print_usage = true; return true; diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 203eaf64b..7e601170e 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -8,9 +8,10 @@ import json import os import re import sys +from abc import ABC, abstractmethod from enum import IntEnum from pathlib import Path -from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast import numpy as np import torch @@ -22,6 +23,8 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf +from convert import LlamaHfVocab, permute + ###### MODEL DEFINITIONS ###### @@ -34,7 +37,12 @@ class SentencePieceTokenTypes(IntEnum): BYTE = 6 -class Model: +AnyModel = TypeVar("AnyModel", bound="type[Model]") + + +class Model(ABC): + _model_classes: dict[str, type[Model]] = {} + def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): self.dir_model = dir_model self.ftype = ftype @@ -45,8 +53,21 @@ class Model: self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") self.part_names = self._get_part_names() self.hparams = Model.load_hparams(self.dir_model) - self.model_arch = self._get_model_architecture() self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) + self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) + + @property + @abstractmethod + def model_arch(self) -> gguf.MODEL_ARCH: + pass + + def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any: + key = next((k for k in keys if k in self.hparams), None) + if key is not None: + return self.hparams[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") def set_vocab(self): self._set_vocab_gpt2() @@ -68,28 +89,46 @@ class Model: def set_gguf_parameters(self): self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams.get( - "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), - )) - if (n_ctx := self.hparams.get("max_position_embeddings")) is not None: + self.gguf_writer.add_block_count(self.block_count) + + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: self.gguf_writer.add_context_length(n_ctx) - if (n_embd := self.hparams.get("hidden_size")) is not None: - self.gguf_writer.add_embedding_length(n_embd) - if (n_ff := self.hparams.get("intermediate_size")) is not None: + print(f"gguf: context length = {n_ctx}") + + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + self.gguf_writer.add_embedding_length(n_embd) + print(f"gguf: embedding length = {n_embd}") + + if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) - if (n_head := self.hparams.get("num_attention_heads")) is not None: - self.gguf_writer.add_head_count(n_head) + print(f"gguf: feed forward length = {n_ff}") + + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + self.gguf_writer.add_head_count(n_head) + print(f"gguf: head count = {n_head}") + if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) + print(f"gguf: key-value head count = {n_head_kv}") - if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None: - self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps) + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + print(f"gguf: rope theta = {rope_theta}") + if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + print(f"gguf: rms norm epsilon = {f_rms_eps}") + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + print(f"gguf: layer norm epsilon = {f_norm_eps}") if (n_experts := self.hparams.get("num_local_experts")) is not None: self.gguf_writer.add_expert_count(n_experts) + print(f"gguf: expert count = {n_experts}") if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) + print(f"gguf: experts used count = {n_experts_used}") - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + self.gguf_writer.add_file_type(self.ftype) + print(f"gguf: file type = {self.ftype}") def write_tensors(self): block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) @@ -158,37 +197,22 @@ class Model: with open(dir_model / "config.json", "r", encoding="utf-8") as f: return json.load(f) - @staticmethod - def from_model_architecture(model_architecture): - if model_architecture == "GPTNeoXForCausalLM": - return GPTNeoXModel - if model_architecture == "BloomForCausalLM": - return BloomModel - if model_architecture == "MPTForCausalLM": - return MPTModel - if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): - return BaichuanModel - if model_architecture in ("FalconForCausalLM", "RWForCausalLM"): - return FalconModel - if model_architecture == "GPTBigCodeForCausalLM": - return StarCoderModel - if model_architecture == "GPTRefactForCausalLM": - return RefactModel - if model_architecture == "PersimmonForCausalLM": - return PersimmonModel - if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): - return StableLMModel - if model_architecture == "QWenLMHeadModel": - return QwenModel - if model_architecture == "MixtralForCausalLM": - return MixtralModel - if model_architecture == "GPT2LMHeadModel": - return GPT2Model - if model_architecture == "PhiForCausalLM": - return Phi2Model - if model_architecture == "PlamoForCausalLM": - return PlamoModel - return Model + @classmethod + def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: + assert names + + def func(modelcls: type[Model]): + for name in names: + cls._model_classes[name] = modelcls + return modelcls + return func + + @classmethod + def from_model_architecture(cls, arch): + try: + return cls._model_classes[arch] + except KeyError: + raise NotImplementedError(f'Architecture {arch!r} not supported!') from None def _is_model_safetensors(self) -> bool: return Model.count_model_parts(self.dir_model, ".safetensors") > 0 @@ -203,43 +227,10 @@ class Model: return ("pytorch_model.bin",) return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) - def _get_model_architecture(self) -> gguf.MODEL_ARCH: - arch = self.hparams["architectures"][0] - if arch == "GPTNeoXForCausalLM": - return gguf.MODEL_ARCH.GPTNEOX - if arch == "BloomForCausalLM": - return gguf.MODEL_ARCH.BLOOM - if arch == "MPTForCausalLM": - return gguf.MODEL_ARCH.MPT - if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): - return gguf.MODEL_ARCH.BAICHUAN - if arch in ("FalconForCausalLM", "RWForCausalLM"): - return gguf.MODEL_ARCH.FALCON - if arch == "GPTBigCodeForCausalLM": - return gguf.MODEL_ARCH.STARCODER - if arch == "GPTRefactForCausalLM": - return gguf.MODEL_ARCH.REFACT - if arch == "PersimmonForCausalLM": - return gguf.MODEL_ARCH.PERSIMMON - if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): - return gguf.MODEL_ARCH.STABLELM - if arch == "QWenLMHeadModel": - return gguf.MODEL_ARCH.QWEN - if arch == "MixtralForCausalLM": - return gguf.MODEL_ARCH.LLAMA - if arch == "GPT2LMHeadModel": - return gguf.MODEL_ARCH.GPT2 - if arch == "PhiForCausalLM": - return gguf.MODEL_ARCH.PHI2 - if arch == "PlamoForCausalLM": - return gguf.MODEL_ARCH.PLAMO - - raise NotImplementedError(f'Architecture "{arch}" not supported!') - def _set_vocab_gpt2(self): dir_model = self.dir_model hparams = self.hparams - tokens: list[bytearray] = [] + tokens: list[str] = [] toktypes: list[int] = [] from transformers import AutoTokenizer @@ -252,8 +243,7 @@ class Model: for i in range(vocab_size): if i not in reverse_vocab: - pad_token = f"[PAD{i}]".encode('utf-8') - tokens.append(bytearray(pad_token)) + tokens.append(f"[PAD{i}]") toktypes.append(gguf.TokenType.USER_DEFINED) elif reverse_vocab[i] in added_vocab: tokens.append(reverse_vocab[i]) @@ -272,6 +262,57 @@ class Model: special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_qwen(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) + vocab_size = hparams["vocab_size"] + assert max(tokenizer.get_vocab().values()) < vocab_size + + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + assert len(merged) == 2 + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined + added_vocab = tokenizer.special_tokens + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()} + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) + special_vocab.merges = merges + # only add special tokens when they were not already loaded from config.json + if len(special_vocab.special_token_ids) == 0: + special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) + # this one is usually not in config.json anyway + special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) + special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_sentencepiece(self): from sentencepiece import SentencePieceProcessor @@ -282,13 +323,12 @@ class Model: toktypes: list[int] = [] if not tokenizer_path.is_file(): - print(f'Error: Missing {tokenizer_path}', file=sys.stderr) - sys.exit(1) + raise FileNotFoundError(f"File not found: {tokenizer_path}") tokenizer = SentencePieceProcessor(str(tokenizer_path)) vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - for token_id in range(vocab_size): + for token_id in range(tokenizer.vocab_size()): piece = tokenizer.id_to_piece(token_id) text = piece.encode("utf-8") score = tokenizer.get_score(token_id) @@ -313,9 +353,34 @@ class Model: added_tokens_json = json.load(f) for key in added_tokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + key = key.encode("utf-8") + if key not in tokens: + tokens.append(key) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + assert len(tokens) == vocab_size + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_llama_hf(self): + vocab = LlamaHfVocab(self.dir_model) + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_token_list(tokens) @@ -326,7 +391,10 @@ class Model: special_vocab.add_to_gguf(self.gguf_writer) +@Model.register("GPTNeoXForCausalLM") class GPTNeoXModel(Model): + model_arch = gguf.MODEL_ARCH.GPTNEOX + def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] @@ -343,7 +411,10 @@ class GPTNeoXModel(Model): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) +@Model.register("BloomForCausalLM") class BloomModel(Model): + model_arch = gguf.MODEL_ARCH.BLOOM + def set_gguf_parameters(self): self.gguf_writer.add_name("Bloom") n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) @@ -435,7 +506,21 @@ class BloomModel(Model): print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") +@Model.register("MPTForCausalLM") class MPTModel(Model): + model_arch = gguf.MODEL_ARCH.MPT + + def set_vocab(self): + try: + self._set_vocab_gpt2() + except Exception: + # Fallback for SEA-LION model + self._set_vocab_sentencepiece() + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_pad_token_id(3) + self.gguf_writer.add_eos_token_id(1) + self.gguf_writer.add_unk_token_id(0) + def set_gguf_parameters(self): block_count = self.hparams["n_layers"] self.gguf_writer.add_name(self.dir_model.name) @@ -449,7 +534,10 @@ class MPTModel(Model): self.gguf_writer.add_layer_norm_eps(1e-5) if self.hparams["attn_config"]["clip_qkv"] is not None: self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) - self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + if self.hparams["attn_config"]["alibi"]: + self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + else: + self.gguf_writer.add_max_alibi_bias(0.0) def write_tensors(self): block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) @@ -470,7 +558,8 @@ class MPTModel(Model): # map tensor names if "scales" in name: new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) - new_name = new_name.replace("scales", "act.scales") + if new_name is not None: + new_name = new_name.replace("scales", "act.scales") else: new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: @@ -496,13 +585,93 @@ class MPTModel(Model): self.gguf_writer.add_tensor(new_name, data) - # note: MPT output is tied to (same as) wte in original model; - # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ - if new_name == "token_embd.weight": - self.gguf_writer.add_tensor("output.weight", data) + +@Model.register("OrionForCausalLM") +class OrionModel(Model): + model_arch = gguf.MODEL_ARCH.ORION + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + print("gguf: can not find ctx length parameter.") + sys.exit() + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + # note: config provides rms norm but it is actually layer norm + # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 + self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) + + def write_tensors(self): + # Collect tensors from generator object + model_kv = dict(self.get_tensors()) + block_count = self.hparams["num_hidden_layers"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) +@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") class BaichuanModel(Model): + model_arch = gguf.MODEL_ARCH.BAICHUAN + def set_vocab(self): self._set_vocab_sentencepiece() @@ -617,7 +786,152 @@ class BaichuanModel(Model): return weights[r * n_part:r * n_part + r, ...] +@Model.register("XverseForCausalLM") +class XverseModel(Model): + model_arch = gguf.MODEL_ARCH.XVERSE + + def set_vocab(self): + assert (self.dir_model / "tokenizer.json").is_file() + dir_model = self.dir_model + hparams = self.hparams + + tokens: list[bytearray] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + assert max(tokenizer.vocab.values()) < vocab_size + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + for token_id in range(vocab_size): + token_text = reverse_vocab[token_id].encode('utf-8') + # replace "\x00" to string with length > 0 + if token_text == b"\x00": + toktype = gguf.TokenType.BYTE # special + token_text = f"<{token_text}>".encode('utf-8') + elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + toktype = gguf.TokenType.BYTE # special + elif reverse_vocab[token_id] in added_vocab: + if tokenizer.added_tokens_decoder[token_id].special: + toktype = gguf.TokenType.CONTROL + else: + toktype = gguf.TokenType.USER_DEFINED + else: + toktype = gguf.TokenType.NORMAL + + tokens.append(token_text) + toktypes.append(toktype) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + print("gguf: can not find ctx length parameter.") + sys.exit() + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def write_tensors(self): + # Collect tensors from generator object + model_kv = dict(self.get_tensors()) + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # HF models permute some of the tensors, so we need to undo that + if name.endswith(("q_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) + if name.endswith(("k_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + +@Model.register("FalconForCausalLM", "RWForCausalLM") class FalconModel(Model): + model_arch = gguf.MODEL_ARCH.FALCON + def set_gguf_parameters(self): block_count = self.hparams.get("num_hidden_layers") if block_count is None: @@ -710,7 +1024,10 @@ class FalconModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("GPTBigCodeForCausalLM") class StarCoderModel(Model): + model_arch = gguf.MODEL_ARCH.STARCODER + def set_gguf_parameters(self): block_count = self.hparams["n_layer"] @@ -725,7 +1042,10 @@ class StarCoderModel(Model): self.gguf_writer.add_file_type(self.ftype) +@Model.register("GPTRefactForCausalLM") class RefactModel(Model): + model_arch = gguf.MODEL_ARCH.REFACT + def set_gguf_parameters(self): hidden_dim = self.hparams["n_embd"] inner_dim = 4 * hidden_dim @@ -809,7 +1129,10 @@ class RefactModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("PersimmonForCausalLM") class PersimmonModel(Model): + model_arch = gguf.MODEL_ARCH.PERSIMMON + def set_gguf_parameters(self): block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) head_count = self.hparams["num_attention_heads"] @@ -817,15 +1140,21 @@ class PersimmonModel(Model): hidden_size = self.hparams["hidden_size"] self.gguf_writer.add_name('persimmon-8b-chat') + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) + + # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller + # than the head size? + # ref: https://github.com/ggerganov/llama.cpp/pull/4889 + # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) + self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2) + self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) def set_vocab(self): self._set_vocab_sentencepiece() @@ -851,7 +1180,17 @@ class PersimmonModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") class StableLMModel(Model): + model_arch = gguf.MODEL_ARCH.STABLELM + + def set_vocab(self): + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab + self._set_vocab_qwen() + def set_gguf_parameters(self): hparams = self.hparams block_count = hparams["num_hidden_layers"] @@ -861,18 +1200,314 @@ class StableLMModel(Model): self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) - self.gguf_writer.add_layer_norm_eps(1e-5) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) -class MixtralModel(Model): +@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +class LlamaModel(Model): + model_arch = gguf.MODEL_ARCH.LLAMA + + def set_vocab(self): + try: + self. _set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_llama_hf() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + # Same as super class, but permuting q_proj, k_proj + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + n_head = self.hparams.get("num_attention_heads") + n_kv_head = self.hparams.get("num_key_value_heads") + n_experts = self.hparams.get("num_local_experts") + experts = dict() + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.numpy() + + if name.endswith("q_proj.weight"): + data = permute(data, n_head, n_head) + if name.endswith("k_proj.weight"): + data = permute(data, n_head, n_kv_head) + + data = data.squeeze() + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + experts[name] = data + if len(experts) >= n_experts: + # merge the experts into a single 3d tensor + for bid in range(block_count): + for wid in range(1, 4): + full = True + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight" + if ename not in experts: + full = False + break + if not full: + continue + + datas = [] + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight" + datas.append(experts[ename]) + del experts[ename] + + data = np.stack(datas, axis=0) + data_dtype = data.dtype + + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + if self.ftype == 1 and data_dtype == np.float32: + data = data.astype(np.float16) + + merged_name = f"layers.{bid}.feed_forward.experts.w{wid}.weight" + + new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + continue + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # 1d tensors need to be converted to float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts.keys()}") + + +@Model.register("GrokForCausalLM") +class GrokModel(Model): + model_arch = gguf.MODEL_ARCH.GROK + def set_vocab(self): self._set_vocab_sentencepiece() + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_name("Grok") + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + n_experts = self.hparams.get("num_local_experts") + experts = dict() + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # process the experts separately + if name.find(".moe.") != -1: + experts[name] = data + if len(experts) >= n_experts: + # merge the experts into a single 3d tensor + for bid in range(block_count): + for wid in ["linear", "linear_1", "linear_v"]: + full = True + for xid in range(n_experts): + ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" + if ename not in experts: + full = False + break + if not full: + continue + + datas = [] + for xid in range(n_experts): + ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" + datas.append(experts[ename]) + del experts[ename] + + data = np.stack(datas, axis=0) + data_dtype = data.dtype + + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + if self.ftype == 1 and data_dtype == np.float32: + data = data.astype(np.float16) + + merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight" + + new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + continue + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +@Model.register("MiniCPMForCausalLM") +class MiniCPMModel(Model): + model_arch = gguf.MODEL_ARCH.MINICPM + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + self.gguf_writer.add_name("MiniCPM") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_file_type(self.ftype) + + def set_vocab(self): + self._set_vocab_llama_hf() + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + n_head = self.hparams.get("num_attention_heads") + n_kv_head = self.hparams.get("num_key_value_heads") + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # HF models permute some of the tensors, so we need to undo that + if name.endswith(("q_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +@Model.register("QWenLMHeadModel") class QwenModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN + @staticmethod def token_bytes_to_string(b): from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode @@ -880,7 +1515,7 @@ class QwenModel(Model): return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')]) @staticmethod - def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]: + def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]: parts = [bytes([b]) for b in token] while True: min_idx = None @@ -897,52 +1532,7 @@ class QwenModel(Model): return parts def set_vocab(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[bytearray] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) - vocab_size = hparams["vocab_size"] - assert max(tokenizer.get_vocab().values()) < vocab_size - - merges = [] - vocab = {} - mergeable_ranks = tokenizer.mergeable_ranks - for token, rank in mergeable_ranks.items(): - vocab[self.token_bytes_to_string(token)] = rank - if len(token) == 1: - continue - merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - assert len(merged) == 2 - merges.append(' '.join(map(self.token_bytes_to_string, merged))) - - reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()} - added_vocab = tokenizer.special_tokens - - for i in range(vocab_size): - if i not in reverse_vocab: - pad_token = f"[PAD{i}]".encode("utf-8") - tokens.append(bytearray(pad_token)) - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.CONTROL) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) - special_vocab.merges = merges - special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"]) - special_vocab.add_to_gguf(self.gguf_writer) + self._set_vocab_qwen() def set_gguf_parameters(self): self.gguf_writer.add_name("Qwen") @@ -997,7 +1587,15 @@ class QwenModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("Qwen2ForCausalLM") +class Qwen2Model(Model): + model_arch = gguf.MODEL_ARCH.QWEN2 + + +@Model.register("GPT2LMHeadModel") class GPT2Model(Model): + model_arch = gguf.MODEL_ARCH.GPT2 + def set_gguf_parameters(self): self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_block_count(self.hparams["n_layer"]) @@ -1014,7 +1612,7 @@ class GPT2Model(Model): for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")): + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")): continue if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): @@ -1059,24 +1657,35 @@ class GPT2Model(Model): self.gguf_writer.add_tensor("output.weight", data) +@Model.register("PhiForCausalLM") class Phi2Model(Model): + model_arch = gguf.MODEL_ARCH.PHI2 + def set_gguf_parameters(self): - block_count = self.hparams["n_layer"] + block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) + + rot_pct = self.find_hparam(["partial_rotary_factor"]) + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_name("Phi2") - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) + + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(4 * n_embd) self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(self.hparams["n_head"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"]) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"])) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_add_bos_token(False) +@Model.register("PlamoForCausalLM") class PlamoModel(Model): + model_arch = gguf.MODEL_ARCH.PLAMO + def set_vocab(self): self._set_vocab_sentencepiece() @@ -1155,6 +1764,594 @@ class PlamoModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("CodeShellForCausalLM") +class CodeShellModel(Model): + model_arch = gguf.MODEL_ARCH.CODESHELL + + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("CodeShell") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_freq_base(10000.0) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(1.0) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + tensors = dict(self.get_tensors()) + has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys() + for name, data_torch in tensors.items(): + # we don't need these + if name.endswith((".attn.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + if not has_lm_head and name == "transformer.wte.weight": + self.gguf_writer.add_tensor("output.weight", data) + print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + + +@Model.register("InternLM2ForCausalLM") +class InternLM2Model(Model): + model_arch = gguf.MODEL_ARCH.INTERNLM2 + + def set_vocab(self): + # (TODO): Is there a better way? + # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character + # \x00 specially and convert it into an emoji character to prevent it from being mistakenly + # recognized as an empty string in C++. + from sentencepiece import SentencePieceProcessor + from sentencepiece import sentencepiece_model_pb2 as model + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + sys.exit(1) + + sentencepiece_model = model.ModelProto() + sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) + add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix + + tokenizer = SentencePieceProcessor(str(tokenizer_path)) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.id_to_piece(token_id) + text = piece.encode("utf-8") + score = tokenizer.get_score(token_id) + if text == b"\x00": + # (TODO): fixme + # Hack here and replace the \x00 characters. + print(f"InternLM2 convert token '{text}' to '🐉'!") + text = "🐉" + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.is_unknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.is_control(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.is_unused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.is_byte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_add_space_prefix(add_prefix) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + old_eos = special_vocab.special_token_ids["eos"] + if "chat" in os.path.basename(self.dir_model.absolute()): + # For the chat model, we replace the eos with '<|im_end|>'. + special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) + print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ +in chat mode so that the conversation can end normally.") + + special_vocab.add_to_gguf(self.gguf_writer) + + def _try_get_sft_eos(self, tokenizer): + unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]') + im_end_list = tokenizer.encode('<|im_end|>') + assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) + if len(unused_145_list) == 1: + eos_token = unused_145_list[0] + if len(im_end_list) == 1: + eos_token = im_end_list[0] + return eos_token + + def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def set_gguf_parameters(self): + self.gguf_writer.add_name("InternLM2") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + + def post_write_tensors(self, tensor_map, name, data_torch): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + def write_tensors(self): + from einops import rearrange + + num_heads = self.hparams.get("num_attention_heads") + num_kv_heads = self.hparams.get("num_key_value_heads") + hidden_size = self.hparams.get("hidden_size") + q_per_kv = num_heads // num_kv_heads + head_dim = hidden_size // num_heads + num_groups = num_heads // q_per_kv + + block_count = self.hparams["num_hidden_layers"] + model_kv = dict(self.get_tensors()) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + if re.match(qkv_pattern, name): + bid = re.findall(qkv_pattern, name)[0] + qkv = data_torch + qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) + q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] + # The model weights of q and k equire additional reshape. + q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) + k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) + v = rearrange(v, " o g n i -> o (g n i)").T + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q) + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k) + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v) + else: + self.post_write_tensors(tensor_map, name, data_torch) + + +@Model.register("BertModel", "CamembertModel") +class BertModel(Model): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.vocab_size = None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_causal_attention(False) + + # get pooling path + pooling_path = None + module_path = self.dir_model / "modules.json" + if module_path.is_file(): + with open(module_path, encoding="utf-8") as f: + modules = json.load(f) + for mod in modules: + if mod["type"] == "sentence_transformers.models.Pooling": + pooling_path = mod["path"] + break + + # get pooling type + if pooling_path is not None: + with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f: + pooling = json.load(f) + if pooling["pooling_mode_mean_tokens"]: + pooling_type = gguf.PoolingType.MEAN + elif pooling["pooling_mode_cls_token"]: + pooling_type = gguf.PoolingType.CLS + else: + raise NotImplementedError("Only MEAN and CLS pooling types supported") + self.gguf_writer.add_pooling_type(pooling_type) + + def set_vocab(self): + # use huggingface vocab to get all tokens + vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True) + tokens, scores, toktypes = zip(*vocab.all_tokens()) + assert len(tokens) == vocab.vocab_size + self.vocab_size = vocab.vocab_size + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + n_token_types = len(set(toktypes)) + self.gguf_writer.add_token_type_count(n_token_types) + + # convert to phantom space vocab + def phantom(tok, typ): + if tok.startswith(b"[") and tok.endswith(b"]"): + return tok + if tok.startswith(b"##"): + return tok[2:] + return b"\xe2\x96\x81" + tok + tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes)) + + # set up bos and eos tokens (cls and sep) + self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id) + self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id) + + # add vocab to gguf + self.gguf_writer.add_tokenizer_model("bert") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + # handle special tokens + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def write_tensors(self): + tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + tensors = dict(self.get_tensors()) + for name, data_torch in tensors.items(): + # we are only using BERT for embeddings so we don't need the pooling layer + if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): + continue # we don't need these + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + data = data_torch.squeeze().numpy() + n_dims = len(data.shape) + new_dtype: type[np.floating[Any]] + + if ( + self.ftype == 1 and name.endswith(".weight") and n_dims == 2 + and name != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32 + ): + # if f16 desired, convert any float32 2-dim weight tensors to float16 + new_dtype = np.float16 + else: + # if f32 desired, convert any float16 to float32 + new_dtype = np.float32 + + print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}") + + if data.dtype != new_dtype: + data = data.astype(new_dtype) + + self.gguf_writer.add_tensor(new_name, data) + + +@Model.register("NomicBertModel") +class NomicBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.NOMIC_BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # the HF config claims n_ctx=8192, but it uses RoPE scaling + self.hparams["n_ctx"] = 2048 + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # this doesn't do anything in the HF version + assert self.hparams["causal"] is False + # no bias tensors + assert self.hparams["qkv_proj_bias"] is False + assert self.hparams["mlp_fc1_bias"] is False + assert self.hparams["mlp_fc2_bias"] is False + # norm at end of layer + assert self.hparams["prenorm"] is False + # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 + assert self.hparams["rotary_emb_interleaved"] is False + assert self.hparams["rotary_emb_scale_base"] is None + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + + def get_tensors(self): + assert self.vocab_size is not None + for name, data in super().get_tensors(): + # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly. + if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size: + rounded_vocab_size = (self.vocab_size + 63) // 64 * 64 + assert data.shape == (rounded_vocab_size, self.hparams["n_embd"]) + data = data[:self.vocab_size, :] + yield name, data + + +@Model.register("GemmaForCausalLM") +class GemmaModel(Model): + model_arch = gguf.MODEL_ARCH.GEMMA + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(hparams["head_dim"]) + self.gguf_writer.add_value_length(hparams["head_dim"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +@Model.register("Starcoder2ForCausalLM") +class StarCoder2Model(Model): + model_arch = gguf.MODEL_ARCH.STARCODER2 + + +@Model.register("MambaForCausalLM", "MambaLMHeadModel") +class MambaModel(Model): + model_arch = gguf.MODEL_ARCH.MAMBA + + def set_vocab(self): + vocab_size = self.hparams["vocab_size"] + # Round vocab size to next multiple of 8 + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab + self.hparams["vocab_size"] = vocab_size + + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # Use the GPT-NeoX tokenizer when no tokenizer files are present + tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" + print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + neox_reader = gguf.GGUFReader(tokenizer_path, "r") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) + self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1])) + field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) + self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) + self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) + self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) + self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) + self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID) + self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) + + def set_gguf_parameters(self): + d_model = self.find_hparam(["hidden_size", "d_model"]) + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 + + # Fail early for models which don't have a block expansion factor of 2 + assert d_inner == 2 * d_model + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default + self.gguf_writer.add_embedding_length(d_model) + self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading + self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_ssm_conv_kernel(d_conv) + self.gguf_writer.add_ssm_inner_size(d_inner) + self.gguf_writer.add_ssm_state_size(d_state) + self.gguf_writer.add_ssm_time_step_rank(dt_rank) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams["n_layer"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + tok_embd = None + tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight" + output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight" + + for name, data_torch in self.get_tensors(): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + if name.endswith(".A_log"): + print("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) + + # assuming token_embd.weight is seen before output.weight + if tok_embd is not None and new_name == output_name: + if torch.equal(tok_embd, data_torch): + print(f"{output_name} is equivalent to {tok_embd_name}, omitting") + continue + if new_name == tok_embd_name: + tok_embd = data_torch + + data = data_torch.squeeze().numpy() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert big float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +@Model.register("CohereForCausalLM") +class CommandR2Model(Model): + model_arch = gguf.MODEL_ARCH.COMMAND_R + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # max_position_embeddings = 8192 in config.json but model was actually + # trained on 128k context length + self.hparams["max_position_embeddings"] = self.hparams["model_max_length"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + ###### CONVERSION LOGIC ###### @@ -1192,7 +2389,7 @@ def main() -> None: if args.awq_path: sys.path.insert(1, str(Path(__file__).parent / 'awq-py')) - from awq.apply_awq import add_scale_weights + from awq.apply_awq import add_scale_weights # type: ignore[import-not-found] tmp_model_path = args.model / "weighted_model" dir_model = tmp_model_path if tmp_model_path.is_dir(): diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py index e359330af..cd9644fcb 100755 --- a/convert-llama-ggml-to-gguf.py +++ b/convert-llama-ggml-to-gguf.py @@ -2,6 +2,7 @@ from __future__ import annotations import argparse +import os import struct import sys from enum import IntEnum @@ -9,7 +10,6 @@ from pathlib import Path import numpy as np -import os if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf @@ -371,15 +371,11 @@ def handle_metadata(cfg, hp): params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path) else: raise ValueError('Unable to load metadata') - vocab = convert.load_vocab( - cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, - cfg.vocabtype) - # FIXME: Respect cfg.vocab_dir? - svocab = gguf.SpecialVocab(cfg.model_metadata_dir, - load_merges = cfg.vocabtype == 'bpe', - n_vocab = vocab.vocab_size) + vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir) + vocab_factory = convert.VocabFactory(vocab_path) + vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir) convert.check_vocab_size(params, vocab) - return (params, vocab, svocab) + return params, vocab, special_vocab def handle_args(): @@ -402,8 +398,8 @@ def handle_args(): help ='Load HuggingFace/.pth vocab and metadata from the specified directory') parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") - parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm", - help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") + parser.add_argument("--vocabtype", default="spm,hfft", + help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)") return parser.parse_args() diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index 35ce152f4..9a9936dec 100755 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -5,17 +5,16 @@ import json import os import struct import sys +from pathlib import Path from typing import Any, BinaryIO, Sequence import numpy as np import torch -from pathlib import Path if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) import gguf - NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} @@ -60,7 +59,14 @@ if __name__ == '__main__': input_model = os.path.join(sys.argv[1], "adapter_model.bin") output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") - model = torch.load(input_model, map_location="cpu") + if os.path.exists(input_model): + model = torch.load(input_model, map_location="cpu") + else: + input_model = os.path.join(sys.argv[1], "adapter_model.safetensors") + # lazy import load_file only if lora is in safetensors format. + from safetensors.torch import load_file + model = load_file(input_model, device="cpu") + arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" if arch_name not in gguf.MODEL_ARCH_NAMES.values(): diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py index 1ba5864dc..ccb99279e 100755 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 -import torch -import os -from pprint import pprint -import sys import argparse +import os +import sys from pathlib import Path +from pprint import pprint + +import torch from sentencepiece import SentencePieceProcessor + if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf @@ -69,7 +71,7 @@ def main(): persimmon_model = torch.load(args.ckpt_path) hparams = persimmon_model['args'] pprint(hparams) - tensors = {} + tensors: dict[str, torch.Tensor] = {} _flatten_dict(persimmon_model['model'], tensors, None) arch = gguf.MODEL_ARCH.PERSIMMON @@ -86,7 +88,8 @@ def main(): gguf_writer.add_embedding_length(hidden_size) gguf_writer.add_block_count(block_count) gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size) - gguf_writer.add_rope_dimension_count(hidden_size // head_count) + # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443 + gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2) gguf_writer.add_head_count(head_count) gguf_writer.add_head_count_kv(head_count_kv) gguf_writer.add_rope_freq_base(hparams.rotary_emb_base) @@ -103,12 +106,12 @@ def main(): tensor_map = gguf.get_tensor_name_map(arch, block_count) print(tensor_map) for name in tensors.keys(): - data = tensors[name] + data_torch = tensors[name] if name.endswith(".self_attention.rotary_emb.inv_freq"): continue - old_dtype = data.dtype + old_dtype = data_torch.dtype # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) - data = data.to(torch.float32).squeeze().numpy() + data = data_torch.to(torch.float32).squeeze().numpy() new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) if new_name is None: print("Can not map tensor '" + name + "'") diff --git a/convert.py b/convert.py index 70fe55e43..d212776ff 100755 --- a/convert.py +++ b/convert.py @@ -16,69 +16,42 @@ import re import signal import struct import sys +import textwrap import time -import warnings import zipfile -from abc import ABCMeta, abstractmethod -from argparse import ArgumentParser +from abc import ABC, abstractmethod from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import ( - IO, - TYPE_CHECKING, - Any, - Callable, - Iterable, - Literal, - Optional, - Tuple, - TypeVar, -) +from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable import numpy as np from sentencepiece import SentencePieceProcessor -try: - from transformers import AutoTokenizer -except ModuleNotFoundError as e: - warnings.warn(f"Could not import AutoTokenizer from transformers: {e}") +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) +import gguf -# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory -if "NO_LOCAL_GGUF" not in os.environ: - # Use absolute path to the gguf-py directory - gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py") - print(gguf_py_dir) # NOTE: Remove this once path is verified after changes are completed - if gguf_py_dir not in sys.path: - sys.path.insert(1, gguf_py_dir) +if TYPE_CHECKING: + from typing import TypeAlias -# Import gguf module -try: - import gguf -except ModuleNotFoundError as e: - print(f"Could not import gguf: {e}") - sys.exit(1) - -if TYPE_CHECKING: # NOTE: This isn't necessary. - from typing import TypeAlias # This can technically be omitted. - -if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"): +if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): faulthandler.register(signal.SIGUSR1) -# NOTE: n-dimensional arrays should be directly referenced -NDArray: TypeAlias = "np.ndarray[Any, Any]" +NDArray: TypeAlias = 'np.ndarray[Any, Any]' -# Why is this here? LLAMA and GPT are technically the only compatible ARCHs. ARCH = gguf.MODEL_ARCH.LLAMA DEFAULT_CONCURRENCY = 8 +ADDED_TOKENS_FILE = 'added_tokens.json' +FAST_TOKENIZER_FILE = 'tokenizer.json' + # # data types # -# TODO: Clean up and refactor data types @dataclass(frozen=True) class DataType: name: str @@ -183,86 +156,69 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { @dataclass class Params: - n_vocab: int - n_embd: int - n_layer: int - n_ctx: int - n_ff: int - n_head: int - n_head_kv: int - f_norm_eps: Optional[float] = None - n_experts: Optional[int] = None - n_experts_used: Optional[int] = None + n_vocab: int + n_embd: int + n_layer: int + n_ctx: int + n_ff: int + n_head: int + n_head_kv: int + n_experts: int | None = None + n_experts_used: int | None = None + f_norm_eps: float | None = None - rope_scaling_type: Optional[gguf.RopeScalingType] = None - f_rope_freq_base: Optional[float] = None - f_rope_scale: Optional[float] = None - n_orig_ctx: Optional[int] = None - rope_finetuned: Optional[bool] = None + rope_scaling_type: gguf.RopeScalingType | None = None + f_rope_freq_base: float | None = None + f_rope_scale: float | None = None + n_orig_ctx: int | None = None + rope_finetuned: bool | None = None - ftype: Optional[GGMLFileType] = None + ftype: GGMLFileType | None = None # path to the directory containing the model files - path_model: Optional[Path] = None + path_model: Path | None = None @staticmethod - def guessed(model: LazyModel) -> "Params": + def guessed(model: LazyModel) -> Params: # try transformer naming first - n_vocab, n_embd = ( - model["model.embed_tokens.weight"].shape - if "model.embed_tokens.weight" in model - else model["tok_embeddings.weight"].shape - ) + n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape # try transformer naming first if "model.layers.0.self_attn.q_proj.weight" in model: - n_layer = next( - i - for i in itertools.count() - if f"model.layers.{i}.self_attn.q_proj.weight" not in model - ) - elif ( - "model.layers.0.self_attn.W_pack.weight" in model - ): # next: try baichuan naming - n_layer = next( - i - for i in itertools.count() - if f"model.layers.{i}.self_attn.W_pack.weight" not in model - ) + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) else: - n_layer = next( - i - for i in itertools.count() - if f"layers.{i}.attention.wq.weight" not in model - ) + n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) if n_layer < 1: - raise Exception( - "failed to guess 'n_layer'. This model is unknown or unsupported.\n" - "Suggestion: provide 'config.json' of the model in the same directory containing model files." - ) + msg = """\ + failed to guess 'n_layer'. This model is unknown or unsupported. + Suggestion: provide 'config.json' of the model in the same directory containing model files.""" + raise KeyError(textwrap.dedent(msg)) - n_head = n_embd // 128 # guessed - n_mult = 256 # guessed + n_head = n_embd // 128 # guessed + n_mult = 256 # guessed # TODO: verify this n_ff = int(2 * (4 * n_embd) / 3) n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult) return Params( - n_vocab=n_vocab, - n_embd=n_embd, - n_layer=n_layer, - n_ctx=-1, - n_ff=n_ff, - n_head=n_head, - n_head_kv=n_head, - f_norm_eps=1e-5, + n_vocab = n_vocab, + n_embd = n_embd, + n_layer = n_layer, + n_ctx = -1, + n_ff = n_ff, + n_head = n_head, + n_head_kv = n_head, + f_norm_eps = 1e-5, ) @staticmethod - def load_transformers_config(model: LazyModel, config_path: Path) -> "Params": - config = json.load(open(config_path)) + def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: + with open(config_path) as f: + config = json.load(f) rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None rope_scaling = config.get("rope_scaling") @@ -274,22 +230,22 @@ class Params: rope_scaling_type = gguf.RopeScalingType.LINEAR elif typ == "yarn": rope_scaling_type = gguf.RopeScalingType.YARN - n_orig_ctx = rope_scaling["original_max_position_embeddings"] - rope_finetuned = rope_scaling["finetuned"] + n_orig_ctx = rope_scaling['original_max_position_embeddings'] + rope_finetuned = rope_scaling['finetuned'] else: - raise NotImplementedError(f"Unknown rope scaling type: {typ}") + raise NotImplementedError(f'Unknown rope scaling type: {typ}') if "max_sequence_length" in config: n_ctx = config["max_sequence_length"] elif "max_position_embeddings" in config: n_ctx = config["max_position_embeddings"] else: - raise Exception( - "failed to guess 'n_ctx'. This model is unknown or unsupported.\n" - "Suggestion: provide 'config.json' of the model in the same directory containing model files." - ) + msg = """\ + failed to guess 'n_ctx'. This model is unknown or unsupported. + Suggestion: provide 'config.json' of the model in the same directory containing model files.""" + raise KeyError(textwrap.dedent(msg)) - n_experts = None + n_experts = None n_experts_used = None if "num_local_experts" in config: @@ -297,30 +253,31 @@ class Params: n_experts_used = config["num_experts_per_tok"] return Params( - n_vocab=config["vocab_size"], - n_embd=config["hidden_size"], - n_layer=config["num_hidden_layers"], - n_ctx=n_ctx, - n_ff=config["intermediate_size"], - n_head=(n_head := config["num_attention_heads"]), - n_head_kv=config.get("num_key_value_heads", n_head), - n_experts=n_experts, - n_experts_used=n_experts_used, - f_norm_eps=config["rms_norm_eps"], - f_rope_freq_base=config.get("rope_theta"), - rope_scaling_type=rope_scaling_type, - f_rope_scale=f_rope_scale, - n_orig_ctx=n_orig_ctx, - rope_finetuned=rope_finetuned, + n_vocab = config["vocab_size"], + n_embd = config["hidden_size"], + n_layer = config["num_hidden_layers"], + n_ctx = n_ctx, + n_ff = config["intermediate_size"], + n_head = (n_head := config["num_attention_heads"]), + n_head_kv = config.get("num_key_value_heads", n_head), + n_experts = n_experts, + n_experts_used = n_experts_used, + f_norm_eps = config["rms_norm_eps"], + f_rope_freq_base = config.get("rope_theta"), + rope_scaling_type = rope_scaling_type, + f_rope_scale = f_rope_scale, + n_orig_ctx = n_orig_ctx, + rope_finetuned = rope_finetuned, ) # LLaMA v2 70B params.json # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} @staticmethod - def load_torch_params(model: LazyModel, config_path: Path) -> "Params": - config = json.load(open(config_path)) + def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: + with open(config_path) as f: + config = json.load(f) - n_experts = None + n_experts = None n_experts_used = None f_rope_freq_base = None @@ -343,89 +300,132 @@ class Params: if config.get("moe"): n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0] - n_experts = config["moe"]["num_experts"] + n_experts = config["moe"]["num_experts"] n_experts_used = config["moe"]["num_experts_per_tok"] f_rope_freq_base = 1e6 return Params( - n_vocab=config.get("vocab_size", model["tok_embeddings.weight"].shape[0]), - n_embd=config["dim"], - n_layer=config["n_layers"], - n_ctx=n_ctx, - n_ff=n_ff, - n_head=(n_head := config["n_heads"]), - n_head_kv=config.get("n_kv_heads", n_head), - n_experts=n_experts, - n_experts_used=n_experts_used, - f_norm_eps=config["norm_eps"], - f_rope_freq_base=config.get("rope_theta", f_rope_freq_base), + n_vocab = model["tok_embeddings.weight"].shape[0], + n_embd = config["dim"], + n_layer = config["n_layers"], + n_ctx = n_ctx, + n_ff = n_ff, + n_head = (n_head := config["n_heads"]), + n_head_kv = config.get("n_kv_heads", n_head), + n_experts = n_experts, + n_experts_used = n_experts_used, + f_norm_eps = config["norm_eps"], + f_rope_freq_base = config.get("rope_theta", f_rope_freq_base), ) @staticmethod - def load(model_plus: ModelPlus) -> "Params": - hf_config_path = model_plus.paths[0].parent / "config.json" + def load(model_plus: ModelPlus) -> Params: + hf_config_path = model_plus.paths[0].parent / "config.json" orig_config_path = model_plus.paths[0].parent / "params.json" if hf_config_path.exists(): - params = Params.load_transformers_config(model_plus.model, hf_config_path) + params = Params.loadHFTransformerJson(model_plus.model, hf_config_path) elif orig_config_path.exists(): - params = Params.load_torch_params(model_plus.model, orig_config_path) - elif model_plus.format != "none": + params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path) + elif model_plus.format != 'none': params = Params.guessed(model_plus.model) else: - raise ValueError("Cannot guess params when model format is none") + raise ValueError('Cannot guess params when model format is none') params.path_model = model_plus.paths[0].parent return params -class BpeVocab: # GPT - def __init__( - self, fname_tokenizer: Path, fname_added_tokens: Optional[Path] - ) -> None: - self.bpe_tokenizer = json.loads( - open(str(fname_tokenizer), encoding="utf-8").read() - ) - added_tokens: dict[str, int] - if fname_added_tokens is not None: - # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - # Fall back to trying to find the added tokens in tokenizer.json - tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json" - if not tokenizer_json_file.is_file(): - added_tokens = {} - else: - tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) - added_tokens = dict( - (item["content"], item["id"]) - for item in tokenizer_json.get("added_tokens", []) - # Added tokens here can be duplicates of the main vocabulary. - if item["content"] not in self.bpe_tokenizer - ) +# +# vocab +# - vocab_size: int = len(self.bpe_tokenizer) +@runtime_checkable +class BaseVocab(Protocol): + tokenizer_model: ClassVar[str] + name: ClassVar[str] + + +class NoVocab(BaseVocab): + tokenizer_model = "no_vocab" + name = "no_vocab" + + def __repr__(self) -> str: + return "" + + +@runtime_checkable +class Vocab(BaseVocab, Protocol): + vocab_size: int + added_tokens_dict: dict[str, int] + added_tokens_list: list[str] + fname_tokenizer: Path + + def __init__(self, base_path: Path): ... + def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ... + + +class BpeVocab(Vocab): + tokenizer_model = "gpt2" + name = "bpe" + + def __init__(self, base_path: Path): + added_tokens: dict[str, int] = {} + + if (fname_tokenizer := base_path / 'vocab.json').exists(): + # "slow" tokenizer + with open(fname_tokenizer, encoding="utf-8") as f: + self.vocab = json.load(f) + + try: + # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. + with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f: + added_tokens = json.load(f) + except FileNotFoundError: + pass + else: + # "fast" tokenizer + fname_tokenizer = base_path / FAST_TOKENIZER_FILE + + # if this fails, FileNotFoundError propagates to caller + with open(fname_tokenizer, encoding="utf-8") as f: + tokenizer_json = json.load(f) + + tokenizer_model: dict[str, Any] = tokenizer_json['model'] + if ( + tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False) + or tokenizer_json['decoder']['type'] != 'ByteLevel' + ): + raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer') + + self.vocab = tokenizer_model["vocab"] + + if (added := tokenizer_json.get('added_tokens')) is not None: + # Added tokens here can be duplicates of the main vocabulary. + added_tokens = {item['content']: item['id'] + for item in added + if item['content'] not in self.vocab} + + vocab_size = len(self.vocab) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) + actual_ids = sorted(added_tokens.values()) if expected_ids != actual_ids: expected_end_id = vocab_size + len(actual_ids) - 1 - raise Exception( - f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}" - ) + raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range " + f"{vocab_size} - {expected_end_id}; got {actual_ids}") items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens + self.added_tokens_dict = added_tokens + self.added_tokens_list = [text for (text, idx) in items] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: - tokenizer = self.bpe_tokenizer - reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()} + reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} - for i, _ in enumerate(tokenizer): + for i, _ in enumerate(self.vocab): yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: @@ -441,42 +441,45 @@ class BpeVocab: # GPT return f"" -class SentencePieceVocab: # LlaMa - def __init__( - self, fname_tokenizer: Path, fname_added_tokens: Optional[Path] - ) -> None: +class SentencePieceVocab(Vocab): + tokenizer_model = "llama" + name = "spm" + + def __init__(self, base_path: Path): + added_tokens: dict[str, int] = {} + if (fname_tokenizer := base_path / 'tokenizer.model').exists(): + # normal location + try: + with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f: + added_tokens = json.load(f) + except FileNotFoundError: + pass + elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists(): + # not found in alternate location either + raise FileNotFoundError('Cannot find tokenizer.model') + self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) - added_tokens: dict[str, int] - if fname_added_tokens is not None: - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - added_tokens = {} + vocab_size = self.sentencepiece_tokenizer.vocab_size() - vocab_size: int = self.sentencepiece_tokenizer.vocab_size() - - new_tokens = { - id: piece for piece, id in added_tokens.items() if id >= vocab_size - } + new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) - actual_new_ids = sorted(new_tokens.keys()) + actual_new_ids = sorted(new_tokens.keys()) if expected_new_ids != actual_new_ids: - raise ValueError( - f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}" - ) + raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") # Token pieces that were added to the base vocabulary. - self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens + self.added_tokens_dict = added_tokens + self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.sentencepiece_tokenizer for i in range(tokenizer.vocab_size()): piece = tokenizer.id_to_piece(i) - text: bytes = piece.encode("utf-8") + text = piece.encode("utf-8") score: float = tokenizer.get_score(i) toktype = gguf.TokenType.NORMAL @@ -509,25 +512,47 @@ class SentencePieceVocab: # LlaMa return f"" -class HfVocab: - def __init__( - self, - fname_tokenizer: Path, - fname_added_tokens: Optional[Path] = None, - ) -> None: - print("fname_tokenizer:", fname_tokenizer) +class LlamaHfVocab(Vocab): + tokenizer_model = "llama" + name = "hfft" + + def __init__(self, base_path: Path, ignore_nonllama: bool = False): + fname_tokenizer = base_path / FAST_TOKENIZER_FILE + # if this fails, FileNotFoundError propagates to caller + with open(fname_tokenizer, encoding='utf-8') as f: + tokenizer_json = json.load(f) + + # pre-check so we know if we need transformers + tokenizer_model: dict[str, Any] = tokenizer_json['model'] + if ignore_nonllama: + pass # workaround incorrect use of this class for WordPiece + elif ( + tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False) + or tokenizer_json['decoder']['type'] != 'Sequence' + ): + raise FileNotFoundError('Cannot find Llama BPE tokenizer') + + try: + from transformers import AutoTokenizer + except ImportError as e: + raise ImportError( + "To use LlamaHfVocab, please install the `transformers` package. " + "You can install it with `pip install transformers`." + ) from e + # Allow the tokenizer to default to slow or fast versions. # Explicitly set tokenizer to use local paths. self.tokenizer = AutoTokenizer.from_pretrained( - fname_tokenizer, - cache_dir=fname_tokenizer, + base_path, + cache_dir=base_path, local_files_only=True, ) + assert self.tokenizer.is_fast # assume tokenizer.json is used # Initialize lists and dictionaries for added tokens self.added_tokens_list = [] self.added_tokens_dict = dict() - self.added_tokens_ids = set() + self.added_tokens_ids = set() # Process added tokens for tok, tokidx in sorted( @@ -548,12 +573,11 @@ class HfVocab: # Set vocabulary sizes self.vocab_size_base = self.tokenizer.vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens - def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]: + def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: reverse_vocab = { id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items() } @@ -568,14 +592,16 @@ class HfVocab: # Yield token text, score, and type yield token_text, self.get_token_score(token_id), self.get_token_type( - token_id, self.special_ids # Reuse already stored special IDs + token_id, token_text, self.special_ids # Reuse already stored special IDs ) - def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType: + def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType: + # Special case for byte tokens + if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + return gguf.TokenType.BYTE + # Determine token type based on whether it's a special token - return ( - gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL - ) + return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL def get_token_score(self, token_id: int) -> float: # Placeholder for actual logic to determine the token's score @@ -585,9 +611,8 @@ class HfVocab: def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: if text in self.specials: - toktype = self.get_token_type(self.specials[text], self.special_ids) + toktype = self.get_token_type(self.specials[text], b'', self.special_ids) score = self.get_token_score(self.specials[text]) - else: toktype = gguf.TokenType.USER_DEFINED score = -1000.0 @@ -602,10 +627,7 @@ class HfVocab: yield from self.added_tokens() def __repr__(self) -> str: - return f"" - - -Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab" + return f"" # @@ -623,7 +645,7 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: .reshape(weights.shape)) -class Tensor(metaclass=ABCMeta): +class Tensor(ABC): data_type: DataType @abstractmethod @@ -645,7 +667,7 @@ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray: class UnquantizedTensor(Tensor): - def __init__(self, ndarray: NDArray) -> None: + def __init__(self, ndarray: NDArray): assert isinstance(ndarray, np.ndarray) self.ndarray = ndarray self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype] @@ -724,7 +746,7 @@ class ModelPlus: model: LazyModel paths: list[Path] # Where this was read from. format: Literal['ggml', 'torch', 'safetensors', 'none'] - vocab: Vocab | None # For GGML models (which have vocab built in), the vocab. + vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab. def merge_sharded(models: list[LazyModel]) -> LazyModel: @@ -733,7 +755,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel: names = {name: None for model in models for name in model} def convert(name: str) -> LazyTensor: - lazy_tensors: list[LazyTensor] = [model[name] for model in models] + lazy_tensors = [model[name] for model in models] if len(lazy_tensors) == 1: # only one file; don't go through this procedure since there might # be quantized tensors @@ -754,7 +776,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel: def load() -> UnquantizedTensor: ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors] - concatenated: NDArray = np.concatenate(ndarrays, axis=axis) + concatenated = np.concatenate(ndarrays, axis=axis) return UnquantizedTensor(concatenated) description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]' return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description) @@ -781,7 +803,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus: else: model = merge_sharded([mp.model for mp in models_plus]) - return ModelPlus(model, paths, format, vocab) + return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor: @@ -806,6 +828,15 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description) +def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor: + def load() -> Tensor: + tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors] + return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors])) + s = lazy_tensors[0].shape.copy() + s.insert(0, len(lazy_tensors)) + return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors)) + + # Functionality that simulates `torch.load` but where individual tensors are # only loaded into memory on demand, not all at once. # PyTorch can't do this natively as of time of writing: @@ -842,10 +873,10 @@ class LazyUnpickler(pickle.Unpickler): def load(offset: int, elm_count: int) -> NDArray: dtype = data_type.dtype - fp = self.zip_file.open(info) - fp.seek(offset * dtype.itemsize) - size = elm_count * dtype.itemsize - data = fp.read(size) + with self.zip_file.open(info) as fp: + fp.seek(offset * dtype.itemsize) + size = elm_count * dtype.itemsize + data = fp.read(size) assert len(data) == size return np.frombuffer(data, dtype) description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}' @@ -866,20 +897,16 @@ class LazyUnpickler(pickle.Unpickler): def rebuild_from_type_v2(func, new_type, args, state): return func(*args) - CLASSES: dict[tuple[str, str], Any] = { + CLASSES = { # getattr used here as a workaround for mypy not being smart enough to determine # the staticmethods have a __func__ attribute. - ("torch._tensor", "_rebuild_from_type_v2"): getattr( - rebuild_from_type_v2, "__func__" - ), - ("torch._utils", "_rebuild_tensor_v2"): getattr( - lazy_rebuild_tensor_v2, "__func__" - ), - ("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16), - ("torch", "HalfStorage"): LazyStorageKind(DT_F16), - ("torch", "FloatStorage"): LazyStorageKind(DT_F32), - ("torch", "IntStorage"): LazyStorageKind(DT_I32), - ("torch", "Tensor"): LazyTensor, + ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), + ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'), + ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16), + ('torch', 'HalfStorage'): LazyStorageKind(DT_F16), + ('torch', 'FloatStorage'): LazyStorageKind(DT_F32), + ('torch', 'IntStorage'): LazyStorageKind(DT_I32), + ('torch', 'Tensor'): LazyTensor, } def find_class(self, module: str, name: str) -> Any: @@ -929,7 +956,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: def must_read(fp: IO[bytes], length: int) -> bytes: ret = fp.read(length) if len(ret) < length: - raise Exception("unexpectedly reached end of file") + raise EOFError("unexpectedly reached end of file") return ret @@ -966,7 +993,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc executor_class = ProcessPoolExecutor else: executor_class = ThreadPoolExecutor - with executor_class(max_workers = max_workers) as executor: + with executor_class(max_workers=max_workers) as executor: futures: list[concurrent.futures.Future[Out]] = [] done = False for _ in range(concurrency): @@ -987,12 +1014,15 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc yield result -def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None: +def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None: # Handle special case where the model's vocab size is not set if params.n_vocab == -1: raise ValueError( - f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?" + "The model's vocab size is set to -1 in params.json. Please update it manually." + + (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""), ) + if not isinstance(vocab, Vocab): + return # model has no vocab # Check for a vocab size mismatch if params.n_vocab == vocab.vocab_size: @@ -1006,6 +1036,7 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N ) for i in range(1, pad_count + 1): vocab.added_tokens_dict[f""] = -1 + vocab.added_tokens_list.append(f"") vocab.vocab_size = params.n_vocab return @@ -1015,16 +1046,12 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N if vocab.vocab_size < params.n_vocab: msg += " Add the --pad-vocab option and try again." - raise Exception(msg) + raise ValueError(msg) class OutputFile: - def __init__( - self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE - ) -> None: - self.gguf = gguf.GGUFWriter( - fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess - ) + def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): + self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) def add_meta_arch(self, params: Params) -> None: name = "LLaMA" @@ -1035,19 +1062,15 @@ class OutputFile: elif params.path_model is not None: name = params.path_model.name - self.gguf.add_name(name) - self.gguf.add_context_length(params.n_ctx) - self.gguf.add_embedding_length(params.n_embd) - self.gguf.add_block_count(params.n_layer) - self.gguf.add_feed_forward_length(params.n_ff) + self.gguf.add_name (name) + self.gguf.add_vocab_size (params.n_vocab) + self.gguf.add_context_length (params.n_ctx) + self.gguf.add_embedding_length (params.n_embd) + self.gguf.add_block_count (params.n_layer) + self.gguf.add_feed_forward_length (params.n_ff) self.gguf.add_rope_dimension_count(params.n_embd // params.n_head) - self.gguf.add_head_count(params.n_head) - self.gguf.add_head_count_kv(params.n_head_kv) - - if params.f_norm_eps is None: - raise ValueError("f_norm_eps is None") - - self.gguf.add_layer_norm_rms_eps(params.f_norm_eps) + self.gguf.add_head_count (params.n_head) + self.gguf.add_head_count_kv (params.n_head_kv) if params.n_experts: self.gguf.add_expert_count(params.n_experts) @@ -1055,6 +1078,11 @@ class OutputFile: if params.n_experts_used: self.gguf.add_expert_used_count(params.n_experts_used) + if params.f_norm_eps: + self.gguf.add_layer_norm_rms_eps(params.f_norm_eps) + else: + raise ValueError('f_norm_eps is None') + if params.f_rope_freq_base is not None: self.gguf.add_rope_freq_base(params.f_rope_freq_base) @@ -1072,21 +1100,7 @@ class OutputFile: if params.ftype is not None: self.gguf.add_file_type(params.ftype) - def handle_tokenizer_model(self, vocab: Vocab) -> str: - # Map the vocab types to the supported tokenizer models - tokenizer_model = { - SentencePieceVocab: "llama", - HfVocab: "llama", - BpeVocab: "gpt2", - }.get(type(vocab)) - - # Block if vocab type is not predefined - if tokenizer_model is None: - raise ValueError("Unknown vocab type: Not supported") - - return tokenizer_model - - def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]: + def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]: tokens = [] scores = [] toktypes = [] @@ -1097,14 +1111,13 @@ class OutputFile: scores.append(score) toktypes.append(toktype) + assert len(tokens) == vocab.vocab_size + return tokens, scores, toktypes def add_meta_vocab(self, vocab: Vocab) -> None: - # Handle the tokenizer model - tokenizer_model = self.handle_tokenizer_model(vocab) - # Ensure that tokenizer_model is added to the GGUF model - self.gguf.add_tokenizer_model(tokenizer_model) + self.gguf.add_tokenizer_model(vocab.tokenizer_model) # Extract model vocabulary for model conversion tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab) @@ -1119,14 +1132,10 @@ class OutputFile: def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: n_elements = int(np.prod(tensor.shape)) - raw_dtype = getattr(tensor.data_type, "ggml_type", None) - data_type = ( - getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype - ) + raw_dtype = getattr(tensor.data_type, 'ggml_type', None) + data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype data_nbytes = tensor.data_type.elements_to_bytes(n_elements) - self.gguf.add_tensor_info( - name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype - ) + self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype) def write_meta(self) -> None: self.gguf.write_header_to_file() @@ -1135,17 +1144,33 @@ class OutputFile: def write_tensor_info(self) -> None: self.gguf.write_ti_data_to_file() + def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None: + ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency) + if ftype == GGMLFileType.MostlyQ8_0: + ndarrays = bounded_parallel_map( + OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency, + use_processpool_executor=True, + ) + else: + ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) + + start = time.time() + for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)): + elapsed = time.time() - start + size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) + padi = len(str(len(model))) + print( + f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" + ) + self.gguf.write_tensor_data(ndarray) + def close(self) -> None: self.gguf.close() @staticmethod def write_vocab_only( - fname_out: Path, - params: Params, - vocab: Vocab, - svocab: gguf.SpecialVocab, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, - pad_vocab: bool = False, + fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, + endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) @@ -1175,14 +1200,8 @@ class OutputFile: @staticmethod def write_all( - fname_out: Path, - ftype: GGMLFileType, - params: Params, - model: LazyModel, - vocab: Vocab, - svocab: gguf.SpecialVocab, - concurrency: int = DEFAULT_CONCURRENCY, - endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, + fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, + concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, ) -> None: check_vocab_size(params, vocab, pad_vocab=pad_vocab) @@ -1191,8 +1210,11 @@ class OutputFile: # meta data of.add_meta_arch(params) - of.add_meta_vocab(vocab) - of.add_meta_special_vocab(svocab) + if isinstance(vocab, Vocab): + of.add_meta_vocab(vocab) + of.add_meta_special_vocab(svocab) + else: # NoVocab + of.gguf.add_tokenizer_model(vocab.tokenizer_model) # tensor info for name, lazy_tensor in model.items(): @@ -1202,31 +1224,7 @@ class OutputFile: of.write_tensor_info() # tensor data - ndarrays_inner = bounded_parallel_map( - OutputFile.do_item, model.items(), concurrency=concurrency - ) - if ftype == GGMLFileType.MostlyQ8_0: - ndarrays = bounded_parallel_map( - OutputFile.maybe_do_quantize, - ndarrays_inner, - concurrency=concurrency, - max_workers=concurrency, - use_processpool_executor=True, - ) - else: - ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) - - start = time.time() - for i, ((name, lazy_tensor), ndarray) in enumerate( - zip(model.items(), ndarrays) - ): - elapsed = time.time() - start - size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape) - padi = len(str(len(model))) - print( - f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" - ) - of.gguf.write_tensor_data(ndarray) + of.write_tensor_data(ftype, model, concurrency) of.close() @@ -1234,16 +1232,16 @@ class OutputFile: def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type - if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): + if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)): return GGMLFileType.AllF32 - if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)): + if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16): return GGMLFileType.MostlyF16 if output_type_str == "q8_0": return GGMLFileType.MostlyQ8_0 name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()} - raise Exception(f"Unexpected combination of types: {name_to_type}") + raise ValueError(f"Unexpected combination of types: {name_to_type}") def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: @@ -1251,12 +1249,28 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM for (name, tensor) in model.items()} -def convert_model_names(model: LazyModel, params: Params) -> LazyModel: +def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel: tmap = gguf.TensorNameMap(ARCH, params.n_layer) - should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) + should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) tmp = model + # merge experts into one tensor + if params.n_experts and params.n_experts > 0: + for i_l in range(params.n_layer): + for w in range(1, 4): + experts = [] + for e in range(params.n_experts): + if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model: + experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]) + del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"] + elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model: + experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]) + del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"] + else: + raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight") + tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts) + # HF models permut or pack some of the tensors, so we need to undo that for i in itertools.count(): if f"model.layers.{i}.self_attn.q_proj.weight" in model: @@ -1277,7 +1291,10 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: for name, lazy_tensor in model.items(): tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) if name_new is None: - raise Exception(f"Unexpected tensor name: {name}") + if skip_unknown: + print(f"Unexpected tensor name: {name} - skipping") + continue + raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)") if tensor_type in should_skip: print(f"skipping tensor {name_new}") @@ -1294,7 +1311,7 @@ def nth_multifile_path(path: Path, n: int) -> Path | None: the nth path in the model. ''' # Support the following patterns: - patterns: list[tuple[str, str]] = [ + patterns = [ # - x.00.pth, x.01.pth, etc. (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'), # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc. @@ -1340,9 +1357,9 @@ def load_some_model(path: Path) -> ModelPlus: globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] files = [file for glob in globs for file in path.glob(glob)] if not files: - raise Exception(f"Can't find model in directory {path}") + raise FileNotFoundError(f"Can't find model in directory {path}") if len(files) > 1: - raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}") + raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}") path = files[0] paths = find_multifile_paths(path) @@ -1356,46 +1373,14 @@ def load_some_model(path: Path) -> ModelPlus: class VocabFactory: + _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab] + def __init__(self, path: Path): self.path = path - self.files = { - "tokenizer.model": None, - "vocab.json": None, - "tokenizer.json": None, - } - self._detect_files() - def _detect_files(self): - for file in self.files.keys(): - file_path = self.path / file - parent_file_path = self.path.parent / file - if file_path.exists(): - self.files[file] = file_path - elif parent_file_path.exists(): - self.files[file] = parent_file_path - - def _select_file(self, vocabtype: Optional[str]) -> Path: - if vocabtype in ["spm", "bpe"]: - # For SentencePiece and BPE, return specific files as before - file_key = "tokenizer.model" if vocabtype == "spm" else "vocab.json" - if self.files[file_key]: - return self.files[file_key] - else: - raise FileNotFoundError(f"{vocabtype} {file_key} not found.") - elif vocabtype == "hfft": - # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file - return self.path - else: - raise ValueError(f"Unsupported vocabulary type {vocabtype}") - - def _create_special_vocab( - self, - vocab: Vocab, - vocabtype: str, - model_parent_path: Path, - ) -> gguf.SpecialVocab: - load_merges = vocabtype == "bpe" - n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None + def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab: + load_merges = vocab.name == "bpe" + n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None return gguf.SpecialVocab( model_parent_path, load_merges=load_merges, @@ -1403,47 +1388,52 @@ class VocabFactory: n_vocab=n_vocab, ) - def load_vocab( - self, vocabtype: str, model_parent_path: Path - ) -> Tuple[Vocab, gguf.SpecialVocab]: - path = self._select_file(vocabtype) - print(f"Loading vocab file '{path}', type '{vocabtype}'") + def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab: + vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES} + selected_vocabs: dict[str, type[Vocab]] = {} + for vtype in vocab_types: + try: + selected_vocabs[vtype] = vocab_classes[vtype] + except KeyError: + raise ValueError(f"Unsupported vocabulary type {vtype}") from None - added_tokens_path = path.parent / "added_tokens.json" - if vocabtype == "bpe": - vocab = BpeVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) - elif vocabtype == "spm": - vocab = SentencePieceVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) - elif vocabtype == "hfft": - vocab = HfVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) + for vtype, cls in selected_vocabs.items(): + try: + vocab = cls(self.path) + break + except FileNotFoundError: + pass # ignore unavailable tokenizers else: - raise ValueError(f"Unsupported vocabulary type {vocabtype}") + raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}") + + print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}") + return vocab + + def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]: + vocab: BaseVocab + if vocab_types is None: + vocab = NoVocab() + else: + vocab = self._create_vocab_by_path(vocab_types) + # FIXME: Respect --vocab-dir? special_vocab = self._create_special_vocab( vocab, - vocabtype, model_parent_path, ) return vocab, special_vocab -def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path: +def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: namestr = { - GGMLFileType.AllF32: "f32", + GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", - GGMLFileType.MostlyQ8_0: "q8_0", + GGMLFileType.MostlyQ8_0:"q8_0", }[file_type] ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf" if ret in model_paths: sys.stderr.write( f"Error: Default output path ({ret}) would overwrite the input. " - "Please explicitly specify a path using --outfile.\n" - ) + "Please explicitly specify a path using --outfile.\n") sys.exit(1) return ret @@ -1453,120 +1443,33 @@ def do_dump_model(model_plus: ModelPlus) -> None: print(f"model_plus.format = {model_plus.format!r}") print(f"model_plus.vocab = {model_plus.vocab!r}") for name, lazy_tensor in model_plus.model.items(): - print( - f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}" - ) + print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") -def get_argument_parser() -> ArgumentParser: +def main(args_in: list[str] | None = None) -> None: output_choices = ["f32", "f16"] if np.uint32(1) == np.uint32(1).newbyteorder("<"): # We currently only support Q8_0 output on little endian systems. output_choices.append("q8_0") + parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file") + parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") + parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab") + parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") + parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") + parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") + parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") + parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY) + parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") + parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") + parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") - parser = argparse.ArgumentParser( - description="Convert a LLaMa model to a GGML compatible file" - ) - - parser.add_argument( - "model", - type=Path, - help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)", - ) - - parser.add_argument( - "--awq-path", - type=Path, - help="Path to the Activation-aware Weight Quantization cache file", - default=None, - ) - - parser.add_argument( - "--dump", - action="store_true", - help="Display the model content without converting it", - ) - - parser.add_argument( - "--dump-single", - action="store_true", - help="Display the content of a single model file without conversion", - ) - - parser.add_argument( - "--vocab-only", - action="store_true", - help="Extract and output only the vocabulary", - ) - - parser.add_argument( - "--outtype", - choices=output_choices, - help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)", - ) - - parser.add_argument( - "--vocab-dir", - type=Path, - help="Directory containing the tokenizer.model, if separate from the model file", - ) - - parser.add_argument( - "--vocab-type", - choices=["spm", "bpe", "hfft"], # hfft: Hugging Face Fast Tokenizer - default="spm", - help="The vocabulary format used to define the tokenizer model (default: spm)", - ) - - parser.add_argument( - "--pad-vocab", - action="store_true", - help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata", - ) - - parser.add_argument( - "--outfile", - type=Path, - help="Specify the path for the output file (default is based on input)", - ) - - parser.add_argument( - "--ctx", type=int, help="Model training context (default is based on input)" - ) - - parser.add_argument( - "--concurrency", - type=int, - help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", - default=DEFAULT_CONCURRENCY, - ) - - parser.add_argument( - "--big-endian", - action="store_true", - help="Indicate that the model is executed on a big-endian machine", - ) - - return parser - - -def main(argv: Optional[list[str]] = None) -> None: - parser = get_argument_parser() - args = parser.parse_args(argv) - - if args.awq_path: - sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py")) - from awq.apply_awq import add_scale_weights - - tmp_model_path = args.model / "weighted_model" - if tmp_model_path.is_dir(): - print(f"{tmp_model_path} exists as a weighted model.") - else: - tmp_model_path.mkdir(parents=True, exist_ok=True) - print("Saving new weighted model ...") - add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) - print(f"Saved weighted model at {tmp_model_path}.") - args.model = tmp_model_path + args = parser.parse_args(args_in) + if args.no_vocab and args.vocab_only: + raise ValueError("--vocab-only does not make sense with --no-vocab") if args.dump_single: model_plus = lazy_load_file(args.model) @@ -1576,14 +1479,11 @@ def main(argv: Optional[list[str]] = None) -> None: if not args.vocab_only: model_plus = load_some_model(args.model) else: - model_plus = ModelPlus( - model={}, paths=[args.model / "dummy"], format="none", vocab=None - ) + model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) if args.dump: do_dump_model(model_plus) return - endianess = gguf.GGUFEndian.LITTLE if args.big_endian: endianess = gguf.GGUFEndian.BIG @@ -1591,12 +1491,12 @@ def main(argv: Optional[list[str]] = None) -> None: params = Params.load(model_plus) if params.n_ctx == -1: if args.ctx is None: - raise Exception( - "The model doesn't have a context size, and you didn't specify one with --ctx\n" - "Please specify one with --ctx:\n" - " - LLaMA v1: --ctx 2048\n" - " - LLaMA v2: --ctx 4096\n" - ) + msg = """\ + The model doesn't have a context size, and you didn't specify one with --ctx + Please specify one with --ctx: + - LLaMA v1: --ctx 2048 + - LLaMA v2: --ctx 4096""" + parser.error(textwrap.dedent(msg)) params.n_ctx = args.ctx if args.outtype: @@ -1611,48 +1511,38 @@ def main(argv: Optional[list[str]] = None) -> None: model_parent_path = model_plus.paths[0].parent vocab_path = Path(args.vocab_dir or args.model or model_parent_path) vocab_factory = VocabFactory(vocab_path) - vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path) + vocab_types = None if args.no_vocab else args.vocab_type.split(",") + vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path) if args.vocab_only: + assert isinstance(vocab, Vocab) if not args.outfile: raise ValueError("need --outfile if using --vocab-only") outfile = args.outfile - OutputFile.write_vocab_only( - outfile, - params, - vocab, - special_vocab, - endianess=endianess, - pad_vocab=args.pad_vocab, - ) + OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, + endianess=endianess, pad_vocab=args.pad_vocab) print(f"Wrote {outfile}") return - if model_plus.vocab is not None and args.vocab_dir is None: + if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab: vocab = model_plus.vocab - model = model_plus.model - model = convert_model_names(model, params) - ftype = pick_output_type(model, args.outtype) - model = convert_to_output_type(model, ftype) - outfile = args.outfile or default_output_file(model_plus.paths, ftype) + print(f"Vocab info: {vocab}") + print(f"Special vocab info: {special_vocab}") + + model = model_plus.model + model = convert_model_names(model, params, args.skip_unknown) + ftype = pick_output_type(model, args.outtype) + model = convert_to_output_type(model, ftype) + outfile = args.outfile or default_outfile(model_plus.paths, ftype) params.ftype = ftype print(f"Writing {outfile}, format {ftype}") - OutputFile.write_all( - outfile, - ftype, - params, - model, - vocab, - special_vocab, - concurrency=args.concurrency, - endianess=endianess, - pad_vocab=args.pad_vocab, - ) + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, + concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) print(f"Wrote {outfile}") -if __name__ == "__main__": - main(sys.argv[1:]) # Exclude the first element (script name) from sys.argv +if __name__ == '__main__': + main() diff --git a/docs/token_generation_performance_tips.md b/docs/token_generation_performance_tips.md index d7e863dff..3c4343147 100644 --- a/docs/token_generation_performance_tips.md +++ b/docs/token_generation_performance_tips.md @@ -1,7 +1,7 @@ # Token generation performance troubleshooting -## Verifying that the model is running on the GPU with cuBLAS -Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: +## Verifying that the model is running on the GPU with CUDA +Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: ```shell ./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " ``` diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0c71cbdf7..76496bf06 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,25 +20,30 @@ else() add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) add_subdirectory(finetune) + add_subdirectory(gritlm) + add_subdirectory(gguf-split) add_subdirectory(infill) add_subdirectory(llama-bench) add_subdirectory(llava) + if (LLAMA_SYCL) + add_subdirectory(sycl) + endif() add_subdirectory(main) add_subdirectory(tokenize) add_subdirectory(parallel) add_subdirectory(perplexity) add_subdirectory(quantize) add_subdirectory(quantize-stats) + add_subdirectory(retrieval) add_subdirectory(save-load-state) add_subdirectory(simple) add_subdirectory(passkey) add_subdirectory(speculative) add_subdirectory(lookahead) add_subdirectory(lookup) + add_subdirectory(gguf) add_subdirectory(train-text-from-scratch) - if (LLAMA_METAL) - add_subdirectory(metal) - endif() + add_subdirectory(imatrix) if (LLAMA_BUILD_SERVER) add_subdirectory(server) endif() diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index e7d2ad592..bf0125e75 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1533,27 +1533,28 @@ int main(int argc, char ** argv) { int n_past = 0; - ggml_cgraph gf = {}; + struct ggml_cgraph * gf = NULL; + gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true); get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets); - struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch); + struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch); // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits); struct ggml_tensor * e = square_error_loss(ctx0, targets, logits); - ggml_build_forward_expand(&gf, e); - ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, e); + ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1); float error_before_opt = ggml_get_f32_1d(e, 0); - struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); + struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS); opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; opt_params_lbfgs.lbfgs.n_iter = 16; ggml_opt(ctx0, opt_params_lbfgs, e); // - ggml_build_forward_expand(&gf, e); - ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, e); + ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1); float error_after_opt = ggml_get_f32_1d(e, 0); @@ -1600,13 +1601,14 @@ int main(int argc, char ** argv) { }; struct ggml_context * ctx0 = ggml_init(params); - ggml_cgraph gf = {}; + struct ggml_cgraph * gf = NULL; + gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true); int n_past = 0; - struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); + struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past); - ggml_build_forward_expand(&gf, logits); - ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, logits); + ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1); struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md index 34b343f66..bf951baf7 100644 --- a/examples/batched-bench/README.md +++ b/examples/batched-bench/README.md @@ -10,16 +10,16 @@ There are 2 modes of operation: - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`) ```bash -./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] +./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared -./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99 +./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared -./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99 +./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99 # custom set of batches -./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32 +./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32 ``` ## Sample results diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 57596ed98..1e34de620 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -32,16 +32,17 @@ int main(int argc, char ** argv) { gpt_params params; if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] \n" , argv[0]); + printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] \n" , argv[0]); printf(" , and PL are comma-separated lists of numbers without spaces\n\n"); - printf(" example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); + printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); return 1 ; } int n_kv_max = 2048; + int n_batch = 2048; + int n_ubatch = 512; int is_pp_shared = 0; int n_gpu_layers = 0; - int mmq = 0; std::vector n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, }; std::vector n_tg = { 128, 256, }; @@ -57,38 +58,46 @@ int main(int argc, char ** argv) { } if (argc >= 4) { - is_pp_shared = std::atoi(argv[3]); + n_batch = std::atoi(argv[3]); } if (argc >= 5) { - n_gpu_layers = std::atoi(argv[4]); + n_ubatch = std::atoi(argv[4]); } if (argc >= 6) { - mmq = std::atoi(argv[5]); + is_pp_shared = std::atoi(argv[5]); } if (argc >= 7) { - n_pp = parse_list(argv[6]); + n_gpu_layers = std::atoi(argv[6]); } if (argc >= 8) { - n_tg = parse_list(argv[7]); + n_pp = parse_list(argv[7]); } if (argc >= 9) { - n_pl = parse_list(argv[8]); + n_tg = parse_list(argv[8]); + } + + if (argc >= 10) { + n_pl = parse_list(argv[9]); } // init LLM - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); // initialize the model llama_model_params model_params = llama_model_default_params(); + const std::vector t_split(llama_max_devices(), 0.0f); + model_params.n_gpu_layers = n_gpu_layers; + model_params.tensor_split = t_split.data(); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -101,12 +110,15 @@ int main(int argc, char ** argv) { ctx_params.seed = 1234; ctx_params.n_ctx = n_kv_max; - ctx_params.n_batch = 512; - ctx_params.mul_mat_q = mmq; + ctx_params.n_batch = n_batch; + ctx_params.n_ubatch = n_ubatch; ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + // ensure enough sequences are available + ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end()); + llama_context * ctx = llama_new_context_with_model(model, ctx_params); if (ctx == NULL) { @@ -137,6 +149,8 @@ int main(int argc, char ** argv) { LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); return false; } + + llama_synchronize(ctx); } return true; @@ -155,7 +169,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); LOG_TEE("\n"); LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); @@ -176,10 +190,10 @@ int main(int argc, char ** argv) { llama_batch_clear(batch); - const int n_tokens = is_pp_shared ? pp : pl*pp; - - for (int i = 0; i < n_tokens; ++i) { - llama_batch_add(batch, 0, i, { 0 }, false); + for (int i = 0; i < pp; ++i) { + for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) { + llama_batch_add(batch, 0, i, { j }, false); + } } batch.logits[batch.n_tokens - 1] = true; @@ -194,7 +208,7 @@ int main(int argc, char ** argv) { if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, pp); + llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } } diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 4d0005349..d75c503d5 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -17,7 +17,7 @@ let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(argu let n_len: Int = 32 // init LLM -llama_backend_init(false) +llama_backend_init() defer { llama_backend_free() } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index b1775e0b0..7aaf63ceb 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -48,9 +48,12 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; } + process_escapes(params.prompt); + // init LLM - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); // initialize the model @@ -77,8 +80,9 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_default_params(); ctx_params.seed = 1234; - ctx_params.n_ctx = n_kv_req; + ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_len, n_parallel); + ctx_params.n_seq_max = n_parallel; ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; @@ -91,7 +95,7 @@ int main(int argc, char ** argv) { const int n_ctx = llama_n_ctx(ctx); - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); + LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { @@ -131,7 +135,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them for (int32_t i = 1; i < n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens); + llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } if (n_parallel > 1) { diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index 679b382e1..866c6d7a6 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -119,7 +119,8 @@ int main(int argc, char ** argv) // Init LLM : //--------------------------------- - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model; llama_context * ctx; diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 434e1d6bd..47cb16c69 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -189,12 +189,10 @@ int main(int argc, char ** argv) { int32_t nelements = sizex*sizey; - std::vector hist_cur(1 << 4, 0); - // Set up a the benchmark matrices // printf("Creating new tensor q11 & Running quantize\n"); struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data()); + ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr); // Set up a the compute graph // printf("Creating new tensor q31\n"); @@ -207,7 +205,7 @@ int main(int argc, char ** argv) { // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data()); + ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr); // printf("Creating new tensor q32\n"); struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md index 0f37d295b..6da2d7e18 100644 --- a/examples/convert-llama2c-to-ggml/README.md +++ b/examples/convert-llama2c-to-ggml/README.md @@ -21,6 +21,8 @@ An example command using a model from [karpathy/tinyllamas](https://huggingface. `$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` +Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K). + Now you can use the model with a command like: `$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 4d41e1779..746c3fbef 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -1,6 +1,7 @@ #include "ggml.h" #include "llama.h" #include "common.h" +#include "log.h" #include #include @@ -78,111 +79,101 @@ typedef struct { struct TransformerWeights { // token embedding table - float* token_embedding_table; // (vocab_size, dim) + std::vector token_embedding_table; // (vocab_size, dim) // weights for rmsnorms - float* rms_att_weight; // (layer, dim) rmsnorm weights - float* rms_ffn_weight; // (layer, dim) + std::vector rms_att_weight; // (layer, dim) rmsnorm weights + std::vector rms_ffn_weight; // (layer, dim) // weights for matmuls - float* wq; // (layer, dim, dim) - float* wk; // (layer, dim, dim) - float* wv; // (layer, dim, dim) - float* wo; // (layer, dim, dim) + std::vector wq; // (layer, dim, dim) + std::vector wk; // (layer, dim, dim) + std::vector wv; // (layer, dim, dim) + std::vector wo; // (layer, dim, dim) // weights for ffn - float* w1; // (layer, hidden_dim, dim) - float* w2; // (layer, dim, hidden_dim) - float* w3; // (layer, hidden_dim, dim) + std::vector w1; // (layer, hidden_dim, dim) + std::vector w2; // (layer, dim, hidden_dim) + std::vector w3; // (layer, hidden_dim, dim) // final rmsnorm - float* rms_final_weight; // (dim,) + std::vector rms_final_weight; // (dim,) // freq_cis for RoPE relatively positional embeddings - // float* freq_cis_real; // (seq_len, dim/2) - // float* freq_cis_imag; // (seq_len, dim/2) + // std::vector freq_cis_real; // (seq_len, dim/2) + // std::vector freq_cis_imag; // (seq_len, dim/2) // (optional) classifier weights for the logits, on the last layer - float* wcls; - - ~TransformerWeights() { - delete[] token_embedding_table; - delete[] rms_att_weight; - delete[] rms_ffn_weight; - delete[] wq; - delete[] wk; - delete[] wv; - delete[] wo; - delete[] w1; - delete[] w2; - delete[] w3; - delete[] rms_final_weight; - delete[] wcls; - } + std::vector wcls; }; -static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) { - // we calloc instead of malloc to keep valgrind happy - w->token_embedding_table = new float[p->vocab_size * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); +static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) { + const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads; + try { + w->token_embedding_table.resize(p->vocab_size * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); - w->rms_att_weight = new float[p->n_layers * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); + w->rms_att_weight.resize(p->n_layers * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); - w->rms_ffn_weight = new float[p->n_layers * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); + w->rms_ffn_weight.resize(p->n_layers * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); - w->wq = new float[p->n_layers * p->dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + w->wq.resize(p->n_layers * p->dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); - w->wk = new float[p->n_layers * p->dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); - w->wv = new float[p->n_layers * p->dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); - w->wo = new float[p->n_layers * p->dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + w->wo.resize(p->n_layers * p->dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); - w->w1 = new float[p->n_layers * p->hidden_dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + w->w1.resize(p->n_layers * p->hidden_dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); - w->w2 = new float[p->n_layers * p->hidden_dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); + w->w2.resize(p->n_layers * p->hidden_dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); - w->w3 = new float[p->n_layers * p->hidden_dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + w->w3.resize(p->n_layers * p->hidden_dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); - w->rms_final_weight = new float[p->dim](); - printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); + w->rms_final_weight.resize(p->dim); + LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); - if (shared_weights) { - w->wcls = NULL; - } else { - w->wcls = new float[p->vocab_size * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + if (shared_weights) { + w->wcls = {}; + } else { + w->wcls.resize(p->vocab_size * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + } + } + catch (std::length_error &) { + die("Invalid configuration. Failed to allocate memory for weights"); } } -static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) { - if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast(p->vocab_size * p->dim)) return 1; - if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast(p->n_layers * p->dim)) return 1; - if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast(p->n_layers * p->dim)) return 1; - if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast(p->n_layers * p->dim * p->hidden_dim)) return 1; - if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast(p->n_layers * p->hidden_dim * p->dim)) return 1; - if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast(p->n_layers * p->dim * p->hidden_dim)) return 1; - if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast(p->dim)) return 1; +static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) { + if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1; + if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1; + if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1; + if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1; + if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1; + if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1; + if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1; + if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1; + if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1; + if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1; + if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1; // Skip freq_cis_real & freq_cis_imag int head_size = p->dim / p->n_heads; fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR); - if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast(p->vocab_size * p->dim)) return 1; + if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1; // Check we didn't forget to read anything auto curr = ftell(f); fseek(f, 0, SEEK_END); auto end = ftell(f); if (curr != end) { - printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end); + LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); return 1; } @@ -190,20 +181,20 @@ static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bo } static void print_sample_weights(TransformerWeights *w){ - printf("----- Quick print of first of the weight vales of all the variables\n"); - printf("%f\n", w->token_embedding_table[0]); - printf("%f\n", w->rms_att_weight[0]); - printf("%f\n", w->rms_ffn_weight[0]); + LOG("----- Quick print of first of the weight vales of all the variables\n"); + LOG("%f\n", w->token_embedding_table[0]); + LOG("%f\n", w->rms_att_weight[0]); + LOG("%f\n", w->rms_ffn_weight[0]); - printf("%f\n", w->wq[0]); - printf("%f\n", w->wk[0]); - printf("%f\n", w->wv[0]); - printf("%f\n", w->wo[0]); - printf("%f\n", w->w1[0]); - printf("%f\n", w->w2[0]); - printf("%f\n", w->w3[0]); - printf("%f\n", w->rms_att_weight[0]); - if (w->wcls) printf("%f\n", w->wcls[0]); + LOG("%f\n", w->wq[0]); + LOG("%f\n", w->wk[0]); + LOG("%f\n", w->wv[0]); + LOG("%f\n", w->wo[0]); + LOG("%f\n", w->w1[0]); + LOG("%f\n", w->w2[0]); + LOG("%f\n", w->w3[0]); + LOG("%f\n", w->rms_att_weight[0]); + if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -225,14 +216,16 @@ struct llama_vocab { }; struct my_llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? - uint32_t n_embd = 4096; - uint32_t n_ff = 11008; - uint32_t n_mult = 4; - uint32_t n_head = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_ff = 11008; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_head_kv = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + bool operator!=(const my_llama_hparams& other) const { return memcmp(this, &other, sizeof(my_llama_hparams)); } @@ -325,14 +318,30 @@ struct train_params { }; static void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab: %d\n", __func__, params->n_vocab); - printf("%s: n_ctx: %d\n", __func__, params->n_ctx); - printf("%s: n_embd: %d\n", __func__, params->n_embd); - printf("%s: n_mult: %d\n", __func__, params->n_mult); - printf("%s: n_head: %d\n", __func__, params->n_head); - printf("%s: n_ff: %d\n", __func__, params->n_ff); - printf("%s: n_layer: %d\n", __func__, params->n_layer); - printf("%s: n_rot: %d\n", __func__, params->n_rot); + LOG("%s: n_vocab: %u\n", __func__, params->n_vocab); + LOG("%s: n_ctx: %u\n", __func__, params->n_ctx); + LOG("%s: n_embd: %u\n", __func__, params->n_embd); + LOG("%s: n_mult: %u\n", __func__, params->n_mult); + LOG("%s: n_head: %u\n", __func__, params->n_head); + LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv); + LOG("%s: n_ff: %u\n", __func__, params->n_ff); + LOG("%s: n_layer: %u\n", __func__, params->n_layer); + LOG("%s: n_rot: %u\n", __func__, params->n_rot); +} + +static void print_tensor_info(const struct ggml_context * ctx) { + for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + LOG("%s: Allocating ", __func__); + int64_t total = 1; + int i = 0; + for (; i < ggml_n_dims(t); ++i) { + if (i > 0) LOG("x "); + LOG("[%" PRId64 "] ", t->ne[i]); + total *= t->ne[i]; + } + if (i > 1) LOG("= [%" PRId64 "] ", total); + LOG("float space for %s\n", ggml_get_name(t)); + } } static void init_model(struct my_llama_model * model) { @@ -342,6 +351,8 @@ static void init_model(struct my_llama_model * model) { const uint32_t n_layer = hparams.n_layer; const uint32_t n_vocab = hparams.n_vocab; + const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv; + const uint32_t n_ff = hparams.n_ff; struct ggml_context * ctx = model->ctx; @@ -350,25 +361,8 @@ static void init_model(struct my_llama_model * model) { model->train_tokens = 0; model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab); - model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd); - model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab); - - // printing the per-layer allocations here so we dont print in the for loop. - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - - printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer); - - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer); - printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); ggml_set_name(model->norm, "norm.weight"); @@ -383,8 +377,8 @@ static void init_model(struct my_llama_model * model) { layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries); + layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries); layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); @@ -406,6 +400,8 @@ static void init_model(struct my_llama_model * model) { ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str()); ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str()); } + + print_tensor_info(ctx); } static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { @@ -421,9 +417,9 @@ static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { static void print_row(struct ggml_tensor * probs, int i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = get_f32_2d(probs, k, i); - printf(" %f", p); + LOG(" %f", p); } - printf("\n"); + LOG("\n"); } static void print_matrix(struct ggml_tensor * probs) { @@ -431,33 +427,12 @@ static void print_matrix(struct ggml_tensor * probs) { for (int i = 0; i < probs->ne[1]; ++i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = get_f32_2d(probs, k, i); - printf(" %.2f", p); + LOG(" %.2f", p); } - printf("\n"); + LOG("\n"); } } -#ifdef __GNUC__ -#ifdef __MINGW32__ -__attribute__((format(gnu_printf, 1, 2))) -#else -__attribute__((format(printf, 1, 2))) -#endif -#endif -static std::string format(const char * fmt, ...) { - va_list ap, ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - struct llama_file { // use FILE * so we don't have to re-open the file to mmap FILE * fp; @@ -549,8 +524,9 @@ static std::string llama_escape_whitespaces(const std::string & text) { return out.str(); } -static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { +static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) { if (is_ggml_file(filename)) { + LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename); struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { @@ -578,6 +554,9 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); + if (n_vocab != static_cast(config->vocab_size)) { + die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size); + } vocab->id_to_token.resize(n_vocab); @@ -595,7 +574,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab gguf_free(ctx); } else { // assume llama2.c vocabulary - printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename); + LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); llama_file file(filename, "rb"); if (!file.fp) { die_fmt("%s: %s", strerror(errno), filename); @@ -638,38 +617,15 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab } static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { - int ct; - switch (ggml_n_dims(gg_weights)) { - case 1: - ct = 0; - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){ - float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]); - *ptr = karpathy_weights[ct]; - ct++; - } - break; - case 2: - ct = 0; - for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) { - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) { - float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]); - *ptr = karpathy_weights[ct]; - ct++; - } - } - break; - case 3: - ct = 0; - for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) { - for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) { - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) { - float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]); - *ptr = karpathy_weights[ct]; - ct++; - } - } - } - break; + int size = 1; + for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) { + size *= gg_weights->ne[dim]; + } + for (int ct = 0; ct < size; ++ct) { + int64_t i0 = 0; int64_t i1 = 0; + int64_t i2 = 0; int64_t i3 = 0; + ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3); + ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]); } } @@ -679,16 +635,18 @@ static void save_as_llama_model( // convert AK weights into GG weights one by one. // w->token_embedding_table -> model->tok_embeddings // float* -> struct ggml_tensor - convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table); - convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table); + convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data()); + convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data()); - convert_weights_ak_to_gg(model->norm, w->rms_final_weight); + convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data()); //print_row(model->norm, 0); // for rms-att-weight int row_length = model->hparams.n_embd; int n_ff = model->hparams.n_ff; + const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv; + for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ auto & layer = model->layers[i]; // 1d @@ -697,9 +655,10 @@ static void save_as_llama_model( // from 3d matrix layer x dim x dim to 2d matrix dim x dim convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]); - convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]); - convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]); convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]); + // from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries + convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length/n_multiqueries]); + convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length/n_multiqueries]); convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]); convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]); @@ -736,8 +695,8 @@ static void save_as_llama_model( gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff); gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head); - // n_head_kv is optional, default to n_head - // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...); + gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head); + gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv); gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer); gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot); gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f); @@ -789,12 +748,12 @@ static void save_as_llama_model( static struct train_params get_default_train_params() { struct train_params params; - params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; + params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; params.fn_llama2c_output_model = "ak_llama_model.bin"; - params.fn_train_data = "shakespeare.txt"; - params.fn_checkpoint_in = "checkpoint.bin"; - params.fn_checkpoint_out = "checkpoint.bin"; - params.fn_model_out = "ggml-checkpoint-f32.bin"; + params.fn_train_data = "shakespeare.txt"; + params.fn_checkpoint_in = "checkpoint.bin"; + params.fn_checkpoint_out = "checkpoint.bin"; + params.fn_model_out = "ggml-checkpoint-f32.bin"; params.seed = -1; @@ -829,8 +788,8 @@ static struct train_params get_default_train_params() { params.adam_alpha = 1e-3f; params.adam_decay = 1e-3f; - params.mem_model_gb = 2; - params.mem_compute_gb = 24; + params.mem_model_gb = 2; + params.mem_compute_gb = 24; params.mem_compute0_gb = 8; params.mem_compute1_gb = 2; @@ -916,19 +875,30 @@ int main(int argc, char ** argv) { if (!params_parse(argc, argv, ¶ms)) { return 1; } + log_set_target(stdout); Config config; TransformerWeights weights = {}; { - FILE *file = fopen(params.fn_llama2c_model, "rb"); - if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; } + LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); + FILE * file = fopen(params.fn_llama2c_model, "rb"); + if (!file) { + LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); + return 1; + } // read in the config header - if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; } + if (fread(&config, sizeof(Config), 1, file) != 1) { + LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); + return 1; + } auto shared_weights = config.vocab_size > 0; config.vocab_size = abs(config.vocab_size); // read in the Transformer weights - malloc_weights(&weights, &config, shared_weights); - if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; } + alloc_weights(&weights, &config, shared_weights); + if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { + LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); + return 1; + } fclose(file); } @@ -936,15 +906,18 @@ int main(int argc, char ** argv) { load_vocab(params.fn_vocab_model, &config, &vocab); struct my_llama_model model; - model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); - model.hparams.n_ctx = params.n_ctx; - model.hparams.n_embd = config.dim; //params.n_embd; - model.hparams.n_ff = config.hidden_dim; - model.hparams.n_mult = 32;//params.n_mult; - model.hparams.n_head = config.n_heads; //params.n_head; - model.hparams.n_layer = config.n_layers; //params.n_layer; - model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); + model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); + model.hparams.n_ctx = params.n_ctx; + model.hparams.n_embd = config.dim; //params.n_embd; + model.hparams.n_ff = config.hidden_dim; + model.hparams.n_mult = 32;//params.n_mult; + model.hparams.n_head = config.n_heads; //params.n_head; + model.hparams.n_head_kv = config.n_kv_heads; + model.hparams.n_layer = config.n_layers; //params.n_layer; + model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); + print_params(&model.hparams); + struct ggml_init_params lcparams; lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); lcparams.mem_buffer = NULL; @@ -956,7 +929,7 @@ int main(int argc, char ** argv) { model.name = basename(params.fn_llama2c_model); save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); - printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model); + LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); ggml_free(model.ctx); return 0; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 3295cd240..536657526 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -7,6 +7,52 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +static std::vector split_lines(const std::string & s) { + std::string line; + std::vector lines; + std::stringstream ss(s); + while (std::getline(ss, line)) { + lines.push_back(line); + } + return lines; +} + +static void batch_add_seq(llama_batch & batch, const std::vector & tokens, int seq_id) { + for (size_t i = 0; i < tokens.size(); i++) { + llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1); + } +} + +static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { + // clear previous kv_cache values (irrelevant for embeddings) + llama_kv_cache_clear(ctx); + + // run model + fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + if (llama_decode(ctx, batch) < 0) { + fprintf(stderr, "%s : failed to decode\n", __func__); + } + + for (int i = 0; i < batch.n_tokens; i++) { + if (!batch.logits[i]) { + continue; + } + + // try to get sequence embeddings - supported only when pooling_type is not NONE + const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + if (embd == NULL) { + embd = llama_get_embeddings_ith(ctx, i); + if (embd == NULL) { + fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i); + continue; + } + } + + float * out = output + batch.seq_id[i][0] * n_embd; + llama_embd_normalize(embd, out, n_embd); + } +} + int main(int argc, char ** argv) { gpt_params params; @@ -15,6 +61,8 @@ int main(int argc, char ** argv) { } params.embedding = true; + // For non-causal models, batch size must be equal to ubatch size + params.n_ubatch = params.n_batch; print_build_info(); @@ -29,7 +77,8 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model; llama_context * ctx; @@ -55,49 +104,107 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s\n", get_system_info(params).c_str()); } - int n_past = 0; + // split the prompt into lines + std::vector prompts = split_lines(params.prompt); - // tokenize the prompt - auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); + // max batch size + const uint64_t n_batch = params.n_batch; + GGML_ASSERT(params.n_batch >= params.n_ctx); - if (params.verbose_prompt) { - fprintf(stderr, "\n"); - fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - for (int i = 0; i < (int) embd_inp.size(); i++) { - fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); - } - fprintf(stderr, "\n"); - } - - if (embd_inp.size() > (size_t)n_ctx) { - fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n", - __func__, embd_inp.size(), n_ctx); - return 1; - } - - while (!embd_inp.empty()) { - int n_tokens = std::min(params.n_batch, (int) embd_inp.size()); - if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); + // tokenize the prompts and trim + std::vector> inputs; + for (const auto & prompt : prompts) { + auto inp = ::llama_tokenize(ctx, prompt, true, false); + if (inp.size() > n_batch) { + fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + __func__, (long long int) inp.size(), (long long int) n_batch); return 1; } - n_past += n_tokens; - embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens); + inputs.push_back(inp); } + // add eos if not present + for (auto & inp : inputs) { + if (inp.empty() || inp.back() != llama_token_eos(model)) { + inp.push_back(llama_token_eos(model)); + } + } + + // tokenization stats + if (params.verbose_prompt) { + for (int i = 0; i < (int) inputs.size(); i++) { + fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); + for (int j = 0; j < (int) inputs[i].size(); j++) { + fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); + } + fprintf(stderr, "\n\n"); + } + } + + // initialize batch + const int n_prompts = prompts.size(); + struct llama_batch batch = llama_batch_init(n_batch, 0, 1); + + // allocate output const int n_embd = llama_n_embd(model); - const auto * embeddings = llama_get_embeddings(ctx); + std::vector embeddings(n_prompts * n_embd, 0); + float * emb = embeddings.data(); - for (int i = 0; i < n_embd; i++) { - printf("%f ", embeddings[i]); + // break into batches + int p = 0; // number of prompts processed already + int s = 0; // number of prompts in current batch + for (int k = 0; k < n_prompts; k++) { + // clamp to n_batch tokens + auto & inp = inputs[k]; + + const uint64_t n_toks = inp.size(); + + // encode if at capacity + if (batch.n_tokens + n_toks > n_batch) { + float * out = emb + p * n_embd; + batch_decode(ctx, batch, out, s, n_embd); + llama_batch_clear(batch); + p += s; + s = 0; + } + + // add to batch + batch_add_seq(batch, inp, s); + s += 1; } - printf("\n"); + // final batch + float * out = emb + p * n_embd; + batch_decode(ctx, batch, out, s, n_embd); + + // print the first part of the embeddings or for a single prompt, the full embedding + fprintf(stdout, "\n"); + for (int j = 0; j < n_prompts; j++) { + fprintf(stdout, "embedding %d: ", j); + for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { + fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + } + fprintf(stdout, "\n"); + } + + // print cosine similarity matrix + if (n_prompts > 1) { + fprintf(stdout, "\n"); + printf("cosine similarity matrix:\n\n"); + for (int i = 0; i < n_prompts; i++) { + for (int j = 0; j < n_prompts; j++) { + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f ", sim); + } + fprintf(stdout, "\n"); + } + } + + // clean up llama_print_timings(ctx); llama_free(ctx); llama_free_model(model); - llama_backend_free(); return 0; diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 58fbe204d..08413f57e 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -7,8 +7,6 @@ #include #include -static const size_t tensor_alignment = 32; - struct lora_info { std::string filename; float scale; @@ -245,9 +243,8 @@ static struct lora_data * load_lora(struct lora_info * info) { params_ggml.no_alloc = true; result->ctx = ggml_init(params_ggml); - uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla' uint32_t magic = file.read_u32(); - if (magic != LLAMA_FILE_MAGIC_LORA) { + if (magic != LLAMA_FILE_MAGIC_GGLA) { die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str()); } uint32_t version = file.read_u32(); @@ -338,24 +335,14 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int params.mem_buffer = NULL; params.no_alloc = true; struct ggml_context * ctx = NULL; - struct ggml_allocr * alloc = NULL; - struct ggml_cgraph * gf = NULL; + struct ggml_gallocr * alloc = NULL; + struct ggml_cgraph * gf = NULL; ctx = ggml_init(params); - alloc = ggml_allocr_new_measure(tensor_alignment); + alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); - size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf); - ggml_allocr_free(alloc); - ggml_free(ctx); - static std::vector data_compute; - data_compute.resize(alloc_size + tensor_alignment); - - ctx = ggml_init(params); - alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment); - gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling); - ggml_allocr_alloc_graph(alloc, gf); - ggml_allocr_free(alloc); + ggml_gallocr_alloc_graph(alloc, gf); struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads); static std::vector data_work; @@ -364,6 +351,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int ggml_graph_compute(gf, &cplan); + ggml_gallocr_free(alloc); ggml_free(ctx); return true; } diff --git a/examples/finetune/README.md b/examples/finetune/README.md index a884706c5..2fafd505e 100644 --- a/examples/finetune/README.md +++ b/examples/finetune/README.md @@ -80,9 +80,9 @@ The LORA rank can be configured for each model tensor type separately with these --rank-wk N LORA rank for wk tensor (default 4) --rank-wv N LORA rank for wv tensor (default 4) --rank-wo N LORA rank for wo tensor (default 4) - --rank-w1 N LORA rank for w1 tensor (default 4) - --rank-w2 N LORA rank for w2 tensor (default 4) - --rank-w3 N LORA rank for w3 tensor (default 4) + --rank-ffn_gate N LORA rank for ffn_gate tensor (default 4) + --rank-ffn_down N LORA rank for ffn_down tensor (default 4) + --rank-ffn_up N LORA rank for ffn_up tensor (default 4) ``` The LORA rank of 'norm' tensors should always be 1. diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index eaca42fc1..3da5317b3 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "ggml-alloc.h" +#include "ggml-backend.h" #include "llama.h" #include "common.h" #include "train.h" @@ -13,8 +14,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static const size_t tensor_alignment = 32; - struct my_llama_hparams { uint32_t n_vocab = 32000; uint32_t n_ctx = 512; @@ -61,9 +60,9 @@ struct my_llama_layer { struct ggml_tensor * ffn_norm; // ff - struct ggml_tensor * w1; - struct ggml_tensor * w2; - struct ggml_tensor * w3; + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 }; struct my_llama_model { @@ -86,9 +85,9 @@ struct my_llama_lora_hparams { uint32_t n_rank_wv = 4; uint32_t n_rank_wo = 4; uint32_t n_rank_ffn_norm = 1; - uint32_t n_rank_w1 = 4; - uint32_t n_rank_w2 = 4; - uint32_t n_rank_w3 = 4; + uint32_t n_rank_ffn_gate = 4; + uint32_t n_rank_ffn_down = 4; + uint32_t n_rank_ffn_up = 4; uint32_t n_rank_tok_embeddings = 4; uint32_t n_rank_norm = 1; uint32_t n_rank_output = 4; @@ -118,17 +117,17 @@ struct my_llama_lora_layer { struct ggml_tensor * ffn_norm_b; // ff - struct ggml_tensor * w1_a; - struct ggml_tensor * w1_b; - struct ggml_tensor * w2_a; - struct ggml_tensor * w2_b; - struct ggml_tensor * w3_a; - struct ggml_tensor * w3_b; + struct ggml_tensor * ffn_gate_a; + struct ggml_tensor * ffn_gate_b; + struct ggml_tensor * ffn_down_a; + struct ggml_tensor * ffn_down_b; + struct ggml_tensor * ffn_up_a; + struct ggml_tensor * ffn_up_b; }; struct my_llama_lora { struct ggml_context * ctx = NULL; - std::vector data; + ggml_backend_buffer_t data; my_llama_lora_hparams hparams; @@ -209,9 +208,9 @@ static void print_lora_params(struct my_llama_lora_hparams * params) { printf("%s: n_rank_wv : %u\n", __func__, params->n_rank_wv); printf("%s: n_rank_wo : %u\n", __func__, params->n_rank_wo); printf("%s: n_rank_ffn_norm : %u\n", __func__, params->n_rank_ffn_norm); - printf("%s: n_rank_w1 : %u\n", __func__, params->n_rank_w1); - printf("%s: n_rank_w2 : %u\n", __func__, params->n_rank_w2); - printf("%s: n_rank_w3 : %u\n", __func__, params->n_rank_w3); + printf("%s: n_rank_ffn_gate : %u\n", __func__, params->n_rank_ffn_gate); + printf("%s: n_rank_ffn_down : %u\n", __func__, params->n_rank_ffn_down); + printf("%s: n_rank_ffn_up : %u\n", __func__, params->n_rank_ffn_up); printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings); printf("%s: n_rank_norm : %u\n", __func__, params->n_rank_norm); printf("%s: n_rank_output : %u\n", __func__, params->n_rank_output); @@ -320,9 +319,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model layer.wv = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i)); layer.wo = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i)); layer.ffn_norm = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i)); - layer.w1 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); - layer.w2 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); - layer.w3 = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); + layer.ffn_gate = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i)); + layer.ffn_down = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i)); + layer.ffn_up = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i)); assert_shape_1d(layer.attention_norm, hparams.n_embd); assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd); @@ -330,9 +329,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa()); assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd); assert_shape_1d(layer.ffn_norm, hparams.n_embd); - assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff); - assert_shape_2d(layer.w2, hparams.n_ff, hparams.n_embd); - assert_shape_2d(layer.w3, hparams.n_embd, hparams.n_ff); + assert_shape_2d(layer.ffn_gate, hparams.n_embd, hparams.n_ff); + assert_shape_2d(layer.ffn_down, hparams.n_ff, hparams.n_embd); + assert_shape_2d(layer.ffn_up, hparams.n_embd, hparams.n_ff); } } @@ -363,69 +362,12 @@ static void set_param_lora(struct my_llama_lora * lora) { ggml_set_param(ctx, layer.wo_b); ggml_set_param(ctx, layer.ffn_norm_a); ggml_set_param(ctx, layer.ffn_norm_b); - ggml_set_param(ctx, layer.w1_a); - ggml_set_param(ctx, layer.w1_b); - ggml_set_param(ctx, layer.w2_a); - ggml_set_param(ctx, layer.w2_b); - ggml_set_param(ctx, layer.w3_a); - ggml_set_param(ctx, layer.w3_b); - } -} - -static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora) { - ggml_allocr_alloc(alloc, lora->tok_embeddings_a); - ggml_allocr_alloc(alloc, lora->tok_embeddings_b); - ggml_allocr_alloc(alloc, lora->norm_a); - ggml_allocr_alloc(alloc, lora->norm_b); - ggml_allocr_alloc(alloc, lora->output_a); - ggml_allocr_alloc(alloc, lora->output_b); - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - ggml_allocr_alloc(alloc, layer.attention_norm_a); - ggml_allocr_alloc(alloc, layer.attention_norm_b); - ggml_allocr_alloc(alloc, layer.wq_a); - ggml_allocr_alloc(alloc, layer.wq_b); - ggml_allocr_alloc(alloc, layer.wk_a); - ggml_allocr_alloc(alloc, layer.wk_b); - ggml_allocr_alloc(alloc, layer.wv_a); - ggml_allocr_alloc(alloc, layer.wv_b); - ggml_allocr_alloc(alloc, layer.wo_a); - ggml_allocr_alloc(alloc, layer.wo_b); - ggml_allocr_alloc(alloc, layer.ffn_norm_a); - ggml_allocr_alloc(alloc, layer.ffn_norm_b); - ggml_allocr_alloc(alloc, layer.w1_a); - ggml_allocr_alloc(alloc, layer.w1_b); - ggml_allocr_alloc(alloc, layer.w2_a); - ggml_allocr_alloc(alloc, layer.w2_b); - ggml_allocr_alloc(alloc, layer.w3_a); - ggml_allocr_alloc(alloc, layer.w3_b); - } - ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad); - ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad); - ggml_allocr_alloc(alloc, lora->norm_a->grad); - ggml_allocr_alloc(alloc, lora->norm_b->grad); - ggml_allocr_alloc(alloc, lora->output_a->grad); - ggml_allocr_alloc(alloc, lora->output_b->grad); - for (uint32_t i = 0; i < lora->layers.size(); ++i) { - auto & layer = lora->layers[i]; - ggml_allocr_alloc(alloc, layer.attention_norm_a->grad); - ggml_allocr_alloc(alloc, layer.attention_norm_b->grad); - ggml_allocr_alloc(alloc, layer.wq_a->grad); - ggml_allocr_alloc(alloc, layer.wq_b->grad); - ggml_allocr_alloc(alloc, layer.wk_a->grad); - ggml_allocr_alloc(alloc, layer.wk_b->grad); - ggml_allocr_alloc(alloc, layer.wv_a->grad); - ggml_allocr_alloc(alloc, layer.wv_b->grad); - ggml_allocr_alloc(alloc, layer.wo_a->grad); - ggml_allocr_alloc(alloc, layer.wo_b->grad); - ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad); - ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad); - ggml_allocr_alloc(alloc, layer.w1_a->grad); - ggml_allocr_alloc(alloc, layer.w1_b->grad); - ggml_allocr_alloc(alloc, layer.w2_a->grad); - ggml_allocr_alloc(alloc, layer.w2_b->grad); - ggml_allocr_alloc(alloc, layer.w3_a->grad); - ggml_allocr_alloc(alloc, layer.w3_b->grad); + ggml_set_param(ctx, layer.ffn_gate_a); + ggml_set_param(ctx, layer.ffn_gate_b); + ggml_set_param(ctx, layer.ffn_down_a); + ggml_set_param(ctx, layer.ffn_down_b); + ggml_set_param(ctx, layer.ffn_up_a); + ggml_set_param(ctx, layer.ffn_up_b); } } @@ -493,12 +435,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd); layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1); - layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd); - layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff); - layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff); - layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd); - layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd); - layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff); + layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd); + layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff); + layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff); + layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd); + layer.ffn_up_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_embd); + layer.ffn_up_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up, n_ff); ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i)); ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i)); @@ -512,28 +454,18 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora ggml_set_name(layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, ".weight.lora_b", i)); ggml_set_name(layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_a", i)); ggml_set_name(layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, ".weight.lora_b", i)); - ggml_set_name(layer.w1_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); - ggml_set_name(layer.w1_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); - ggml_set_name(layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); - ggml_set_name(layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); - ggml_set_name(layer.w3_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); - ggml_set_name(layer.w3_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, ".weight.lora_b", i)); + ggml_set_name(layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, ".weight.lora_a", i)); + ggml_set_name(layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, ".weight.lora_b", i)); } set_param_lora(lora); - // measure data size - size_t size = 0; - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - size += GGML_PAD(ggml_nbytes(t), tensor_alignment); - } - - // allocate data - struct ggml_allocr * alloc = NULL; - lora->data.resize(size + tensor_alignment); - alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment); - alloc_lora(alloc, lora); - ggml_allocr_free(alloc); + // allocate data for lora tensors + lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type()); } static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) { @@ -565,12 +497,12 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl randomize_tensor_normal(layer.ffn_norm_a, rnd); ggml_set_zero(layer.ffn_norm_b); - randomize_tensor_normal(layer.w1_a, rnd); - ggml_set_zero(layer.w1_b); - randomize_tensor_normal(layer.w2_a, rnd); - ggml_set_zero(layer.w2_b); - randomize_tensor_normal(layer.w3_a, rnd); - ggml_set_zero(layer.w3_b); + randomize_tensor_normal(layer.ffn_gate_a, rnd); + ggml_set_zero(layer.ffn_gate_b); + randomize_tensor_normal(layer.ffn_down_a, rnd); + ggml_set_zero(layer.ffn_down_b); + randomize_tensor_normal(layer.ffn_up_a, rnd); + ggml_set_zero(layer.ffn_up_b); } free_random_normal_distribution(rnd); @@ -579,7 +511,7 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl static struct ggml_tensor * llama_build_lora_finetune_graphs( struct my_llama_model * model, struct my_llama_lora * lora, - struct ggml_allocr * alloc, + ggml_gallocr_t alloc, struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, @@ -590,7 +522,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const int n_tokens, const int n_batch, const bool enable_flash_attn, - const bool enable_checkpointing) { + const bool enable_checkpointing, + const bool measure_only) { ggml_set_scratch(ctx, { 0, 0, nullptr, }); const int n_past = 0; @@ -622,13 +555,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); - ggml_allocr_alloc(alloc, KQ_pos); - if (!ggml_allocr_is_measure(alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < N; ++i) { - data[i] = n_past + i; - } - } + ggml_set_input(KQ_pos); // rope has so much parameters that we make a custom function for it auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale] @@ -683,13 +610,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b)); struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b)); - struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); - struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); - struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); - struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); - struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b)); - struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b)); - struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b)); + struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b)); + struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b)); + struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b)); + struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b)); + struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b)); + struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b)); + struct ggml_tensor * ffn_up = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b)); struct ggml_tensor * t02 = ggml_rms_norm (ctx, cur, rms_norm_eps); set_name(t02, "t02"); assert_shape_2d(t02, n_embd, N*n_batch); struct ggml_tensor * t03 = ggml_repeat (ctx, attention_norm, t02); set_name(t03, "t03"); assert_shape_2d(t03, n_embd, N*n_batch); @@ -732,11 +659,11 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( struct ggml_tensor * t22 = ggml_rms_norm (ctx, t21, rms_norm_eps); set_name(t22, "t22"); assert_shape_2d(t22, n_embd, N*n_batch); struct ggml_tensor * t23 = ggml_repeat (ctx, ffn_norm, t22); set_name(t23, "t23"); assert_shape_2d(t23, n_embd, N*n_batch); struct ggml_tensor * t24 = ggml_mul (ctx, t23, t22); set_name(t24, "t24"); assert_shape_2d(t24, n_embd, N*n_batch); - struct ggml_tensor * t25 = ggml_mul_mat (ctx, w3, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); - struct ggml_tensor * t26 = ggml_mul_mat (ctx, w1, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); + struct ggml_tensor * t25 = ggml_mul_mat (ctx, ffn_up, t24); set_name(t25, "t25"); assert_shape_2d(t25, n_ff, N*n_batch); + struct ggml_tensor * t26 = ggml_mul_mat (ctx, ffn_gate, t24); set_name(t26, "t26"); assert_shape_2d(t26, n_ff, N*n_batch); struct ggml_tensor * t27 = ggml_silu (ctx, t26); set_name(t27, "t27"); assert_shape_2d(t27, n_ff, N*n_batch); struct ggml_tensor * t28 = ggml_mul (ctx, t27, t25); set_name(t28, "t28"); assert_shape_2d(t28, n_ff, N*n_batch); - struct ggml_tensor * t29 = ggml_mul_mat (ctx, w2, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); + struct ggml_tensor * t29 = ggml_mul_mat (ctx, ffn_down, t28); set_name(t29, "t29"); assert_shape_2d(t29, n_embd, N*n_batch); struct ggml_tensor * t30 = ggml_add (ctx, t29, t21); set_name(t30, "t30"); assert_shape_2d(t30, n_embd, N*n_batch); cur = t30; if (enable_checkpointing) { @@ -780,7 +707,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( // input gradient ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f)); GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL); - ggml_allocr_alloc(alloc, t36->grad); + ggml_set_input(t36->grad); // KQ_pos ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f)); @@ -796,20 +723,32 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f)); ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f)); - ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f)); + ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f)); } // allocating checkpoints in one block to reduce memory fragmentation // note: they will be freed in reverse order for (unsigned int i = 0; i < checkpoints.size(); ++i) { if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) { - ggml_allocr_alloc(alloc, checkpoints[i]); + ggml_set_input(checkpoints[i]); } } - ggml_allocr_alloc_graph(alloc, gb); + if (measure_only) { + ggml_gallocr_reserve(alloc, gb); + } else { + ggml_gallocr_alloc_graph(alloc, gb); + + // set KQ_pos + { + int * data = (int *) KQ_pos->data; + for (int i = 0; i < N; ++i) { + data[i] = n_past + i; + } + } + } // remove the additional nodes and leafs for (int i = n_leafs_before; i < gb->n_leafs; ++i) { @@ -859,9 +798,9 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V); GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT); GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); - GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN); + GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP); init_lora(model, lora); @@ -886,12 +825,12 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context copy_tensor_by_name(layer.wo_b, f_ggml_ctx, ggml_get_name(layer.wo_b)); copy_tensor_by_name(layer.ffn_norm_a, f_ggml_ctx, ggml_get_name(layer.ffn_norm_a)); copy_tensor_by_name(layer.ffn_norm_b, f_ggml_ctx, ggml_get_name(layer.ffn_norm_b)); - copy_tensor_by_name(layer.w1_a, f_ggml_ctx, ggml_get_name(layer.w1_a)); - copy_tensor_by_name(layer.w1_b, f_ggml_ctx, ggml_get_name(layer.w1_b)); - copy_tensor_by_name(layer.w2_a, f_ggml_ctx, ggml_get_name(layer.w2_a)); - copy_tensor_by_name(layer.w2_b, f_ggml_ctx, ggml_get_name(layer.w2_b)); - copy_tensor_by_name(layer.w3_a, f_ggml_ctx, ggml_get_name(layer.w3_a)); - copy_tensor_by_name(layer.w3_b, f_ggml_ctx, ggml_get_name(layer.w3_b)); + copy_tensor_by_name(layer.ffn_gate_a, f_ggml_ctx, ggml_get_name(layer.ffn_gate_a)); + copy_tensor_by_name(layer.ffn_gate_b, f_ggml_ctx, ggml_get_name(layer.ffn_gate_b)); + copy_tensor_by_name(layer.ffn_down_a, f_ggml_ctx, ggml_get_name(layer.ffn_down_a)); + copy_tensor_by_name(layer.ffn_down_b, f_ggml_ctx, ggml_get_name(layer.ffn_down_b)); + copy_tensor_by_name(layer.ffn_up_a, f_ggml_ctx, ggml_get_name(layer.ffn_up_a)); + copy_tensor_by_name(layer.ffn_up_b, f_ggml_ctx, ggml_get_name(layer.ffn_up_b)); } } @@ -929,9 +868,9 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V, lora->hparams.n_rank_wv); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, lora->hparams.n_rank_wo); gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM, lora->hparams.n_rank_ffn_norm); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_w1); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_w2); - gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_w3); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE, lora->hparams.n_rank_ffn_gate); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, lora->hparams.n_rank_ffn_down); + gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP, lora->hparams.n_rank_ffn_up); gguf_add_tensor(fctx, lora->tok_embeddings_a); gguf_add_tensor(fctx, lora->tok_embeddings_b); @@ -955,12 +894,12 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod gguf_add_tensor(fctx, layer.wo_b); gguf_add_tensor(fctx, layer.ffn_norm_a); gguf_add_tensor(fctx, layer.ffn_norm_b); - gguf_add_tensor(fctx, layer.w1_a); - gguf_add_tensor(fctx, layer.w1_b); - gguf_add_tensor(fctx, layer.w2_a); - gguf_add_tensor(fctx, layer.w2_b); - gguf_add_tensor(fctx, layer.w3_a); - gguf_add_tensor(fctx, layer.w3_b); + gguf_add_tensor(fctx, layer.ffn_gate_a); + gguf_add_tensor(fctx, layer.ffn_gate_b); + gguf_add_tensor(fctx, layer.ffn_down_a); + gguf_add_tensor(fctx, layer.ffn_down_b); + gguf_add_tensor(fctx, layer.ffn_up_a); + gguf_add_tensor(fctx, layer.ffn_up_b); } } @@ -1138,9 +1077,8 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor return tn_buf.data(); }; - uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla' // write_magic - file.write_u32(LLAMA_FILE_MAGIC_LORA); // magic + file.write_u32(LLAMA_FILE_MAGIC_GGLA); // magic file.write_u32(1); // version // write_hparams file.write_u32(lora->hparams.lora_r); @@ -1166,12 +1104,12 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor write_tensor(&file, layer.wo_b, tni(LLM_TENSOR_ATTN_OUT, i, ".weight.loraB")); write_tensor(&file, layer.ffn_norm_a, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraA")); write_tensor(&file, layer.ffn_norm_b, tni(LLM_TENSOR_FFN_NORM, i, ".weight.loraB")); - write_tensor(&file, layer.w1_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); - write_tensor(&file, layer.w1_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); - write_tensor(&file, layer.w2_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); - write_tensor(&file, layer.w2_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); - write_tensor(&file, layer.w3_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); - write_tensor(&file, layer.w3_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_gate_a, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_gate_b, tni(LLM_TENSOR_FFN_GATE, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_down_a, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_down_b, tni(LLM_TENSOR_FFN_DOWN, i, ".weight.loraB")); + write_tensor(&file, layer.ffn_up_a, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraA")); + write_tensor(&file, layer.ffn_up_b, tni(LLM_TENSOR_FFN_UP, i, ".weight.loraB")); } } @@ -1201,9 +1139,9 @@ struct train_params { uint32_t n_rank_wv; uint32_t n_rank_wo; uint32_t n_rank_ffn_norm; - uint32_t n_rank_w1; - uint32_t n_rank_w2; - uint32_t n_rank_w3; + uint32_t n_rank_ffn_gate; + uint32_t n_rank_ffn_down; + uint32_t n_rank_ffn_up; uint32_t n_rank_tok_embeddings; uint32_t n_rank_norm; uint32_t n_rank_output; @@ -1214,9 +1152,9 @@ struct train_params { bool custom_n_rank_wv; bool custom_n_rank_wo; bool custom_n_rank_ffn_norm; - bool custom_n_rank_w1; - bool custom_n_rank_w2; - bool custom_n_rank_w3; + bool custom_n_rank_ffn_gate; + bool custom_n_rank_ffn_down; + bool custom_n_rank_ffn_up; bool custom_n_rank_tok_embeddings; bool custom_n_rank_norm; bool custom_n_rank_output; @@ -1248,9 +1186,9 @@ static struct train_params get_default_train_params() { params.n_rank_wv = 4; params.n_rank_wo = 4; params.n_rank_ffn_norm = 1; - params.n_rank_w1 = 4; - params.n_rank_w2 = 4; - params.n_rank_w3 = 4; + params.n_rank_ffn_gate = 4; + params.n_rank_ffn_down = 4; + params.n_rank_ffn_up = 4; params.n_rank_tok_embeddings = 4; params.n_rank_norm = 1; params.n_rank_output = 4; @@ -1261,9 +1199,9 @@ static struct train_params get_default_train_params() { params.custom_n_rank_wv = false; params.custom_n_rank_wo = false; params.custom_n_rank_ffn_norm = false; - params.custom_n_rank_w1 = false; - params.custom_n_rank_w2 = false; - params.custom_n_rank_w3 = false; + params.custom_n_rank_ffn_gate = false; + params.custom_n_rank_ffn_down = false; + params.custom_n_rank_ffn_up = false; params.custom_n_rank_tok_embeddings = false; params.custom_n_rank_norm = false; params.custom_n_rank_output = false; @@ -1294,9 +1232,9 @@ static void train_print_usage(int argc, char ** argv, const struct train_params fprintf(stderr, " --rank-wk N LORA rank for wk tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wv N LORA rank for wv tensor, overrides default rank.\n"); fprintf(stderr, " --rank-wo N LORA rank for wo tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-w1 N LORA rank for w1 tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-w2 N LORA rank for w2 tensor, overrides default rank.\n"); - fprintf(stderr, " --rank-w3 N LORA rank for w3 tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_gate N LORA rank for ffn_gate tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_down N LORA rank for ffn_down tensor, overrides default rank.\n"); + fprintf(stderr, " --rank-ffn_up N LORA rank for ffn_up tensor, overrides default rank.\n"); print_common_train_usage(argc, argv, ¶ms->common); } @@ -1431,27 +1369,27 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par } params->n_rank_wo = std::stoi(argv[i]); params->custom_n_rank_wo = true; - } else if (arg == "--rank-w1") { + } else if (arg == "--rank-ffn_gate") { if (++i >= argc) { invalid_param = true; break; } - params->n_rank_w1 = std::stoi(argv[i]); - params->custom_n_rank_w1 = true; - } else if (arg == "--rank-w2") { + params->n_rank_ffn_gate = std::stoi(argv[i]); + params->custom_n_rank_ffn_gate = true; + } else if (arg == "--rank-ffn_down") { if (++i >= argc) { invalid_param = true; break; } - params->n_rank_w2 = std::stoi(argv[i]); - params->custom_n_rank_w2 = true; - } else if (arg == "--rank-w3") { + params->n_rank_ffn_down = std::stoi(argv[i]); + params->custom_n_rank_ffn_down = true; + } else if (arg == "--rank-ffn_up") { if (++i >= argc) { invalid_param = true; break; } - params->n_rank_w3 = std::stoi(argv[i]); - params->custom_n_rank_w3 = true; + params->n_rank_ffn_up = std::stoi(argv[i]); + params->custom_n_rank_ffn_up = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); train_print_usage(argc, argv, &default_params); @@ -1514,12 +1452,12 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) { nx += ggml_nelements(layer.wo_b); nx += ggml_nelements(layer.ffn_norm_a); nx += ggml_nelements(layer.ffn_norm_b); - nx += ggml_nelements(layer.w1_a); - nx += ggml_nelements(layer.w1_b); - nx += ggml_nelements(layer.w2_a); - nx += ggml_nelements(layer.w2_b); - nx += ggml_nelements(layer.w3_a); - nx += ggml_nelements(layer.w3_b); + nx += ggml_nelements(layer.ffn_gate_a); + nx += ggml_nelements(layer.ffn_gate_b); + nx += ggml_nelements(layer.ffn_down_a); + nx += ggml_nelements(layer.ffn_down_b); + nx += ggml_nelements(layer.ffn_up_a); + nx += ggml_nelements(layer.ffn_up_b); } return nx; } @@ -1573,9 +1511,9 @@ int main(int argc, char ** argv) { uint32_t n_rank_wv = params.custom_n_rank_wv ? params.n_rank_wv : params.lora_r; uint32_t n_rank_wo = params.custom_n_rank_wo ? params.n_rank_wo : params.lora_r; uint32_t n_rank_ffn_norm = params.custom_n_rank_ffn_norm ? params.n_rank_ffn_norm : 1; - uint32_t n_rank_w1 = params.custom_n_rank_w1 ? params.n_rank_w1 : params.lora_r; - uint32_t n_rank_w2 = params.custom_n_rank_w2 ? params.n_rank_w2 : params.lora_r; - uint32_t n_rank_w3 = params.custom_n_rank_w3 ? params.n_rank_w3 : params.lora_r; + uint32_t n_rank_ffn_gate = params.custom_n_rank_ffn_gate ? params.n_rank_ffn_gate : params.lora_r; + uint32_t n_rank_ffn_down = params.custom_n_rank_ffn_down ? params.n_rank_ffn_down : params.lora_r; + uint32_t n_rank_ffn_up = params.custom_n_rank_ffn_up ? params.n_rank_ffn_up : params.lora_r; uint32_t n_rank_tok_embeddings = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r; uint32_t n_rank_norm = params.custom_n_rank_norm ? params.n_rank_norm : 1; uint32_t n_rank_output = params.custom_n_rank_output ? params.n_rank_output : params.lora_r; @@ -1585,15 +1523,15 @@ int main(int argc, char ** argv) { lora.hparams.n_rank_wv = n_rank_wv; lora.hparams.n_rank_wo = n_rank_wo; lora.hparams.n_rank_ffn_norm = n_rank_ffn_norm; - lora.hparams.n_rank_w1 = n_rank_w1; - lora.hparams.n_rank_w2 = n_rank_w2; - lora.hparams.n_rank_w3 = n_rank_w3; + lora.hparams.n_rank_ffn_gate = n_rank_ffn_gate; + lora.hparams.n_rank_ffn_down = n_rank_ffn_down; + lora.hparams.n_rank_ffn_up = n_rank_ffn_up; lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings; lora.hparams.n_rank_norm = n_rank_norm; lora.hparams.n_rank_output = n_rank_output; // set opt params from command line - opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); opt->params.print_forward_graph = false; opt->params.print_backward_graph = false; opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; @@ -1628,9 +1566,9 @@ int main(int argc, char ** argv) { || (lora.hparams.n_rank_wv != n_rank_wv) || (lora.hparams.n_rank_wo != n_rank_wo) || (lora.hparams.n_rank_ffn_norm != n_rank_ffn_norm) - || (lora.hparams.n_rank_w1 != n_rank_w1) - || (lora.hparams.n_rank_w2 != n_rank_w2) - || (lora.hparams.n_rank_w3 != n_rank_w3) + || (lora.hparams.n_rank_ffn_gate != n_rank_ffn_gate) + || (lora.hparams.n_rank_ffn_down != n_rank_ffn_down) + || (lora.hparams.n_rank_ffn_up != n_rank_ffn_up) || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings) || (lora.hparams.n_rank_norm != n_rank_norm) || (lora.hparams.n_rank_output != n_rank_output) @@ -1664,7 +1602,7 @@ int main(int argc, char ** argv) { printf("%s: seen train_samples %llu\n", __func__, (long long unsigned) train->train_samples); printf("%s: seen train_tokens %llu\n", __func__, (long long unsigned) train->train_tokens); printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs); - printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f)); + printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f)); if (params.only_write_lora) { save_train_files_data save_data; @@ -1691,10 +1629,6 @@ int main(int argc, char ** argv) { int n_vocab = model.hparams.n_vocab; int n_batch = params.common.n_batch; - - std::vector mem_input_data; - std::vector mem_compute_data; - // context for input tensors without their data struct ggml_init_params ctx_input_params = { ggml_tensor_overhead() * 2, // mem_size @@ -1707,17 +1641,11 @@ int main(int argc, char ** argv) { struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch); struct ggml_tensor * target_probs = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab, n_tokens, n_batch); - // measure required memory for input tensors - size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) + - GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) + - tensor_alignment; - printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); - // allocate input tensors - mem_input_data.resize(max_input_size); - ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment); - ggml_allocr_alloc(alloc_inps, tokens_input); - ggml_allocr_alloc(alloc_inps, target_probs); + // measure required memory for input tensors + ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type()); + size_t max_input_size = ggml_backend_buffer_get_size(input_data); + printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f)); // context for compute tensors without their data const size_t estimated_compute_size_wo_data = ( @@ -1744,7 +1672,7 @@ int main(int argc, char ** argv) { // find best evaluation order for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) { ctx_compute = ggml_init(ctx_compute_params); - ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment); + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = (enum ggml_cgraph_eval_order) order; gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); @@ -1757,14 +1685,15 @@ int main(int argc, char ** argv) { &logits, tokens_input, target_probs, n_tokens, n_batch, params.common.use_flash, - params.common.use_checkpointing + params.common.use_checkpointing, + true ); - size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment; + size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer if (max_compute_size < best_compute_size) { best_compute_size = max_compute_size; best_order = gf->order; } - ggml_allocr_free(alloc); + ggml_gallocr_free(alloc); ggml_free(ctx_compute); } size_t max_compute_size = best_compute_size; @@ -1775,9 +1704,8 @@ int main(int argc, char ** argv) { "invalid"); // allocate compute tensors - mem_compute_data.resize(max_compute_size); ctx_compute = ggml_init(ctx_compute_params); - ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment); + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); gf->order = best_order; gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true); @@ -1790,17 +1718,17 @@ int main(int argc, char ** argv) { &logits, tokens_input, target_probs, n_tokens, n_batch, params.common.use_flash, - params.common.use_checkpointing + params.common.use_checkpointing, + false ); - ggml_allocr_free(alloc); - ggml_allocr_free(alloc_inps); - // tokenize data std::vector train_tokens; std::vector train_samples_begin; std::vector train_samples_size; - printf("%s: tokenize training data\n", __func__); + printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data); + printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str()); + printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false"); tokenize_file(lctx, params.common.fn_train_data, params.common.sample_start, @@ -1907,6 +1835,8 @@ int main(int argc, char ** argv) { ggml_free(ctx_work); ggml_free(ctx_compute); ggml_free(ctx_input); + ggml_gallocr_free(alloc); + int64_t t1 = ggml_time_ms(); printf("%s: total training time: ", __func__); diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt new file mode 100644 index 000000000..166e3ad2a --- /dev/null +++ b/examples/gbnf-validator/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET gbnf-validator) +add_executable(${TARGET} gbnf-validator.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp new file mode 100644 index 000000000..e4c0c1689 --- /dev/null +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -0,0 +1,132 @@ +#define LLAMA_API_INTERNAL + +#include "grammar-parser.h" +#include "ggml.h" +#include "llama.h" +#include "unicode.h" + +#include +#include +#include +#include + +static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { + auto decoded = decode_utf8(input_str, {}); + const auto & code_points = decoded.first; + + size_t pos = 0; + for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { + auto prev_stacks = grammar->stacks; + grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); + if (grammar->stacks.empty()) { + error_pos = pos; + error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'"; + grammar->stacks = prev_stacks; + return false; + } + ++pos; + } + + for (const auto & stack : grammar->stacks) { + if (stack.empty()) { + return true; + } + } + + error_pos = pos; + error_msg = "Unexpected end of input"; + return false; +} + +static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) { + fprintf(stdout, "Input string is invalid according to the grammar.\n"); + fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos); + fprintf(stdout, "\n"); + fprintf(stdout, "Input string:\n"); + fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str()); + if (error_pos < input_str.size()) { + fprintf(stdout, "\033[1;31m%c", input_str[error_pos]); + if (error_pos+1 < input_str.size()) { + fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str()); + } + fprintf(stdout, "\033[0m\n"); + } +} + +int main(int argc, char** argv) { + if (argc != 3) { + fprintf(stdout, "Usage: %s \n", argv[0]); + return 1; + } + + const std::string grammar_filename = argv[1]; + const std::string input_filename = argv[2]; + + // Read the GBNF grammar file + FILE* grammar_file = fopen(grammar_filename.c_str(), "r"); + if (!grammar_file) { + fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str()); + return 1; + } + + fseek(grammar_file, 0, SEEK_END); + size_t grammar_size = ftell(grammar_file); + fseek(grammar_file, 0, SEEK_SET); + + std::string grammar_str(grammar_size, ' '); + fread(&grammar_str[0], 1, grammar_size, grammar_file); + fclose(grammar_file); + + // Parse the GBNF grammar + auto parsed_grammar = grammar_parser::parse(grammar_str.c_str()); + + // will be empty (default) if there are parse errors + if (parsed_grammar.rules.empty()) { + fprintf(stdout, "%s: failed to parse grammar\n", __func__); + return 1; + } + + // Ensure that there is a "root" node. + if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) { + fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__); + return 1; + } + + std::vector grammar_rules(parsed_grammar.c_rules()); + + // Create the LLAMA grammar + auto grammar = llama_grammar_init( + grammar_rules.data(), + grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + + // Read the input file + FILE* input_file = fopen(input_filename.c_str(), "r"); + if (!input_file) { + fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str()); + return 1; + } + + fseek(input_file, 0, SEEK_END); + size_t input_size = ftell(input_file); + fseek(input_file, 0, SEEK_SET); + + std::string input_str(input_size, ' '); + fread(&input_str[0], 1, input_size, input_file); + fclose(input_file); + + // Validate the input string against the grammar + size_t error_pos; + std::string error_msg; + bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg); + + if (is_valid) { + fprintf(stdout, "Input string is valid according to the grammar.\n"); + } else { + print_error_message(input_str, error_pos, error_msg); + } + + // Clean up + llama_grammar_free(grammar); + + return 0; +} diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt new file mode 100644 index 000000000..828e62435 --- /dev/null +++ b/examples/gguf-split/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET gguf-split) +add_executable(${TARGET} gguf-split.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gguf-split/README.md b/examples/gguf-split/README.md new file mode 100644 index 000000000..ddb1f7649 --- /dev/null +++ b/examples/gguf-split/README.md @@ -0,0 +1,9 @@ +## GGUF split Example + +CLI to split / merge GGUF files. + +**Command line options:** + +- `--split`: split GGUF to multiple GGUF, default operation. +- `--split-max-tensors`: maximum tensors in each split: default(128) +- `--merge`: merge multiple GGUF to a single GGUF. diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp new file mode 100644 index 000000000..24acbf02a --- /dev/null +++ b/examples/gguf-split/gguf-split.cpp @@ -0,0 +1,553 @@ +#include "llama.h" +#include "common.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if defined(_WIN32) + #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif + #include +#endif + +enum split_operation : uint8_t { + SPLIT_OP_SPLIT, + SPLIT_OP_MERGE, +}; + +struct split_params { + split_operation operation = SPLIT_OP_SPLIT; + size_t n_bytes_split = 0; + int n_split_tensors = 128; + std::string input; + std::string output; + bool dry_run = false; +}; + +static void split_print_usage(const char * executable) { + const split_params default_params; + printf("\n"); + printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable); + printf("\n"); + printf("Apply a GGUF operation on IN to OUT."); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" --version show version and build info\n"); + printf(" --split split GGUF to multiple GGUF (enabled by default)\n"); + printf(" --merge merge multiple GGUF to a single GGUF\n"); + printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors); + printf(" --split-max-size N(M|G) max size per split\n"); + printf(" --dry-run only print out a split plan and exit, without writing any new files\n"); + printf("\n"); +} + +// return convert string, for example "128M" or "4G" to number of bytes +static size_t split_str_to_n_bytes(std::string str) { + size_t n_bytes = 0; + int n; + if (str.back() == 'M') { + sscanf(str.c_str(), "%d", &n); + n_bytes = n * 1024 * 1024; // megabytes + } else if (str.back() == 'G') { + sscanf(str.c_str(), "%d", &n); + n_bytes = n * 1024 * 1024 * 1024; // gigabytes + } else { + throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back())); + } + if (n <= 0) { + throw std::invalid_argument("error: size must be a positive value"); + } + return n_bytes; +} + +static void split_params_parse_ex(int argc, const char ** argv, split_params & params) { + std::string arg; + const std::string arg_prefix = "--"; + bool invalid_param = false; + + int arg_idx = 1; + for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { + arg = argv[arg_idx]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + bool arg_found = false; + bool is_op_set = false; + bool is_mode_set = false; + if (arg == "-h" || arg == "--help") { + split_print_usage(argv[0]); + exit(0); + } + if (arg == "--version") { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + if (arg == "--dry-run") { + arg_found = true; + params.dry_run = true; + } + + if (is_op_set) { + throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); + } + if (arg == "--merge") { + arg_found = true; + is_op_set = true; + params.operation = SPLIT_OP_MERGE; + } + if (arg == "--split") { + arg_found = true; + is_op_set = true; + params.operation = SPLIT_OP_SPLIT; + } + + if (is_mode_set) { + throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); + } + if (arg == "--split-max-tensors") { + if (++arg_idx >= argc) { + invalid_param = true; + break; + } + arg_found = true; + is_mode_set = true; + params.n_split_tensors = atoi(argv[arg_idx]); + } + if (arg == "--split-max-size") { + if (++arg_idx >= argc) { + invalid_param = true; + break; + } + arg_found = true; + is_mode_set = true; + params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]); + } + + if (!arg_found) { + throw std::invalid_argument("error: unknown argument: " + arg); + } + } + + if (invalid_param) { + throw std::invalid_argument("error: invalid parameter for argument: " + arg); + } + + if (argc - arg_idx < 2) { + throw std::invalid_argument("error: bad arguments"); + } + + params.input = argv[arg_idx++]; + params.output = argv[arg_idx++]; +} + +static bool split_params_parse(int argc, const char ** argv, split_params & params) { + bool result = true; + try { + split_params_parse_ex(argc, argv, params); + } + catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + split_print_usage(argv[0]); + exit(EXIT_FAILURE); + } + return result; +} + +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} + +struct split_strategy { + const split_params params; + std::ifstream & f_input; + struct gguf_context * ctx_gguf; + struct ggml_context * ctx_meta = NULL; + const int n_tensors; + + // one ctx_out per one output file + std::vector ctx_outs; + + // temporary buffer for reading in tensor data + std::vector read_buf; + + split_strategy(const split_params & params, + std::ifstream & f_input, + struct gguf_context * ctx_gguf, + struct ggml_context * ctx_meta) : + params(params), + f_input(f_input), + ctx_gguf(ctx_gguf), + ctx_meta(ctx_meta), + n_tensors(gguf_get_n_tensors(ctx_gguf)) { + + // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits + int i_split = -1; + struct gguf_context * ctx_out = NULL; + auto new_ctx_out = [&]() { + i_split++; + if (ctx_out != NULL) { + if (gguf_get_n_tensors(ctx_out) == 0) { + fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n"); + exit(EXIT_FAILURE); + } + ctx_outs.push_back(ctx_out); + } + ctx_out = gguf_init_empty(); + // Save all metadata in first split only + if (i_split == 0) { + gguf_set_kv(ctx_out, ctx_gguf); + } + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split); + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder + gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors); + }; + + // initialize ctx_out for the first split + new_ctx_out(); + + // process tensors one by one + size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata) + for (int i = 0; i < n_tensors; ++i) { + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); + // calculate the "imaginary" size = the current size + next tensor size + size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT); + size_t next_tensors_size = curr_tensors_size + n_bytes; + if (should_split(i, next_tensors_size)) { + new_ctx_out(); + curr_tensors_size = n_bytes; + } else { + curr_tensors_size = next_tensors_size; + } + gguf_add_tensor(ctx_out, t); + } + + // push the last ctx_out + ctx_outs.push_back(ctx_out); + + // set the correct n_split for all ctx_out + for (auto & ctx : ctx_outs) { + gguf_set_val_u16(ctx, LLM_KV_SPLIT_COUNT, ctx_outs.size()); + } + } + + ~split_strategy() { + for (auto & ctx_out : ctx_outs) { + gguf_free(ctx_out); + } + } + + bool should_split(int i_tensor, size_t next_size) { + if (params.n_bytes_split > 0) { + // split by max size per file + return next_size > params.n_bytes_split; + } else { + // split by number of tensors per file + return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; + } + } + + void print_info() { + printf("n_split: %ld\n", ctx_outs.size()); + int i_split = 0; + for (auto & ctx_out : ctx_outs) { + // re-calculate the real gguf size for each split (= metadata size + total size of all tensors) + size_t total_size = gguf_get_meta_size(ctx_out); + for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i)); + total_size += ggml_nbytes(t); + } + total_size = total_size / 1024 / 1024; // convert to megabytes + printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); + i_split++; + } + } + + void write() { + int i_split = 0; + int n_split = ctx_outs.size(); + for (auto & ctx_out : ctx_outs) { + // construct file path + char split_path[PATH_MAX] = {0}; + llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); + + // open the output file + printf("Writing file %s ... ", split_path); + fflush(stdout); + std::ofstream fout = std::ofstream(split_path, std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + // write metadata + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *)data.data(), data.size()); + + // write tensors + for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { + // read tensor meta and prepare buffer + const char * t_name = gguf_get_tensor_name(ctx_out, i); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + auto n_bytes = ggml_nbytes(t); + read_buf.resize(n_bytes); + + // calculate offset + auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); + + // copy tensor from input to output file + copy_file_to_file(f_input, fout, offset, n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + } + + printf("done\n"); + // close the file + fout.close(); + i_split++; + } + } + + void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) { + // TODO: detect OS and use copy_file_range() here for better performance + if (read_buf.size() < len) { + read_buf.resize(len); + } + f_in.seekg(in_offset); + f_in.read((char *)read_buf.data(), len); + f_out.write((const char *)read_buf.data(), len); + } +}; + +static void gguf_split(const split_params & split_params) { + struct ggml_context * ctx_meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + }; + + std::ifstream f_input(split_params.input.c_str(), std::ios::binary); + if (!f_input.is_open()) { + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(EXIT_FAILURE); + } + + auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(EXIT_FAILURE); + } + + // prepare the strategy + split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta); + int n_split = strategy.ctx_outs.size(); + strategy.print_info(); + + if (!split_params.dry_run) { + // write all output splits + strategy.write(); + } + + // done, clean up + gguf_free(ctx_gguf); + f_input.close(); + + fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n", + __func__, n_split, strategy.n_tensors); +} + +static void gguf_merge(const split_params & split_params) { + fprintf(stderr, "%s: %s -> %s\n", + __func__, split_params.input.c_str(), + split_params.output.c_str()); + int n_split = 1; + int total_tensors = 0; + + auto * ctx_out = gguf_init_empty(); + std::ofstream fout(split_params.output.c_str(), std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + std::vector read_data; + std::vector ctx_metas; + std::vector ctx_ggufs; + + char split_path[PATH_MAX] = {0}; + strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1); + char split_prefix[PATH_MAX] = {0}; + + // First pass to find KV and tensors metadata + for (int i_split = 0; i_split < n_split; i_split++) { + struct ggml_context * ctx_meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + }; + + if (i_split > 0) { + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + } + fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path); + + auto * ctx_gguf = gguf_init_from_file(split_path, params); + if (!ctx_gguf) { + fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(EXIT_FAILURE); + } + ctx_ggufs.push_back(ctx_gguf); + ctx_metas.push_back(ctx_meta); + + if (i_split == 0) { + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); + if (key_n_split < 0) { + fprintf(stderr, + "\n%s: input file does not contain %s metadata\n", + __func__, + LLM_KV_SPLIT_COUNT); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + gguf_free(ctx_out); + fout.close(); + exit(EXIT_FAILURE); + } + + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); + if (n_split < 1) { + fprintf(stderr, + "\n%s: input file does not contain a valid split count %d\n", + __func__, + n_split); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + gguf_free(ctx_out); + fout.close(); + exit(EXIT_FAILURE); + } + + // Verify the file naming and extract split_prefix + if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) { + fprintf(stderr, "\n%s: unexpected input file name: %s" + " i_split=%d" + " n_split=%d\n", __func__, + split_path, i_split, n_split); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + gguf_free(ctx_out); + fout.close(); + exit(EXIT_FAILURE); + } + + // Do not trigger merge if we try to merge again the output + gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0); + + // Set metadata from the first split + gguf_set_kv(ctx_out, ctx_gguf); + } + + auto n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + gguf_add_tensor(ctx_out, t); + } + total_tensors += n_tensors; + + fprintf(stderr, "\033[3Ddone\n"); + } + + // placeholder for the meta data + { + auto meta_size = gguf_get_meta_size(ctx_out); + ::zeros(fout, meta_size); + } + + // Write tensors data + for (int i_split = 0; i_split < n_split; i_split++) { + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + std::ifstream f_input(split_path, std::ios::binary); + if (!f_input.is_open()) { + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path); + for (uint32_t i = 0; i < ctx_ggufs.size(); i++) { + gguf_free(ctx_ggufs[i]); + ggml_free(ctx_metas[i]); + } + gguf_free(ctx_out); + fout.close(); + exit(EXIT_FAILURE); + } + fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path); + + auto * ctx_gguf = ctx_ggufs[i_split]; + auto * ctx_meta = ctx_metas[i_split]; + + auto n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + + auto n_bytes = ggml_nbytes(t); + + if (read_data.size() < n_bytes) { + read_data.resize(n_bytes); + } + + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); + f_input.seekg(offset); + f_input.read((char *)read_data.data(), n_bytes); + + // write tensor data + padding + fout.write((const char *)read_data.data(), n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + } + + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + f_input.close(); + fprintf(stderr, "\033[3Ddone\n"); + } + + { + // go back to beginning of file and write the updated metadata + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *)data.data(), data.size()); + + fout.close(); + gguf_free(ctx_out); + } + + fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n", + __func__, split_params.output.c_str(), n_split, total_tensors); +} + +int main(int argc, const char ** argv) { + split_params params; + split_params_parse(argc, argv, params); + + switch (params.operation) { + case SPLIT_OP_SPLIT: gguf_split(params); + break; + case SPLIT_OP_MERGE: gguf_merge(params); + break; + default: split_print_usage(argv[0]); + exit(EXIT_FAILURE); + } + + return 0; +} diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index e67be4fb2..5444503a5 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -211,6 +211,7 @@ static bool gguf_ex_read_1(const std::string & fname) { for (int j = 0; j < ggml_nelements(cur); ++j) { if (data[j] != 100 + i) { fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]); + gguf_free(ctx); return false; } } diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt new file mode 100644 index 000000000..ac4a5ae79 --- /dev/null +++ b/examples/gritlm/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET gritlm) +add_executable(${TARGET} gritlm.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md new file mode 100644 index 000000000..64cc19204 --- /dev/null +++ b/examples/gritlm/README.md @@ -0,0 +1,62 @@ +## Generative Representational Instruction Tuning (GRIT) Example +[gritlm] a model which can generate embeddings as well as "normal" text +generation depending on the instructions in the prompt. + +* Paper: https://arxiv.org/pdf/2402.09906.pdf + +### Retrieval-Augmented Generation (RAG) use case +One use case for `gritlm` is to use it with RAG. If we recall how RAG works is +that we take documents that we want to use as context, to ground the large +language model (LLM), and we create token embeddings for them. We then store +these token embeddings in a vector database. + +When we perform a query, prompt the LLM, we will first create token embeddings +for the query and then search the vector database to retrieve the most +similar vectors, and return those documents so they can be passed to the LLM as +context. Then the query and the context will be passed to the LLM which will +have to _again_ create token embeddings for the query. But because gritlm is used +the first query can be cached and the second query tokenization generation does +not have to be performed at all. + +### Running the example +Download a Grit model: +```console +$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf +``` + +Run the example using the downloaded model: +```console +$ ./gritlm -m gritlm-7b_q4_1.gguf + +Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605 +Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103 +Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112 +Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547 + +Oh, brave adventurer, who dared to climb +The lofty peak of Mt. Fuji in the night, +When shadows lurk and ghosts do roam, +And darkness reigns, a fearsome sight. + +Thou didst set out, with heart aglow, +To conquer this mountain, so high, +And reach the summit, where the stars do glow, +And the moon shines bright, up in the sky. + +Through the mist and fog, thou didst press on, +With steadfast courage, and a steadfast will, +Through the darkness, thou didst not be gone, +But didst climb on, with a steadfast skill. + +At last, thou didst reach the summit's crest, +And gazed upon the world below, +And saw the beauty of the night's best, +And felt the peace, that only nature knows. + +Oh, brave adventurer, who dared to climb +The lofty peak of Mt. Fuji in the night, +Thou art a hero, in the eyes of all, +For thou didst conquer this mountain, so bright. +``` + +[gritlm]: https://github.com/ContextualAI/gritlm diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp new file mode 100644 index 000000000..52fd719b3 --- /dev/null +++ b/examples/gritlm/gritlm.cpp @@ -0,0 +1,215 @@ +#include "common.h" +#include "llama.h" + +#include +#include + +// #define GRIT_DEBUG + +static std::vector> encode(llama_context * ctx, const std::vector & sentences, const std::string & instruction) { + std::vector> result; + + const llama_model * mdl = llama_get_model(ctx); + + llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); + + for (uint64_t i = 0; i < sentences.size(); i++) { + llama_batch_clear(batch); + + const std::string input_string = instruction + sentences[i]; + + std::vector inputs = llama_tokenize(mdl, input_string, true, false); + + const int32_t n_toks = inputs.size(); + + // GritLM seems to have EOS = "" + // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18 + // inputs.push_back(llama_token_eos(mdl)); + + // we want to ignore instruction tokens for mean pooling + const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size(); + +#ifdef GRIT_DEBUG + // debug tokens - should be matching as referenced in the GritLM sample + std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) { + std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str()); + }); + std::printf("\n"); +#endif + + // add input to batch (this increments n_tokens) + for (int32_t j = 0; j < n_toks; j++) { + llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst); + } + + // clear previous kv_cache values (irrelevant for embeddings) + llama_kv_cache_clear(ctx); + llama_set_causal_attn(ctx, false); + + // run model + llama_decode(ctx, batch); + + // get embedding dimensions + uint64_t n_embd = llama_n_embd(mdl); + + // allocate embedding output + std::vector emb_unorm(n_embd, 0.0f); + + // sum up all token embeddings + for (int32_t k = n_inst; k < n_toks; k++) { + float * emb = llama_get_embeddings_ith(ctx, k); + for (uint64_t j = 0; j < n_embd; j++) { + emb_unorm[j] += emb[j]; + } + } + + // divide by number of tokens (mean pooling) + { + const uint64_t n_sent = n_toks - n_inst; + + for (uint64_t j = 0; j < n_embd; j++) { + emb_unorm[j] /= n_sent; + } + } + + std::vector emb_norm(emb_unorm.size()); + llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd); + result.push_back(emb_norm); + +#ifdef GRIT_DEBUG + // print out emb_norm + std::printf("embedding %ld: ", i); + for (uint64_t j = 0; j < n_embd; j++) { + std::printf("%.5f ", emb_norm[j]); + } + std::printf("\n\n"); +#endif + } + + llama_batch_free(batch); + + return result; +} + +static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) { + std::string result; + + const llama_model * mdl = llama_get_model(ctx); + llama_token eos_token = llama_token_eos(mdl); + + llama_kv_cache_clear(ctx); + llama_set_causal_attn(ctx, true); + llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); + + std::vector inputs = llama_tokenize(mdl, prompt, false, true); + int32_t i_current_token = 0; + + while (true) { + llama_batch_clear(bat); + auto n_inputs = (int32_t)inputs.size(); + for (int32_t i = 0; i < n_inputs; i++) { + llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1); + } + inputs.clear(); + + llama_decode(ctx, bat); + auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1); + + auto candidates = std::vector(llama_n_vocab(mdl)); + auto n_candidates = (int32_t)candidates.size(); + for (int32_t token = 0; token < n_candidates; token++) { + candidates[token] = llama_token_data{ token, logits[token], 0.0f }; + } + auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false }; + + llama_token token = llama_sample_token_greedy(ctx, &candidates_p); + if (token == eos_token) { + break; + } + + std::string piece = llama_token_to_piece(ctx, token); + if (stream) { + std::printf("%s", piece.c_str()); + std::fflush(stdout); + } + + inputs.push_back(token); + + result += piece; + } + + if (stream) { + std::printf("\n"); + } + + llama_batch_free(bat); + + return result; +} + +static std::string gritlm_instruction(const std::string & instruction) { + return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n"; +} + +int main(int argc, char * argv[]) { + gpt_params params; + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + + llama_model_params mparams = llama_model_params_from_gpt_params(params); + llama_context_params cparams = llama_context_params_from_gpt_params(params); + + llama_backend_init(); + + llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams); + + // create new context - set to embedding mode + cparams.embeddings = true; + llama_context * ctx = llama_new_context_with_model(mdl, cparams); + + // ### Embedding/Representation ### + // samples taken from: https://github.com/ContextualAI/gritlm#basic + { + const std::string instruction = "Given a scientific paper title, retrieve the paper's abstract"; + + const std::vector queries = { + "Bitcoin: A Peer-to-Peer Electronic Cash System", + "Generative Representational Instruction Tuning", + }; + + const std::vector documents = { + "A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.", + "All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.", + }; + + // No need to add instruction for retrieval documents + const std::vector> d_rep = encode(ctx, documents, gritlm_instruction("")); + const std::vector> q_rep = encode(ctx, queries, gritlm_instruction(instruction)); + + const int n_embd = llama_n_embd(mdl); + + const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd); + const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd); + const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd); + + std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0); + std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1); + std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0); + std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1); + } + + // ### Generation ### + // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction + { + const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n"; + std::string response = generate(ctx, prompt, true); + } + + llama_free(ctx); + llama_free_model(mdl); + llama_backend_free(); + + return 0; +} diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt new file mode 100644 index 000000000..d688a1620 --- /dev/null +++ b/examples/imatrix/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET imatrix) +add_executable(${TARGET} imatrix.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md new file mode 100644 index 000000000..458c01b87 --- /dev/null +++ b/examples/imatrix/README.md @@ -0,0 +1,32 @@ +# llama.cpp/examples/imatrix + +Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models. +More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861 + +## Usage + +``` +./imatrix -m -f [-o ] [--verbosity ] + [-ofreq num_chunks] [-ow <0 or 1>] [other common params] +``` + +Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory. +The parameters in square brackets are optional and have the following meaning: +* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used. +* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`. +* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks) +* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default. + +For faster computation, make sure to use GPU offloading via the `-ngl` argument + +## Example + +```bash +LLAMA_CUDA=1 make -j + +# generate importance matrix (imatrix.dat) +./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 + +# use the imatrix to perform a Q4_K_M quantization +./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m +``` diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp new file mode 100644 index 000000000..d8cb0a642 --- /dev/null +++ b/examples/imatrix/imatrix.cpp @@ -0,0 +1,647 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +struct Stats { + std::vector values; + int ncall = 0; +}; + +struct StatParams { + std::string ofile = "imatrix.dat"; + int n_output_frequency = 10; + int verbosity = 1; + int keep_every = 0; + bool collect_output_weight = false; +}; + +class IMatrixCollector { +public: + IMatrixCollector() = default; + void set_parameters(StatParams&& params) { m_params = std::move(params); } + bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); + void save_imatrix() const; + bool load_imatrix(const char * file_name, bool add); + static bool load_imatrix(const char * file_name, std::unordered_map& imatrix); +private: + std::unordered_map m_stats; + StatParams m_params; + std::mutex m_mutex; + int m_last_call = 0; + std::vector m_src1_data; + std::vector m_ids; // the expert ids from ggml_mul_mat_id + // + void save_imatrix(const char * file_name) const; + void keep_imatrix(int ncall) const; +}; + +// remove any prefix and suffixes from the name +// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight +static std::string filter_tensor_name(const char * name) { + std::string wname; + const char * p = strchr(name, '#'); + if (p != NULL) { + p = p + 1; + const char * q = strchr(p, '#'); + if (q != NULL) { + wname = std::string(p, q - p); + } else { + wname = p; + } + } else { + wname = name; + } + return wname; +} + +bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + GGML_UNUSED(user_data); + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + std::string wname = filter_tensor_name(src0->name); + + // when ask is true, the scheduler wants to know if we are interested in data from this tensor + // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection + if (ask) { + if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications + if (t->op != GGML_OP_MUL_MAT) return false; + if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; + if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false; + return true; + } + + std::lock_guard lock(m_mutex); + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(src1->buffer); + + if (!is_host) { + m_src1_data.resize(ggml_nelements(src1)); + ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1)); + } + + const float * data = is_host ? (const float *) src1->data : m_src1_data.data(); + + // this has been adapted to the new format of storing merged experts in a single 3d tensor + // ref: https://github.com/ggerganov/llama.cpp/pull/6387 + if (t->op == GGML_OP_MUL_MAT_ID) { + const int idx = ((int32_t *) t->op_params)[0]; + const ggml_tensor * ids = t->src[2]; + const int n_as = src0->ne[2]; + + // the top-k selected expert ids are stored in the ids tensor + // for simplicity, always copy ids to host, because it is small + // take into account that ids is not contiguous! + GGML_ASSERT(ids->ne[1] == src1->ne[1]); + GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int))); + m_ids.resize(ggml_nbytes(ids)/sizeof(int)); + ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids)); + + auto & e = m_stats[wname]; + + ++e.ncall; + // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger + // using the following line, we can correct for that if needed by replacing the line above with: + //if (idx == t->src[0]->ne[0] - 1) ++e.ncall; + + // loop over all possible experts, regardless if they are used or not in the batch + for (int ex = 0; ex < n_as; ++ex) { + size_t e_start = ex*src1->ne[0]; + if (e.values.empty()) { + e.values.resize(src1->ne[0]*n_as, 0); + } + else if (e.values.size() != (size_t)src1->ne[0]*n_as) { + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); + exit(1); //GGML_ASSERT(false); + } + if (m_params.verbosity > 1) { + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + } + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const int excur = m_ids[row*n_as + idx]; + GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check + if (excur != ex) continue; + const float * x = data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[e_start + j] += x[j]*x[j]; + } + } + if (e.ncall > m_last_call) { + m_last_call = e.ncall; + if (m_last_call % m_params.n_output_frequency == 0) { + save_imatrix(); + } + if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { + keep_imatrix(m_last_call); + } + } + } + } else { + auto& e = m_stats[wname]; + if (e.values.empty()) { + e.values.resize(src1->ne[0], 0); + } + else if (e.values.size() != (size_t)src1->ne[0]) { + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); + exit(1); //GGML_ASSERT(false); + } + ++e.ncall; + if (m_params.verbosity > 1) { + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + } + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const float * x = data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[j] += x[j]*x[j]; + } + } + if (e.ncall > m_last_call) { + m_last_call = e.ncall; + if (m_last_call % m_params.n_output_frequency == 0) { + save_imatrix(); + } + if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { + keep_imatrix(m_last_call); + } + } + } + + return true; +} + +void IMatrixCollector::save_imatrix() const { + save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str()); +} + +void IMatrixCollector::keep_imatrix(int ncall) const { + auto file_name = m_params.ofile; + if (file_name.empty()) file_name = "imatrix.dat"; + file_name += ".at_"; + file_name += std::to_string(ncall); + save_imatrix(file_name.c_str()); +} + +void IMatrixCollector::save_imatrix(const char * fname) const { + std::ofstream out(fname, std::ios::binary); + int n_entries = m_stats.size(); + out.write((const char*)&n_entries, sizeof(n_entries)); + for (auto& p : m_stats) { + int len = p.first.size(); + out.write((const char*)&len, sizeof(len)); + out.write(p.first.c_str(), len); + out.write((const char*)&p.second.ncall, sizeof(p.second.ncall)); + int nval = p.second.values.size(); + out.write((const char*)&nval, sizeof(nval)); + if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float)); + } + if (m_params.verbosity > 0) { + fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname); + } +} + +bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map& imatrix_data) { + std::ifstream in(imatrix_file, std::ios::binary); + if (!in) { + printf("%s: failed to open %s\n",__func__,imatrix_file); + return false; + } + int n_entries; + in.read((char*)&n_entries, sizeof(n_entries)); + if (in.fail() || n_entries < 1) { + printf("%s: no data in file %s\n", __func__, imatrix_file); + return false; + } + for (int i = 0; i < n_entries; ++i) { + int len; in.read((char *)&len, sizeof(len)); + std::vector name_as_vec(len+1); + in.read((char *)name_as_vec.data(), len); + if (in.fail()) { + printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file); + return false; + } + name_as_vec[len] = 0; + std::string name{name_as_vec.data()}; + auto& e = imatrix_data[std::move(name)]; + int ncall; + in.read((char*)&ncall, sizeof(ncall)); + int nval; + in.read((char *)&nval, sizeof(nval)); + if (in.fail() || nval < 1) { + printf("%s: failed reading number of values for entry %d\n",__func__,i); + imatrix_data = {}; + return false; + } + e.values.resize(nval); + in.read((char*)e.values.data(), nval*sizeof(float)); + if (in.fail()) { + printf("%s: failed reading data for entry %d\n",__func__,i); + imatrix_data = {}; + return false; + } + e.ncall = ncall; + } + return true; +} + +bool IMatrixCollector::load_imatrix(const char * file_name, bool add) { + if (!add) { + m_stats.clear(); + } + return load_imatrix(file_name, m_stats); +} + +static IMatrixCollector g_collector; + +static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { + return g_collector.collect_imatrix(t, ask, user_data); +} + + +struct results_log_softmax { + double log_softmax; + float logit; + float prob; +}; + +static std::vector softmax(const std::vector& logits) { + std::vector probs(logits.size()); + float max_logit = logits[0]; + for (float v : logits) { + max_logit = std::max(max_logit, v); + } + double sum_exp = 0.0; + for (size_t i = 0; i < logits.size(); i++) { + // Subtract the maximum logit value from the current logit value for numerical stability + const float logit = logits[i] - max_logit; + const float exp_logit = expf(logit); + sum_exp += exp_logit; + probs[i] = exp_logit; + } + for (size_t i = 0; i < probs.size(); i++) { + probs[i] /= sum_exp; + } + return probs; +} + +static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { + float max_logit = logits[0]; + for (int i = 1; i < n_vocab; ++i) { + max_logit = std::max(max_logit, logits[i]); + } + double sum_exp = 0.0; + for (int i = 0; i < n_vocab; ++i) { + sum_exp += expf(logits[i] - max_logit); + } + return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; +} + +static void process_logits( + int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, + double & nll, double & nll2, float * logit_history, float * prob_history +) { + std::mutex mutex; + int counter = 0; + auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { + double local_nll = 0; + double local_nll2 = 0; + while (true) { + std::unique_lock lock(mutex); + int i = counter++; + if (i >= n_token) { + nll += local_nll; nll2 += local_nll2; + break; + } + lock.unlock(); + const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); + const double v = -results.log_softmax; + local_nll += v; + local_nll2 += v*v; + + logit_history[i] = results.logit; + prob_history[i] = results.prob; + } + }; + for (auto & w : workers) { + w = std::thread(compute); + } + compute(); + for (auto & w : workers) { + w.join(); + } +} + +static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) { + + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + const int n_ctx = llama_n_ctx(ctx); + + auto tim1 = std::chrono::high_resolution_clock::now(); + fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + + std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + + auto tim2 = std::chrono::high_resolution_clock::now(); + fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + + if (from_chunk > 0) { + if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) { + fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk); + return false; + } + fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx); + tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx); + } + + if (int(tokens.size()) < 2*n_ctx) { + fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, + n_ctx); + fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + return false; + } + + std::vector logit_history; + std::vector prob_history; + + if (compute_ppl) { + logit_history.resize(tokens.size()); + prob_history.resize(tokens.size()); + } + + const int n_chunk_max = tokens.size() / n_ctx; + + const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_batch = params.n_batch; + + int count = 0; + double nll = 0.0; + double nll2 = 0.0; + + fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); + + std::vector workers(std::thread::hardware_concurrency() - 1); + + const int num_batches = (n_ctx + n_batch - 1) / n_batch; + + std::vector logits; + if (compute_ppl && num_batches > 1) { + logits.reserve((size_t)n_ctx * n_vocab); + } + + for (int i = 0; i < n_chunk; ++i) { + const int start = i * n_ctx; + const int end = start + n_ctx; + + std::vector logits; + + const auto t_start = std::chrono::high_resolution_clock::now(); + + // clear the KV cache + llama_kv_cache_clear(ctx); + + for (int j = 0; j < num_batches; ++j) { + const int batch_start = start + j * n_batch; + const int batch_size = std::min(end - batch_start, n_batch); + + // save original token and restore it after eval + const auto token_org = tokens[batch_start]; + + // add BOS token for the first batch of each chunk + if (add_bos && j == 0) { + tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); + } + + // TODO: use batch.logits to save computations instead of relying on logits_all == true + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + + // restore the original token in case it was set to BOS + tokens[batch_start] = token_org; + + if (compute_ppl && num_batches > 1) { + const auto * batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + } + } + + const auto t_end = std::chrono::high_resolution_clock::now(); + + if (i == 0) { + const float t_total = std::chrono::duration(t_end - t_start).count(); + fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + int total_seconds = (int)(t_total * n_chunk); + if (total_seconds >= 60*60) { + fprintf(stderr, "%d hours ", total_seconds / (60*60)); + total_seconds = total_seconds % (60*60); + } + fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + } + + if (compute_ppl) { + const int first = n_ctx/2; + const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); + process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); + count += n_ctx - first - 1; + + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + fflush(stdout); + + logits.clear(); + } + } + printf("\n"); + + if (compute_ppl) { + nll2 /= count; + nll /= count; + const double ppl = exp(nll); + nll2 -= nll * nll; + if (nll2 > 0) { + nll2 = sqrt(nll2/(count-1)); + printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + } else { + printf("Unexpected negative standard deviation of log(prob)\n"); + } + } + + return true; +} + +int main(int argc, char ** argv) { + + StatParams sparams; + std::string prev_result_file; + std::string combine_files; + bool compute_ppl = true; + int from_chunk = 0; + std::vector args; + args.push_back(argv[0]); + int iarg = 1; + for (; iarg < argc-1; ++iarg) { + std::string arg{argv[iarg]}; + if (arg == "-o" || arg == "--output-file") { + sparams.ofile = argv[++iarg]; + } + else if (arg == "-ofreq" || arg == "--output-frequency") { + sparams.n_output_frequency = std::stoi(argv[++iarg]); + } + else if (arg == "-ow" || arg == "--output-weight") { + sparams.collect_output_weight = std::stoi(argv[++iarg]); + } + else if (arg == "--verbosity") { + sparams.verbosity = std::stoi(argv[++iarg]); + } else if (arg == "--no-ppl") { + compute_ppl = false; + } else if (arg == "--keep-imatrix") { + sparams.keep_every = std::stoi(argv[++iarg]); + } else if (arg == "--continue-from") { + prev_result_file = argv[++iarg]; + } else if (arg == "--combine") { + combine_files = argv[++iarg]; + } + else if (arg == "--from-chunk") { + from_chunk = std::stoi(argv[++iarg]); + } else { + args.push_back(argv[iarg]); + } + } + if (iarg < argc) { + std::string arg{argv[iarg]}; + if (arg == "--no-ppl") { + compute_ppl = false; + } else { + args.push_back(argv[iarg]); + } + } + + g_collector.set_parameters(std::move(sparams)); + + if (!combine_files.empty()) { + std::vector files; + size_t pos = 0; + while (true) { + auto new_pos = combine_files.find(',', pos); + if (new_pos != std::string::npos) { + files.emplace_back(combine_files.substr(pos, new_pos - pos)); + pos = new_pos + 1; + } else { + files.emplace_back(combine_files.substr(pos)); + break; + } + } + if (files.size() < 2) { + fprintf(stderr, "You must provide at least two comma separated files to use --combine\n"); + return 1; + } + printf("Combining the following %d files\n", int(files.size())); + for (auto& file : files) { + printf(" %s\n", file.c_str()); + if (!g_collector.load_imatrix(file.c_str(), true)) { + fprintf(stderr, "Failed to load %s\n", file.c_str()); + return 1; + } + } + g_collector.save_imatrix(); + return 0; + } + + if (!prev_result_file.empty()) { + if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) { + fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str()); + return 1; + } + } + + gpt_params params; + params.n_batch = 512; + if (!gpt_params_parse(args.size(), args.data(), params)) { + return 1; + } + + params.logits_all = true; + params.n_batch = std::min(params.n_batch, params.n_ctx); + + print_build_info(); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_backend_init(); + llama_numa_init(params.numa); + + llama_model_params mparams = llama_model_params_from_gpt_params(params); + + llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); + if (model == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return 1; + } + + llama_context_params cparams = llama_context_params_from_gpt_params(params); + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + cparams.cb_eval = ik_collect_imatrix; + cparams.cb_eval_user_data = NULL; + + llama_context * ctx = llama_new_context_with_model(model, cparams); + if (ctx == NULL) { + fprintf(stderr, "%s: error: unable to create context\n", __func__); + return 1; + } + + const int n_ctx_train = llama_n_ctx_train(model); + if (params.n_ctx > n_ctx_train) { + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, params.n_ctx); + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", get_system_info(params).c_str()); + } + + bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk); + if (!OK) { + return 1; + } + + g_collector.save_imatrix(); + + llama_print_timings(ctx); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + return 0; +} diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 4a7827876..91c39c5ae 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -202,7 +202,8 @@ int main(int argc, char ** argv) { std::mt19937 rng(params.seed); LOG("%s: llama backend init\n", __func__); - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); llama_model * model; llama_context * ctx; @@ -241,7 +242,7 @@ int main(int argc, char ** argv) { LOG("add_bos: %d\n", add_bos); bool suff_rm_leading_spc = params.escape; - if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { + if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { params.input_suffix.erase(0, 1); suff_rm_leading_spc = false; } @@ -377,10 +378,10 @@ int main(int argc, char ** argv) { if (params.interactive) { const char *control_message; if (params.multiline_input) { - control_message = " - To return control to LLaMa, end your input with '\\'.\n" + control_message = " - To return control to LLaMA, end your input with '\\'.\n" " - To return control without starting a new line, end your input with '/'.\n"; } else { - control_message = " - Press Return to return control to LLaMa.\n" + control_message = " - Press Return to return control to LLaMA.\n" " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } @@ -446,8 +447,8 @@ int main(int argc, char ** argv) { LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/json-schema-pydantic-example.py b/examples/json-schema-pydantic-example.py new file mode 100644 index 000000000..69ebfd409 --- /dev/null +++ b/examples/json-schema-pydantic-example.py @@ -0,0 +1,74 @@ +# Usage: +#! ./server -m some-model.gguf & +#! pip install pydantic +#! python json-schema-pydantic-example.py + +from pydantic import BaseModel, TypeAdapter +from annotated_types import MinLen +from typing import Annotated, List, Optional +import json, requests + +if True: + + def create_completion(*, response_model=None, endpoint="http://localhost:8080/v1/chat/completions", messages, **kwargs): + ''' + Creates a chat completion using an OpenAI-compatible endpoint w/ JSON schema support + (llama.cpp server, llama-cpp-python, Anyscale / Together...) + + The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below) + ''' + if response_model: + type_adapter = TypeAdapter(response_model) + schema = type_adapter.json_schema() + messages = [{ + "role": "system", + "content": f"You respond in JSON format with the following schema: {json.dumps(schema, indent=2)}" + }] + messages + response_format={"type": "json_object", "schema": schema} + + data = requests.post(endpoint, headers={"Content-Type": "application/json"}, + json=dict(messages=messages, response_format=response_format, **kwargs)).json() + if 'error' in data: + raise Exception(data['error']['message']) + + content = data["choices"][0]["message"]["content"] + return type_adapter.validate_json(content) if type_adapter else content + +else: + + # This alternative branch uses Instructor + OpenAI client lib. + # Instructor support streamed iterable responses, retry & more. + # (see https://python.useinstructor.com/) + #! pip install instructor openai + import instructor, openai + client = instructor.patch( + openai.OpenAI(api_key="123", base_url="http://localhost:8080"), + mode=instructor.Mode.JSON_SCHEMA) + create_completion = client.chat.completions.create + + +if __name__ == '__main__': + + class QAPair(BaseModel): + question: str + concise_answer: str + justification: str + + class PyramidalSummary(BaseModel): + title: str + summary: str + question_answers: Annotated[List[QAPair], MinLen(2)] + sub_sections: Optional[Annotated[List['PyramidalSummary'], MinLen(2)]] + + print("# Summary\n", create_completion( + model="...", + response_model=PyramidalSummary, + messages=[{ + "role": "user", + "content": f""" + You are a highly efficient corporate document summarizer. + Create a pyramidal summary of an imaginary internal document about our company processes + (starting high-level, going down to each sub sections). + Keep questions short, and answers even shorter (trivia / quizz style). + """ + }])) diff --git a/examples/json-schema-to-grammar.py b/examples/json-schema-to-grammar.py index 2a4cb65bc..91dd734cc 100755 --- a/examples/json-schema-to-grammar.py +++ b/examples/json-schema-to-grammar.py @@ -1,8 +1,10 @@ #!/usr/bin/env python3 import argparse +import itertools import json import re import sys +from typing import Any, Dict, List, Set, Tuple, Union # whitespace is constrained to a single space char to prevent model "running away" in # whitespace. Also maybe improves generation quality? @@ -12,26 +14,54 @@ PRIMITIVE_RULES = { 'boolean': '("true" | "false") space', 'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space', 'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space', + 'value' : 'object | array | string | number | boolean', + 'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', + 'array' : '"[" space ( value ("," space value)* )? "]" space', + 'uuid' : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space', 'string': r''' "\"" ( [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) - )* "\"" space ''', + )* "\"" space''', 'null': '"null" space', } +OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value'] + +# TODO: support "uri", "email" string formats +DATE_RULES = { + 'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', + 'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', + 'date-time': 'date "T" time', + 'date-string': '"\\"" date "\\"" space', + 'time-string': '"\\"" time "\\"" space', + 'date-time-string': '"\\"" date-time "\\"" space', +} + +RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()]) INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+') GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]') -GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'} +GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]') +GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'} +NON_LITERAL_SET = set('|.()[]{}*+?') +ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?') + +DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])' +TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits class SchemaConverter: - def __init__(self, prop_order): + def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): self._prop_order = prop_order + self._allow_fetch = allow_fetch + self._dotall = dotall + self._raw_pattern = raw_pattern self._rules = {'space': SPACE_RULE} + self._refs = {} + self._refs_being_resolved = set() def _format_literal(self, literal): escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub( - lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal) + lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal ) return f'"{escaped}"' @@ -41,64 +71,420 @@ class SchemaConverter: key = esc_name else: i = 0 - while f'{esc_name}{i}' in self._rules: + while f'{esc_name}{i}' in self._rules and self._rules[f'{esc_name}{i}'] != rule: i += 1 key = f'{esc_name}{i}' self._rules[key] = rule return key + def resolve_refs(self, schema: dict, url: str): + ''' + Resolves all $ref fields in the given schema, fetching any remote schemas, + replacing $ref with absolute reference URL and populating self._refs with the + respective referenced (sub)schema dictionaries. + ''' + def visit(n: dict): + if isinstance(n, list): + return [visit(x) for x in n] + elif isinstance(n, dict): + ref = n.get('$ref') + if ref is not None and ref not in self._refs: + if ref.startswith('https://'): + assert self._allow_fetch, 'Fetching remote schemas is not allowed (use --allow-fetch for force)' + import requests + + frag_split = ref.split('#') + base_url = frag_split[0] + + target = self._refs.get(base_url) + if target is None: + target = self.resolve_refs(requests.get(ref).json(), base_url) + self._refs[base_url] = target + + if len(frag_split) == 1 or frag_split[-1] == '': + return target + elif ref.startswith('#/'): + target = schema + ref = f'{url}{ref}' + n['$ref'] = ref + else: + raise ValueError(f'Unsupported ref {ref}') + + for sel in ref.split('#')[-1].split('/')[1:]: + assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}' + target = target[sel] + + self._refs[ref] = target + else: + for v in n.values(): + visit(v) + + return n + return visit(schema) + + def _generate_union_rule(self, name, alt_schemas): + return ' | '.join(( + self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}') + for i, alt_schema in enumerate(alt_schemas) + )) + + def _visit_pattern(self, pattern, name): + ''' + Transforms a regular expression pattern into a GBNF rule. + + Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions + Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md + + Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers. + + Mostly a 1:1 translation, except for {x} / {x,} / {x,y} quantifiers for which + we define sub-rules to keep the output lean. + ''' + + assert pattern.startswith('^') and pattern.endswith('$'), 'Pattern must start with "^" and end with "$"' + pattern = pattern[1:-1] + sub_rule_ids = {} + + i = 0 + length = len(pattern) + + def to_rule(s: Tuple[str, bool]) -> str: + (txt, is_literal) = s + return "\"" + txt + "\"" if is_literal else txt + + def transform() -> Tuple[str, bool]: + ''' + Parse a unit at index i (advancing it), and return its string representation + whether it's a literal. + ''' + nonlocal i + nonlocal pattern + nonlocal sub_rule_ids + + start = i + # For each component of this sequence, store its string representation and whether it's a literal. + # We only need a flat structure here to apply repetition operators to the last item, and + # to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially + # (GBNF's syntax is luckily very close to regular expressions!) + seq: list[Tuple[str, bool]] = [] + + def get_dot(): + if self._dotall: + rule = '[\\U00000000-\\U0010FFFF]' + else: + # Accept any character... except \n and \r line break chars (\x0A and \xOD) + rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]' + return self._add_rule(f'dot', rule) + + def join_seq(): + nonlocal seq + ret = [] + for is_literal, g in itertools.groupby(seq, lambda x: x[1]): + if is_literal: + ret.append((''.join(x[0] for x in g), True)) + else: + ret.extend(g) + if len(ret) == 1: + return ret[0] + return (' '.join(to_rule(x) for x in seq), False) + + while i < length: + c = pattern[i] + if c == '.': + seq.append((get_dot(), False)) + i += 1 + elif c == '(': + i += 1 + if i < length: + assert pattern[i] != '?', f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/' + seq.append((f'({to_rule(transform())})', False)) + elif c == ')': + i += 1 + assert start > 0 and pattern[start-1] == '(', f'Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}' + return join_seq() + elif c == '[': + square_brackets = c + i += 1 + while i < length and pattern[i] != ']': + if pattern[i] == '\\': + square_brackets += pattern[i:i+2] + i += 2 + else: + square_brackets += pattern[i] + i += 1 + assert i < length, f'Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}' + square_brackets += ']' + i += 1 + seq.append((square_brackets, False)) + elif c == '|': + seq.append(('|', False)) + i += 1 + elif c in ('*', '+', '?'): + seq[-1] = (to_rule(seq[-1]) + c, False) + i += 1 + elif c == '{': + curly_brackets = c + i += 1 + while i < length and pattern[i] != '}': + curly_brackets += pattern[i] + i += 1 + assert i < length, f'Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}' + curly_brackets += '}' + i += 1 + nums = [s.strip() for s in curly_brackets[1:-1].split(',')] + min_times = 0 + max_times = None + try: + if len(nums) == 1: + min_times = int(nums[0]) + max_times = min_times + else: + assert len(nums) == 2 + min_times = int(nums[0]) if nums[0] else 0 + max_times = int(nums[1]) if nums[1] else None + except ValueError: + raise ValueError(f'Invalid quantifier {curly_brackets} in /{pattern}/') + + (sub, sub_is_literal) = seq[-1] + + if min_times == 0 and max_times is None: + seq[-1] = (f'{sub}*', False) + elif min_times == 0 and max_times == 1: + seq[-1] = (f'{sub}?', False) + elif min_times == 1 and max_times is None: + seq[-1] = (f'{sub}+', False) + else: + if not sub_is_literal: + id = sub_rule_ids.get(sub) + if id is None: + id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub) + sub_rule_ids[sub] = id + sub = id + + seq[-1] = ( + ' '.join( + ([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) + + ([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])), + False + ) + else: + literal = '' + while i < length: + if pattern[i] == '\\' and i < length - 1: + next = pattern[i + 1] + if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS: + i += 1 + literal += pattern[i] + i += 1 + else: + literal += pattern[i:i+2] + i += 2 + elif pattern[i] == '"' and not self._raw_pattern: + literal += '\\"' + i += 1 + elif pattern[i] not in NON_LITERAL_SET and \ + (i == length - 1 or literal == '' or pattern[i+1] == '.' or pattern[i+1] not in NON_LITERAL_SET): + literal += pattern[i] + i += 1 + else: + break + if literal: + seq.append((literal, True)) + + return join_seq() + + return self._add_rule( + name, + to_rule(transform()) if self._raw_pattern \ + else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space") + + + def _resolve_ref(self, ref): + ref_name = ref.split('/')[-1] + if ref_name not in self._rules and ref not in self._refs_being_resolved: + self._refs_being_resolved.add(ref) + resolved = self._refs[ref] + ref_name = self.visit(resolved, ref_name) + self._refs_being_resolved.remove(ref) + return ref_name + + def _generate_constant_rule(self, value): + return self._format_literal(json.dumps(value)) + def visit(self, schema, name): schema_type = schema.get('type') - rule_name = name or 'root' + schema_format = schema.get('format') + rule_name = name + '-' if name in RESERVED_NAMES else name or 'root' - if 'oneOf' in schema or 'anyOf' in schema: - rule = ' | '.join(( - self.visit(alt_schema, f'{name}{"-" if name else ""}{i}') - for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf']) - )) - return self._add_rule(rule_name, rule) + if (ref := schema.get('$ref')) is not None: + return self._add_rule(rule_name, self._resolve_ref(ref)) + + elif 'oneOf' in schema or 'anyOf' in schema: + return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf'])) + + elif isinstance(schema_type, list): + return self._add_rule(rule_name, self._generate_union_rule(name, [{'type': t} for t in schema_type])) elif 'const' in schema: - return self._add_rule(rule_name, self._format_literal(schema['const'])) + return self._add_rule(rule_name, self._generate_constant_rule(schema['const'])) elif 'enum' in schema: - rule = ' | '.join((self._format_literal(v) for v in schema['enum'])) + rule = ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) return self._add_rule(rule_name, rule) - elif schema_type == 'object' and 'properties' in schema: - # TODO: `required` keyword - prop_order = self._prop_order - prop_pairs = sorted( - schema['properties'].items(), - # sort by position in prop_order (if specified) then by key - key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]), + elif schema_type in (None, 'object') and \ + ('properties' in schema or \ + ('additionalProperties' in schema and schema['additionalProperties'] is not True)): + required = set(schema.get('required', [])) + properties = list(schema.get('properties', {}).items()) + return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties'))) + + elif schema_type in (None, 'object') and 'allOf' in schema: + required = set() + properties = [] + hybrid_name = name + def add_component(comp_schema, is_required): + if (ref := comp_schema.get('$ref')) is not None: + comp_schema = self._refs[ref] + + if 'properties' in comp_schema: + for prop_name, prop_schema in comp_schema['properties'].items(): + properties.append((prop_name, prop_schema)) + if is_required: + required.add(prop_name) + + for t in schema['allOf']: + if 'anyOf' in t: + for tt in t['anyOf']: + add_component(tt, is_required=False) + else: + add_component(t, is_required=True) + + return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[])) + + elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema): + items = schema.get('items') or schema['prefixItems'] + if isinstance(items, list): + return self._add_rule( + rule_name, + '"[" space ' + + ' "," space '.join( + self.visit(item, f'{name}{"-" if name else ""}tuple-{i}') + for i, item in enumerate(items)) + + ' "]" space') + else: + item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item') + list_item_operator = f'( "," space {item_rule_name} )' + successive_items = "" + min_items = schema.get("minItems", 0) + max_items = schema.get("maxItems") + if min_items > 0: + successive_items = list_item_operator * (min_items - 1) + min_items -= 1 + if max_items is not None and max_items > min_items: + successive_items += (list_item_operator + "?") * (max_items - min_items - 1) + else: + successive_items += list_item_operator + "*" + if min_items == 0: + rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space' + else: + rule = f'"[" space {item_rule_name} {successive_items} "]" space' + return self._add_rule(rule_name, rule) + + elif schema_type in (None, 'string') and 'pattern' in schema: + return self._visit_pattern(schema['pattern'], rule_name) + + elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''): + return self._add_rule( + 'root' if rule_name == 'root' else schema_format, + PRIMITIVE_RULES['uuid'] ) - rule = '"{" space' - for i, (prop_name, prop_schema) in enumerate(prop_pairs): - prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}') - if i > 0: - rule += ' "," space' - rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}' - rule += ' "}" space' + elif schema_type in (None, 'string') and schema_format in DATE_RULES: + for t, r in DATE_RULES.items(): + self._add_rule(t, r) + return schema_format + '-string' - return self._add_rule(rule_name, rule) - - elif schema_type == 'array' and 'items' in schema: - # TODO `prefixItems` keyword - item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item') - rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space' - return self._add_rule(rule_name, rule) + elif (schema_type == 'object') or (len(schema) == 0): + for n in OBJECT_RULE_NAMES: + self._add_rule(n, PRIMITIVE_RULES[n]) + return self._add_rule(rule_name, 'object') else: assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}' + # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero return self._add_rule( 'root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type] ) + def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]): + prop_order = self._prop_order + # sort by position in prop_order (if specified) then by original order + sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))] + + prop_kv_rule_names = {} + for prop_name, prop_schema in properties: + prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}') + prop_kv_rule_names[prop_name] = self._add_rule( + f'{name}{"-" if name else ""}{prop_name}-kv', + fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}' + ) + required_props = [k for k in sorted_props if k in required] + optional_props = [k for k in sorted_props if k not in required] + + if additional_properties == True or isinstance(additional_properties, dict): + sub_name = f'{name}{"-" if name else ""}additional' + value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value') + prop_kv_rule_names["*"] = self._add_rule( + f'{sub_name}-kv', + self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}' + ) + optional_props.append("*") + + rule = '"{" space ' + rule += ' "," space '.join(prop_kv_rule_names[k] for k in required_props) + + if optional_props: + rule += ' (' + if required_props: + rule += ' "," space ( ' + + def get_recursive_refs(ks, first_is_optional): + [k, *rest] = ks + kv_rule_name = prop_kv_rule_names[k] + if k == '*': + res = self._add_rule( + f'{name}{"-" if name else ""}additional-kvs', + f'{kv_rule_name} ( "," space ' + kv_rule_name + ' )*' + ) + elif first_is_optional: + res = f'( "," space {kv_rule_name} )?' + else: + res = kv_rule_name + if len(rest) > 0: + res += ' ' + self._add_rule( + f'{name}{"-" if name else ""}{k}-rest', + get_recursive_refs(rest, first_is_optional=True) + ) + return res + + rule += ' | '.join( + get_recursive_refs(optional_props[i:], first_is_optional=False) + for i in range(len(optional_props)) + ) + if required_props: + rule += ' )' + rule += ' )?' + + rule += ' "}" space' + + return rule + def format_grammar(self): - return '\n'.join((f'{name} ::= {rule}' for name, rule in self._rules.items())) + return '\n'.join( + f'{name} ::= {rule}' + for name, rule in sorted(self._rules.items(), key=lambda kv: kv[0]) + ) def main(args_in = None): @@ -115,16 +501,47 @@ def main(args_in = None): type=lambda s: s.split(','), help=''' comma-separated property names defining the order of precedence for object properties; - properties not specified here are given lower precedence than those that are, and are - sorted alphabetically + properties not specified here are given lower precedence than those that are, and + are kept in their original order from the schema. Required properties are always + given precedence over optional properties. ''' ) + parser.add_argument( + '--allow-fetch', + action='store_true', + default=False, + help='Whether to allow fetching referenced schemas over HTTPS') + parser.add_argument( + '--dotall', + action='store_true', + default=False, + help='Whether to treat dot (".") as matching all chars including line breaks in regular expression patterns') + parser.add_argument( + '--raw-pattern', + action='store_true', + default=False, + help='Treats string patterns as raw patterns w/o quotes (or quote escapes)') + parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)') args = parser.parse_args(args_in) - schema = json.load(sys.stdin if args.schema == '-' else open(args.schema)) - prop_order = {name: idx for idx, name in enumerate(args.prop_order)} - converter = SchemaConverter(prop_order) + if args.schema.startswith('https://'): + url = args.schema + import requests + schema = requests.get(url).json() + elif args.schema == '-': + url = 'stdin' + schema = json.load(sys.stdin) + else: + url = f'file://{args.schema}' + with open(args.schema) as f: + schema = json.load(f) + converter = SchemaConverter( + prop_order={name: idx for idx, name in enumerate(args.prop_order)}, + allow_fetch=args.allow_fetch, + dotall=args.dotall, + raw_pattern=args.raw_pattern) + schema = converter.resolve_refs(schema, url) converter.visit(schema, '') print(converter.format_grammar()) diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index d02824bfa..10f37b441 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -23,19 +23,22 @@ usage: ./llama-bench [options] options: -h, --help - -m, --model (default: models/7B/ggml-model-q4_0.gguf) - -p, --n-prompt (default: 512) - -n, --n-gen (default: 128) - -b, --batch-size (default: 512) - --memory-f32 <0|1> (default: 0) - -t, --threads (default: 16) - -ngl N, --n-gpu-layers (default: 99) - -mg i, --main-gpu (default: 0) - -mmq, --mul-mat-q <0|1> (default: 1) - -ts, --tensor_split - -r, --repetitions (default: 5) - -o, --output (default: md) - -v, --verbose (default: 0) + -m, --model (default: models/7B/ggml-model-q4_0.gguf) + -p, --n-prompt (default: 512) + -n, --n-gen (default: 128) + -b, --batch-size (default: 512) + -ctk , --cache-type-k (default: f16) + -ctv , --cache-type-v (default: f16) + -t, --threads (default: 112) + -ngl, --n-gpu-layers (default: 99) + -sm, --split-mode (default: layer) + -mg, --main-gpu (default: 0) + -nkvo, --no-kv-offload <0|1> (default: 0) + -mmp, --mmap <0|1> (default: 1) + -ts, --tensor_split (default: 0) + -r, --repetitions (default: 5) + -o, --output (default: md) + -v, --verbose (default: 0) Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. ``` @@ -51,6 +54,10 @@ Each test is repeated the number of times given by `-r`, and the results are ave For a description of the other options, see the [main example](../main/README.md). +Note: + +- When using SYCL backend, there would be hang issue in some cases. Please set `--mmp 0`. + ## Examples ### Text generation with different models diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 7f7186cde..27e113203 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,7 @@ #include "llama.h" #include "common.h" #include "ggml-cuda.h" +#include "ggml-sycl.h" // utils static uint64_t get_time_ns() { @@ -102,6 +104,7 @@ static std::string get_cpu_info() { } } } + fclose(f); } #endif // TODO: other platforms @@ -110,11 +113,22 @@ static std::string get_cpu_info() { static std::string get_gpu_info() { std::string id; -#ifdef GGML_USE_CUBLAS - int count = ggml_cuda_get_device_count(); +#ifdef GGML_USE_CUDA + int count = ggml_backend_cuda_get_device_count(); for (int i = 0; i < count; i++) { char buf[128]; - ggml_cuda_get_device_description(i, buf, sizeof(buf)); + ggml_backend_cuda_get_device_description(i, buf, sizeof(buf)); + id += buf; + if (i < count - 1) { + id += "/"; + } + } +#endif +#ifdef GGML_USE_SYCL + int count = ggml_backend_sycl_get_device_count(); + for (int i = 0; i < count; i++) { + char buf[128]; + ggml_sycl_get_device_description(i, buf, sizeof(buf)); id += buf; if (i < count - 1) { id += "/"; @@ -128,19 +142,41 @@ static std::string get_gpu_info() { // command line params enum output_formats {CSV, JSON, MARKDOWN, SQL}; +static const char * output_format_str(output_formats format) { + switch (format) { + case CSV: return "csv"; + case JSON: return "json"; + case MARKDOWN: return "md"; + case SQL: return "sql"; + default: GGML_ASSERT(!"invalid output format"); + } +} + +static const char * split_mode_str(llama_split_mode mode) { + switch (mode) { + case LLAMA_SPLIT_MODE_NONE: return "none"; + case LLAMA_SPLIT_MODE_LAYER: return "layer"; + case LLAMA_SPLIT_MODE_ROW: return "row"; + default: GGML_ASSERT(!"invalid split mode"); + } +} + struct cmd_params { std::vector model; std::vector n_prompt; std::vector n_gen; std::vector n_batch; + std::vector n_ubatch; std::vector type_k; std::vector type_v; std::vector n_threads; std::vector n_gpu_layers; + std::vector split_mode; std::vector main_gpu; std::vector no_kv_offload; - std::vector mul_mat_q; - std::vector> tensor_split; + std::vector> tensor_split; + std::vector use_mmap; + std::vector embeddings; int reps; bool verbose; output_formats output_format; @@ -150,15 +186,18 @@ static const cmd_params cmd_params_defaults = { /* model */ {"models/7B/ggml-model-q4_0.gguf"}, /* n_prompt */ {512}, /* n_gen */ {128}, - /* n_batch */ {512}, + /* n_batch */ {2048}, + /* n_ubatch */ {512}, /* type_k */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, + /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, - /* mul_mat_q */ {true}, - /* tensor_split */ {{}}, + /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, + /* use_mmap */ {true}, + /* embeddings */ {false}, /* reps */ 5, /* verbose */ false, /* output_format */ MARKDOWN @@ -169,21 +208,24 @@ static void print_usage(int /* argc */, char ** argv) { printf("\n"); printf("options:\n"); printf(" -h, --help\n"); - printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); - printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); - printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); - printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" -ctk , --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); - printf(" -ctv , --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); - printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); - printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); - printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); - printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); - printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); - printf(" -ts, --tensor_split \n"); - printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); - printf(" -o, --output (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql"); - printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); + printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); + printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); + printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); + printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" -ub N, --ubatch-size (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); + printf(" -ctk , --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv , --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); + printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); + printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); + printf(" -ts, --tensor-split (default: 0)\n"); + printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); + printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); + printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf("\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); } @@ -207,6 +249,9 @@ static ggml_type ggml_type_from_name(const std::string & s) { if (s == "q5_1") { return GGML_TYPE_Q5_1; } + if (s == "iq4_nl") { + return GGML_TYPE_IQ4_NL; + } return GGML_TYPE_COUNT; } @@ -260,6 +305,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); + } else if (arg == "-ub" || arg == "--ubatch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end()); } else if (arg == "-ctk" || arg == "--cache-type-k") { if (++i >= argc) { invalid_param = true; @@ -306,6 +358,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); + } else if (arg == "-sm" || arg == "--split-mode") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + std::vector modes; + for (const auto & m : p) { + llama_split_mode mode; + if (m == "none") { + mode = LLAMA_SPLIT_MODE_NONE; + } else if (m == "layer") { + mode = LLAMA_SPLIT_MODE_LAYER; + } else if (m == "row") { + mode = LLAMA_SPLIT_MODE_ROW; + } else { + invalid_param = true; + break; + } + modes.push_back(mode); + } + params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); } else if (arg == "-mg" || arg == "--main-gpu") { if (++i >= argc) { invalid_param = true; @@ -319,13 +393,20 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); - } else if (arg == "-mmq" || arg == "--mul-mat-q") { + } else if (arg == "-mmp" || arg == "--mmap") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); - params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end()); + params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); + } else if (arg == "-embd" || arg == "--embeddings") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); } else if (arg == "-ts" || arg == "--tensor-split") { if (++i >= argc) { invalid_param = true; @@ -336,10 +417,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { const std::regex regex{R"([;/]+)"}; std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + GGML_ASSERT(split_arg.size() <= llama_max_devices()); - std::array tensor_split; - for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { + std::vector tensor_split(llama_max_devices()); + for (size_t i = 0; i < llama_max_devices(); ++i) { if (i < split_arg.size()) { tensor_split[i] = std::stof(split_arg[i]); } else { @@ -389,13 +470,16 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } + if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; } if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } + if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } - if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } + if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } + if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } return params; @@ -406,21 +490,26 @@ struct cmd_params_instance { int n_prompt; int n_gen; int n_batch; + int n_ubatch; ggml_type type_k; ggml_type type_v; int n_threads; int n_gpu_layers; + llama_split_mode split_mode; int main_gpu; bool no_kv_offload; - bool mul_mat_q; - std::array tensor_split; + std::vector tensor_split; + bool use_mmap; + bool embeddings; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); mparams.n_gpu_layers = n_gpu_layers; + mparams.split_mode = split_mode; mparams.main_gpu = main_gpu; mparams.tensor_split = tensor_split.data(); + mparams.use_mmap = use_mmap; return mparams; } @@ -428,7 +517,9 @@ struct cmd_params_instance { bool equal_mparams(const cmd_params_instance & other) const { return model == other.model && n_gpu_layers == other.n_gpu_layers && + split_mode == other.split_mode && main_gpu == other.main_gpu && + use_mmap == other.use_mmap && tensor_split == other.tensor_split; } @@ -437,60 +528,31 @@ struct cmd_params_instance { cparams.n_ctx = n_prompt + n_gen; cparams.n_batch = n_batch; + cparams.n_ubatch = n_ubatch; cparams.type_k = type_k; cparams.type_v = type_v; - cparams.mul_mat_q = mul_mat_q; cparams.offload_kqv = !no_kv_offload; + cparams.embeddings = embeddings; return cparams; } }; -static std::vector get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) { - std::vector instances; - - for (const auto & m : params.model) - for (const auto & nl : params.n_gpu_layers) - for (const auto & mg : params.main_gpu) - for (const auto & ts : params.tensor_split) - for (const auto & nb : params.n_batch) - for (const auto & tk : params.type_k) - for (const auto & tv : params.type_v) - for (const auto & mmq : params.mul_mat_q) - for (const auto & nkvo : params.no_kv_offload) - for (const auto & nt : params.n_threads) { - cmd_params_instance instance = { - /* .model = */ m, - /* .n_prompt = */ n_prompt, - /* .n_gen = */ n_gen, - /* .n_batch = */ nb, - /* .type_k = */ tk, - /* .type_v = */ tv, - /* .n_threads = */ nt, - /* .n_gpu_layers = */ nl, - /* .main_gpu = */ mg, - /* .no_kv_offload= */ nkvo, - /* .mul_mat_q = */ mmq, - /* .tensor_split = */ ts, - }; - instances.push_back(instance); - } - return instances; -} - static std::vector get_cmd_params_instances(const cmd_params & params) { std::vector instances; -#if 1 // this ordering minimizes the number of times that each model needs to be reloaded for (const auto & m : params.model) for (const auto & nl : params.n_gpu_layers) + for (const auto & sm : params.split_mode) for (const auto & mg : params.main_gpu) for (const auto & ts : params.tensor_split) + for (const auto & mmp : params.use_mmap) + for (const auto & embd : params.embeddings) for (const auto & nb : params.n_batch) + for (const auto & nub : params.n_ubatch) for (const auto & tk : params.type_k) for (const auto & tv : params.type_v) - for (const auto & mmq : params.mul_mat_q) for (const auto & nkvo : params.no_kv_offload) for (const auto & nt : params.n_threads) { for (const auto & n_prompt : params.n_prompt) { @@ -502,14 +564,17 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_prompt = */ n_prompt, /* .n_gen = */ 0, /* .n_batch = */ nb, + /* .n_ubatch = */ nub, /* .type_k = */ tk, /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, + /* .split_mode = */ sm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, - /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, + /* .use_mmap = */ mmp, + /* .embeddings = */ embd, }; instances.push_back(instance); } @@ -523,36 +588,21 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_prompt = */ 0, /* .n_gen = */ n_gen, /* .n_batch = */ nb, + /* .n_ubatch = */ nub, /* .type_k = */ tk, /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, + /* .split_mode = */ sm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, - /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, + /* .use_mmap = */ mmp, + /* .embeddings = */ embd, }; instances.push_back(instance); } } -#else - // this ordering separates the prompt and generation tests - for (const auto & n_prompt : params.n_prompt) { - if (n_prompt == 0) { - continue; - } - auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt); - instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end()); - } - - for (const auto & n_gen : params.n_gen) { - if (n_gen == 0) { - continue; - } - auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0); - instances.insert(instances.end(), instances_gen.begin(), instances_gen.end()); - } -#endif return instances; } @@ -562,7 +612,10 @@ struct test { static const int build_number; static const bool cuda; static const bool opencl; + static const bool vulkan; + static const bool kompute; static const bool metal; + static const bool sycl; static const bool gpu_blas; static const bool blas; static const std::string cpu_info; @@ -572,14 +625,17 @@ struct test { uint64_t model_size; uint64_t model_n_params; int n_batch; + int n_ubatch; int n_threads; ggml_type type_k; ggml_type type_v; int n_gpu_layers; + llama_split_mode split_mode; int main_gpu; bool no_kv_offload; - bool mul_mat_q; - std::array tensor_split; + std::vector tensor_split; + bool use_mmap; + bool embeddings; int n_prompt; int n_gen; std::string test_time; @@ -593,14 +649,17 @@ struct test { model_size = llama_model_size(lmodel); model_n_params = llama_model_n_params(lmodel); n_batch = inst.n_batch; + n_ubatch = inst.n_ubatch; n_threads = inst.n_threads; type_k = inst.type_k; type_v = inst.type_v; n_gpu_layers = inst.n_gpu_layers; + split_mode = inst.split_mode; main_gpu = inst.main_gpu; no_kv_offload = inst.no_kv_offload; - mul_mat_q = inst.mul_mat_q; tensor_split = inst.tensor_split; + use_mmap = inst.use_mmap; + embeddings = inst.embeddings; n_prompt = inst.n_prompt; n_gen = inst.n_gen; // RFC 3339 date-time format @@ -641,27 +700,39 @@ struct test { if (opencl) { return "OpenCL"; } + if (vulkan) { + return "Vulkan"; + } + if (kompute) { + return "Kompute"; + } if (metal) { return "Metal"; } + if (sycl) { + return GGML_SYCL_NAME; + } if (gpu_blas) { return "GPU BLAS"; } if (blas) { return "BLAS"; } + return "CPU"; } static const std::vector & get_fields() { static const std::vector fields = { "build_commit", "build_number", - "cuda", "opencl", "metal", "gpu_blas", "blas", + "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", - "n_batch", "n_threads", "type_k", "type_v", - "n_gpu_layers", "main_gpu", "no_kv_offload", - "mul_mat_q", "tensor_split", + "n_batch", "n_ubatch", + "n_threads", "type_k", "type_v", + "n_gpu_layers", "split_mode", + "main_gpu", "no_kv_offload", + "tensor_split", "use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" @@ -672,15 +743,17 @@ struct test { enum field_type {STRING, BOOL, INT, FLOAT}; static field_type get_field_type(const std::string & field) { - if (field == "build_number" || field == "n_batch" || field == "n_threads" || + if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || + field == "n_threads" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" || field == "stddev_ns") { return INT; } - if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" || - field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") { + if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" || + field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || + field == "use_mmap" || field == "embeddings") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -692,7 +765,7 @@ struct test { std::vector get_values() const { std::string tensor_split_str; int max_nonzero = 0; - for (int i = 0; i < LLAMA_MAX_DEVICES; i++) { + for (size_t i = 0; i < llama_max_devices(); i++) { if (tensor_split[i] > 0) { max_nonzero = i; } @@ -707,12 +780,15 @@ struct test { } std::vector values = { build_commit, std::to_string(build_number), - std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), + std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan), + std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas), cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), - std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), - std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(no_kv_offload), - std::to_string(mul_mat_q), tensor_split_str, + std::to_string(n_batch), std::to_string(n_ubatch), + std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), + std::to_string(n_gpu_layers), split_mode_str(split_mode), + std::to_string(main_gpu), std::to_string(no_kv_offload), + tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ts()), std::to_string(stdev_ts()) @@ -732,11 +808,14 @@ struct test { const std::string test::build_commit = LLAMA_COMMIT; const int test::build_number = LLAMA_BUILD_NUMBER; -const bool test::cuda = !!ggml_cpu_has_cublas(); +const bool test::cuda = !!ggml_cpu_has_cuda(); const bool test::opencl = !!ggml_cpu_has_clblast(); +const bool test::vulkan = !!ggml_cpu_has_vulkan(); +const bool test::kompute = !!ggml_cpu_has_kompute(); const bool test::metal = !!ggml_cpu_has_metal(); const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); const bool test::blas = !!ggml_cpu_has_blas(); +const bool test::sycl = !!ggml_cpu_has_sycl(); const std::string test::cpu_info = get_cpu_info(); const std::string test::gpu_info = get_gpu_info(); @@ -867,15 +946,21 @@ struct markdown_printer : public printer { if (field == "n_gpu_layers") { return "ngl"; } + if (field == "split_mode") { + return "sm"; + } if (field == "n_threads") { return "threads"; } - if (field == "mul_mat_q") { - return "mmq"; - } if (field == "no_kv_offload") { return "nkvo"; } + if (field == "use_mmap") { + return "mmap"; + } + if (field == "embeddings") { + return "embd"; + } if (field == "tensor_split") { return "ts"; } @@ -884,40 +969,49 @@ struct markdown_printer : public printer { void print_header(const cmd_params & params) override { // select fields to print - fields.push_back("model"); - fields.push_back("size"); - fields.push_back("params"); - fields.push_back("backend"); + fields.emplace_back("model"); + fields.emplace_back("size"); + fields.emplace_back("params"); + fields.emplace_back("backend"); bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS"; if (!is_cpu_backend) { - fields.push_back("n_gpu_layers"); + fields.emplace_back("n_gpu_layers"); } if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { - fields.push_back("n_threads"); + fields.emplace_back("n_threads"); } if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { - fields.push_back("n_batch"); + fields.emplace_back("n_batch"); + } + if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) { + fields.emplace_back("n_ubatch"); } if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) { - fields.push_back("type_k"); + fields.emplace_back("type_k"); } if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) { - fields.push_back("type_v"); + fields.emplace_back("type_v"); } if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { - fields.push_back("main_gpu"); + fields.emplace_back("main_gpu"); } - if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) { - fields.push_back("mul_mat_q"); + if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) { + fields.emplace_back("split_mode"); } if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { - fields.push_back("no_kv_offload"); + fields.emplace_back("no_kv_offload"); } if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { - fields.push_back("tensor_split"); + fields.emplace_back("tensor_split"); } - fields.push_back("test"); - fields.push_back("t/s"); + if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) { + fields.emplace_back("use_mmap"); + } + if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) { + fields.emplace_back("embeddings"); + } + fields.emplace_back("test"); + fields.emplace_back("t/s"); fprintf(fout, "|"); for (const auto & field : fields) { @@ -1031,25 +1125,40 @@ struct sql_printer : public printer { }; static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { - std::vector tokens(n_batch, llama_token_bos(llama_get_model(ctx))); - int n_processed = 0; - llama_set_n_threads(ctx, n_threads, n_threads); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); + + std::vector tokens(n_batch); + + int n_processed = 0; + while (n_processed < n_prompt) { int n_tokens = std::min(n_prompt - n_processed, n_batch); + tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + for (int i = 1; i < n_tokens; i++) { + tokens[i] = std::rand() % n_vocab; + } llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0)); n_processed += n_tokens; } + + llama_synchronize(ctx); } static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { - llama_token token = llama_token_bos(llama_get_model(ctx)); - llama_set_n_threads(ctx, n_threads, n_threads); + const llama_model * model = llama_get_model(ctx); + const int32_t n_vocab = llama_n_vocab(model); + + llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab; + for (int i = 0; i < n_gen; i++) { llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0)); + llama_synchronize(ctx); + token = std::rand() % n_vocab; } } @@ -1081,8 +1190,7 @@ int main(int argc, char ** argv) { if (!params.verbose) { llama_log_set(llama_null_log_callback, NULL); } - bool numa = false; - llama_backend_init(numa); + llama_backend_init(); // initialize printer std::unique_ptr p; @@ -1139,7 +1247,8 @@ int main(int argc, char ** argv) { // warmup run if (t.n_prompt > 0) { - test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads); + //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads); + test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads); } if (t.n_gen > 0) { test_gen(ctx, 1, 0, t.n_threads); @@ -1155,6 +1264,7 @@ int main(int argc, char ** argv) { if (t.n_gen > 0) { test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads); } + uint64_t t_ns = get_time_ns() - t_start; t.samples_ns.push_back(t_ns); } diff --git a/examples/llama.android/.gitignore b/examples/llama.android/.gitignore new file mode 100644 index 000000000..347e252ef --- /dev/null +++ b/examples/llama.android/.gitignore @@ -0,0 +1,33 @@ +# Gradle files +.gradle/ +build/ + +# Local configuration file (sdk path, etc) +local.properties + +# Log/OS Files +*.log + +# Android Studio generated files and folders +captures/ +.externalNativeBuild/ +.cxx/ +*.apk +output.json + +# IntelliJ +*.iml +.idea/ +misc.xml +deploymentTargetDropDown.xml +render.experimental.xml + +# Keystore files +*.jks +*.keystore + +# Google Services (e.g. APIs or Firebase) +google-services.json + +# Android Profiling +*.hprof diff --git a/examples/llama.android/README.md b/examples/llama.android/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/examples/llama.android/app/.gitignore b/examples/llama.android/app/.gitignore new file mode 100644 index 000000000..796b96d1c --- /dev/null +++ b/examples/llama.android/app/.gitignore @@ -0,0 +1 @@ +/build diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts new file mode 100644 index 000000000..d42140efe --- /dev/null +++ b/examples/llama.android/app/build.gradle.kts @@ -0,0 +1,88 @@ +plugins { + id("com.android.application") + id("org.jetbrains.kotlin.android") +} + +android { + namespace = "com.example.llama" + compileSdk = 34 + + ndkVersion = "26.1.10909125" + + defaultConfig { + applicationId = "com.example.llama" + minSdk = 33 + targetSdk = 34 + versionCode = 1 + versionName = "1.0" + + testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" + vectorDrawables { + useSupportLibrary = true + } + ndk { + // Add NDK properties if wanted, e.g. + // abiFilters += listOf("arm64-v8a") + } + externalNativeBuild { + cmake { + arguments += "-DCMAKE_BUILD_TYPE=Release" + cppFlags += listOf() + arguments += listOf() + } + } + } + + buildTypes { + release { + isMinifyEnabled = false + proguardFiles( + getDefaultProguardFile("proguard-android-optimize.txt"), + "proguard-rules.pro" + ) + } + } + compileOptions { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 + } + kotlinOptions { + jvmTarget = "1.8" + } + buildFeatures { + compose = true + } + composeOptions { + kotlinCompilerExtensionVersion = "1.5.1" + } + packaging { + resources { + excludes += "/META-INF/{AL2.0,LGPL2.1}" + } + } + externalNativeBuild { + cmake { + path = file("src/main/cpp/CMakeLists.txt") + version = "3.22.1" + } + } +} + +dependencies { + + implementation("androidx.core:core-ktx:1.12.0") + implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2") + implementation("androidx.activity:activity-compose:1.8.2") + implementation(platform("androidx.compose:compose-bom:2023.08.00")) + implementation("androidx.compose.ui:ui") + implementation("androidx.compose.ui:ui-graphics") + implementation("androidx.compose.ui:ui-tooling-preview") + implementation("androidx.compose.material3:material3") + testImplementation("junit:junit:4.13.2") + androidTestImplementation("androidx.test.ext:junit:1.1.5") + androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") + androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00")) + androidTestImplementation("androidx.compose.ui:ui-test-junit4") + debugImplementation("androidx.compose.ui:ui-tooling") + debugImplementation("androidx.compose.ui:ui-test-manifest") +} diff --git a/examples/llama.android/app/proguard-rules.pro b/examples/llama.android/app/proguard-rules.pro new file mode 100644 index 000000000..f1b424510 --- /dev/null +++ b/examples/llama.android/app/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. +#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile diff --git a/examples/llama.android/app/src/main/AndroidManifest.xml b/examples/llama.android/app/src/main/AndroidManifest.xml new file mode 100644 index 000000000..41a358a29 --- /dev/null +++ b/examples/llama.android/app/src/main/AndroidManifest.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/cpp/CMakeLists.txt b/examples/llama.android/app/src/main/cpp/CMakeLists.txt new file mode 100644 index 000000000..85139329a --- /dev/null +++ b/examples/llama.android/app/src/main/cpp/CMakeLists.txt @@ -0,0 +1,50 @@ + +# For more information about using CMake with Android Studio, read the +# documentation: https://d.android.com/studio/projects/add-native-code.html. +# For more examples on how to use CMake, see https://github.com/android/ndk-samples. + +# Sets the minimum CMake version required for this project. +cmake_minimum_required(VERSION 3.22.1) + +# Declares the project name. The project name can be accessed via ${ PROJECT_NAME}, +# Since this is the top level CMakeLists.txt, the project name is also accessible +# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level +# build script scope). +project("llama-android") + +include(FetchContent) +FetchContent_Declare( + llama + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp + GIT_TAG master +) + +# Also provides "common" +FetchContent_MakeAvailable(llama) + +# Creates and names a library, sets it as either STATIC +# or SHARED, and provides the relative paths to its source code. +# You can define multiple libraries, and CMake builds them for you. +# Gradle automatically packages shared libraries with your APK. +# +# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define +# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME} +# is preferred for the same purpose. +# +# In order to load a library into your app from Java/Kotlin, you must call +# System.loadLibrary() and pass the name of the library defined here; +# for GameActivity/NativeActivity derived applications, the same library name must be +# used in the AndroidManifest.xml file. +add_library(${CMAKE_PROJECT_NAME} SHARED + # List C/C++ source files with relative paths to this CMakeLists.txt. + llama-android.cpp) + +# Specifies libraries CMake should link to your target library. You +# can link libraries from various origins, such as libraries defined in this +# build script, prebuilt third-party libraries, or Android system libraries. +target_link_libraries(${CMAKE_PROJECT_NAME} + # List libraries link to the target library + llama + common + android + log) diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp new file mode 100644 index 000000000..ce8ab3b70 --- /dev/null +++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp @@ -0,0 +1,443 @@ +#include +#include +#include +#include +#include +#include +#include "llama.h" +#include "common/common.h" + +// Write C++ code here. +// +// Do not forget to dynamically load the C++ library into your application. +// +// For instance, +// +// In MainActivity.java: +// static { +// System.loadLibrary("llama-android"); +// } +// +// Or, in MainActivity.kt: +// companion object { +// init { +// System.loadLibrary("llama-android") +// } +// } + +#define TAG "llama-android.cpp" +#define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__) +#define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__) + +jclass la_int_var; +jmethodID la_int_var_value; +jmethodID la_int_var_inc; + +std::string cached_token_chars; + +bool is_valid_utf8(const char * string) { + if (!string) { + return true; + } + + const unsigned char * bytes = (const unsigned char *)string; + int num; + + while (*bytes != 0x00) { + if ((*bytes & 0x80) == 0x00) { + // U+0000 to U+007F + num = 1; + } else if ((*bytes & 0xE0) == 0xC0) { + // U+0080 to U+07FF + num = 2; + } else if ((*bytes & 0xF0) == 0xE0) { + // U+0800 to U+FFFF + num = 3; + } else if ((*bytes & 0xF8) == 0xF0) { + // U+10000 to U+10FFFF + num = 4; + } else { + return false; + } + + bytes += 1; + for (int i = 1; i < num; ++i) { + if ((*bytes & 0xC0) != 0x80) { + return false; + } + bytes += 1; + } + } + + return true; +} + +static void log_callback(ggml_log_level level, const char * fmt, void * data) { + if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data); + else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data); + else if (level == GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data); + else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) { + llama_model_params model_params = llama_model_default_params(); + + auto path_to_model = env->GetStringUTFChars(filename, 0); + LOGi("Loading model from %s", path_to_model); + + auto model = llama_load_model_from_file(path_to_model, model_params); + env->ReleaseStringUTFChars(filename, path_to_model); + + if (!model) { + LOGe("load_model() failed"); + env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed"); + return 0; + } + + return reinterpret_cast(model); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) { + llama_free_model(reinterpret_cast(model)); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) { + auto model = reinterpret_cast(jmodel); + + if (!model) { + LOGe("new_context(): model cannot be null"); + env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "Model cannot be null"); + return 0; + } + + int n_threads = std::max(1, std::min(8, (int) sysconf(_SC_NPROCESSORS_ONLN) - 2)); + LOGi("Using %d threads", n_threads); + + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.seed = 1234; + ctx_params.n_ctx = 2048; + ctx_params.n_threads = n_threads; + ctx_params.n_threads_batch = n_threads; + + llama_context * context = llama_new_context_with_model(model, ctx_params); + + if (!context) { + LOGe("llama_new_context_with_model() returned null)"); + env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), + "llama_new_context_with_model() returned null)"); + return 0; + } + + return reinterpret_cast(context); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) { + llama_free(reinterpret_cast(context)); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) { + llama_backend_free(); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) { + llama_log_set(log_callback, NULL); +} + +extern "C" +JNIEXPORT jstring JNICALL +Java_com_example_llama_Llm_bench_1model( + JNIEnv *env, + jobject, + jlong context_pointer, + jlong model_pointer, + jlong batch_pointer, + jint pp, + jint tg, + jint pl, + jint nr + ) { + auto pp_avg = 0.0; + auto tg_avg = 0.0; + auto pp_std = 0.0; + auto tg_std = 0.0; + + const auto context = reinterpret_cast(context_pointer); + const auto model = reinterpret_cast(model_pointer); + const auto batch = reinterpret_cast(batch_pointer); + + const int n_ctx = llama_n_ctx(context); + + LOGi("n_ctx = %d", n_ctx); + + int i, j; + int nri; + for (nri = 0; nri < nr; nri++) { + LOGi("Benchmark prompt processing (pp)"); + + llama_batch_clear(*batch); + + const int n_tokens = pp; + for (i = 0; i < n_tokens; i++) { + llama_batch_add(*batch, 0, i, { 0 }, false); + } + + batch->logits[batch->n_tokens - 1] = true; + llama_kv_cache_clear(context); + + const auto t_pp_start = ggml_time_us(); + if (llama_decode(context, *batch) != 0) { + LOGi("llama_decode() failed during prompt processing"); + } + const auto t_pp_end = ggml_time_us(); + + // bench text generation + + LOGi("Benchmark text generation (tg)"); + + llama_kv_cache_clear(context); + const auto t_tg_start = ggml_time_us(); + for (i = 0; i < tg; i++) { + + llama_batch_clear(*batch); + for (j = 0; j < pl; j++) { + llama_batch_add(*batch, 0, i, { j }, true); + } + + LOGi("llama_decode() text generation: %d", i); + if (llama_decode(context, *batch) != 0) { + LOGi("llama_decode() failed during text generation"); + } + } + + const auto t_tg_end = ggml_time_us(); + + llama_kv_cache_clear(context); + + const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0; + const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0; + + const auto speed_pp = double(pp) / t_pp; + const auto speed_tg = double(pl * tg) / t_tg; + + pp_avg += speed_pp; + tg_avg += speed_tg; + + pp_std += speed_pp * speed_pp; + tg_std += speed_tg * speed_tg; + + LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg); + } + + pp_avg /= double(nr); + tg_avg /= double(nr); + + if (nr > 1) { + pp_std = sqrt(pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1)); + tg_std = sqrt(tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1)); + } else { + pp_std = 0; + tg_std = 0; + } + + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + + const auto model_size = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0; + const auto model_n_params = double(llama_model_n_params(model)) / 1e9; + + const auto backend = "(Android)"; // TODO: What should this be? + + std::stringstream result; + result << std::setprecision(2); + result << "| model | size | params | backend | test | t/s |\n"; + result << "| --- | --- | --- | --- | --- | --- |\n"; + result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n"; + result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n"; + + return env->NewStringUTF(result.str().c_str()); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { + llama_batch_free(*reinterpret_cast(batch_pointer)); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) { + + // Source: Copy of llama.cpp:llama_batch_init but heap-allocated. + + llama_batch *batch = new llama_batch { + 0, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + 0, + 0, + 0, + }; + + if (embd) { + batch->embd = (float *) malloc(sizeof(float) * n_tokens * embd); + } else { + batch->token = (llama_token *) malloc(sizeof(llama_token) * n_tokens); + } + + batch->pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens); + batch->n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens); + batch->seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens); + for (int i = 0; i < n_tokens; ++i) { + batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max); + } + batch->logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens); + + return reinterpret_cast(batch); +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) { + llama_backend_init(); +} + +extern "C" +JNIEXPORT jstring JNICALL +Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) { + return env->NewStringUTF(llama_print_system_info()); +} + +extern "C" +JNIEXPORT jint JNICALL +Java_com_example_llama_Llm_completion_1init( + JNIEnv *env, + jobject, + jlong context_pointer, + jlong batch_pointer, + jstring jtext, + jint n_len + ) { + + cached_token_chars.clear(); + + const auto text = env->GetStringUTFChars(jtext, 0); + const auto context = reinterpret_cast(context_pointer); + const auto batch = reinterpret_cast(batch_pointer); + + const auto tokens_list = llama_tokenize(context, text, 1); + + auto n_ctx = llama_n_ctx(context); + auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); + + LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req); + + if (n_kv_req > n_ctx) { + LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough"); + } + + for (auto id : tokens_list) { + LOGi("%s", llama_token_to_piece(context, id).c_str()); + } + + llama_batch_clear(*batch); + + // evaluate the initial prompt + for (auto i = 0; i < tokens_list.size(); i++) { + llama_batch_add(*batch, tokens_list[i], i, { 0 }, false); + } + + // llama_decode will output logits only for the last token of the prompt + batch->logits[batch->n_tokens - 1] = true; + + if (llama_decode(context, *batch) != 0) { + LOGe("llama_decode() failed"); + } + + env->ReleaseStringUTFChars(jtext, text); + + return batch->n_tokens; +} + +extern "C" +JNIEXPORT jstring JNICALL +Java_com_example_llama_Llm_completion_1loop( + JNIEnv * env, + jobject, + jlong context_pointer, + jlong batch_pointer, + jint n_len, + jobject intvar_ncur +) { + const auto context = reinterpret_cast(context_pointer); + const auto batch = reinterpret_cast(batch_pointer); + const auto model = llama_get_model(context); + + if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur); + if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I"); + if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V"); + + auto n_vocab = llama_n_vocab(model); + auto logits = llama_get_logits_ith(context, batch->n_tokens - 1); + + std::vector candidates; + candidates.reserve(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); + } + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + // sample the most likely token + const auto new_token_id = llama_sample_token_greedy(context, &candidates_p); + + const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); + if (new_token_id == llama_token_eos(model) || n_cur == n_len) { + return env->NewStringUTF(""); + } + + auto new_token_chars = llama_token_to_piece(context, new_token_id); + cached_token_chars += new_token_chars; + + jstring new_token = nullptr; + if (is_valid_utf8(cached_token_chars.c_str())) { + new_token = env->NewStringUTF(cached_token_chars.c_str()); + LOGi("cached: %s, new_token_chars: `%s`, id: %d", cached_token_chars.c_str(), new_token_chars.c_str(), new_token_id); + cached_token_chars.clear(); + } else { + new_token = env->NewStringUTF(""); + } + + llama_batch_clear(*batch); + llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true); + + env->CallVoidMethod(intvar_ncur, la_int_var_inc); + + if (llama_decode(context, *batch) != 0) { + LOGe("llama_decode() returned null"); + } + + return new_token; +} + +extern "C" +JNIEXPORT void JNICALL +Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) { + llama_kv_cache_clear(reinterpret_cast(context)); +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt new file mode 100644 index 000000000..78c231ae5 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt @@ -0,0 +1,119 @@ +package com.example.llama + +import android.app.DownloadManager +import android.net.Uri +import android.util.Log +import androidx.compose.material3.Button +import androidx.compose.material3.Text +import androidx.compose.runtime.Composable +import androidx.compose.runtime.getValue +import androidx.compose.runtime.mutableDoubleStateOf +import androidx.compose.runtime.mutableStateOf +import androidx.compose.runtime.remember +import androidx.compose.runtime.rememberCoroutineScope +import androidx.compose.runtime.setValue +import androidx.core.database.getLongOrNull +import androidx.core.net.toUri +import kotlinx.coroutines.delay +import kotlinx.coroutines.launch +import java.io.File + +data class Downloadable(val name: String, val source: Uri, val destination: File) { + companion object { + @JvmStatic + private val tag: String? = this::class.qualifiedName + + sealed interface State + data object Ready: State + data class Downloading(val id: Long): State + data class Downloaded(val downloadable: Downloadable): State + data class Error(val message: String): State + + @JvmStatic + @Composable + fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) { + var status: State by remember { + mutableStateOf( + if (item.destination.exists()) Downloaded(item) + else Ready + ) + } + var progress by remember { mutableDoubleStateOf(0.0) } + + val coroutineScope = rememberCoroutineScope() + + suspend fun waitForDownload(result: Downloading, item: Downloadable): State { + while (true) { + val cursor = dm.query(DownloadManager.Query().setFilterById(result.id)) + + if (cursor == null) { + Log.e(tag, "dm.query() returned null") + return Error("dm.query() returned null") + } + + if (!cursor.moveToFirst() || cursor.count < 1) { + cursor.close() + Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?") + return Ready + } + + val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR) + val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES) + val sofar = cursor.getLongOrNull(pix) ?: 0 + val total = cursor.getLongOrNull(tix) ?: 1 + cursor.close() + + if (sofar == total) { + return Downloaded(item) + } + + progress = (sofar * 1.0) / total + + delay(1000L) + } + } + + fun onClick() { + when (val s = status) { + is Downloaded -> { + viewModel.load(item.destination.path) + } + + is Downloading -> { + coroutineScope.launch { + status = waitForDownload(s, item) + } + } + + else -> { + item.destination.delete() + + val request = DownloadManager.Request(item.source).apply { + setTitle("Downloading model") + setDescription("Downloading model: ${item.name}") + setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI) + setDestinationUri(item.destination.toUri()) + } + + viewModel.log("Saving ${item.name} to ${item.destination.path}") + Log.i(tag, "Saving ${item.name} to ${item.destination.path}") + + val id = dm.enqueue(request) + status = Downloading(id) + onClick() + } + } + } + + Button(onClick = { onClick() }, enabled = status !is Downloading) { + when (status) { + is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%") + is Downloaded -> Text("Load ${item.name}") + is Ready -> Text("Download ${item.name}") + is Error -> Text("Download ${item.name}") + } + } + } + + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt new file mode 100644 index 000000000..d86afee37 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt @@ -0,0 +1,172 @@ +package com.example.llama + +import android.util.Log +import kotlinx.coroutines.CoroutineDispatcher +import kotlinx.coroutines.asCoroutineDispatcher +import kotlinx.coroutines.flow.Flow +import kotlinx.coroutines.flow.flow +import kotlinx.coroutines.flow.flowOn +import kotlinx.coroutines.withContext +import java.util.concurrent.Executors +import kotlin.concurrent.thread + +class Llm { + private val tag: String? = this::class.simpleName + + private val threadLocalState: ThreadLocal = ThreadLocal.withInitial { State.Idle } + + private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor { + thread(start = false, name = "Llm-RunLoop") { + Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}") + + // No-op if called more than once. + System.loadLibrary("llama-android") + + // Set llama log handler to Android + log_to_android() + backend_init(false) + + Log.d(tag, system_info()) + + it.run() + }.apply { + uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable -> + Log.e(tag, "Unhandled exception", exception) + } + } + }.asCoroutineDispatcher() + + private val nlen: Int = 64 + + private external fun log_to_android() + private external fun load_model(filename: String): Long + private external fun free_model(model: Long) + private external fun new_context(model: Long): Long + private external fun free_context(context: Long) + private external fun backend_init(numa: Boolean) + private external fun backend_free() + private external fun free_batch(batch: Long) + private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long + private external fun bench_model( + context: Long, + model: Long, + batch: Long, + pp: Int, + tg: Int, + pl: Int, + nr: Int + ): String + + private external fun system_info(): String + + private external fun completion_init( + context: Long, + batch: Long, + text: String, + nLen: Int + ): Int + + private external fun completion_loop( + context: Long, + batch: Long, + nLen: Int, + ncur: IntVar + ): String? + + private external fun kv_cache_clear(context: Long) + + suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String { + return withContext(runLoop) { + when (val state = threadLocalState.get()) { + is State.Loaded -> { + Log.d(tag, "bench(): $state") + bench_model(state.context, state.model, state.batch, pp, tg, pl, nr) + } + + else -> throw IllegalStateException("No model loaded") + } + } + } + + suspend fun load(pathToModel: String) { + withContext(runLoop) { + when (threadLocalState.get()) { + is State.Idle -> { + val model = load_model(pathToModel) + if (model == 0L) throw IllegalStateException("load_model() failed") + + val context = new_context(model) + if (context == 0L) throw IllegalStateException("new_context() failed") + + val batch = new_batch(512, 0, 1) + if (batch == 0L) throw IllegalStateException("new_batch() failed") + + Log.i(tag, "Loaded model $pathToModel") + threadLocalState.set(State.Loaded(model, context, batch)) + } + else -> throw IllegalStateException("Model already loaded") + } + } + } + + fun send(message: String): Flow = flow { + when (val state = threadLocalState.get()) { + is State.Loaded -> { + val ncur = IntVar(completion_init(state.context, state.batch, message, nlen)) + while (ncur.value <= nlen) { + val str = completion_loop(state.context, state.batch, nlen, ncur) + if (str == null) { + break + } + emit(str) + } + kv_cache_clear(state.context) + } + else -> {} + } + }.flowOn(runLoop) + + /** + * Unloads the model and frees resources. + * + * This is a no-op if there's no model loaded. + */ + suspend fun unload() { + withContext(runLoop) { + when (val state = threadLocalState.get()) { + is State.Loaded -> { + free_context(state.context) + free_model(state.model) + free_batch(state.batch) + + threadLocalState.set(State.Idle) + } + else -> {} + } + } + } + + companion object { + private class IntVar(value: Int) { + @Volatile + var value: Int = value + private set + + fun inc() { + synchronized(this) { + value += 1 + } + } + } + + private sealed interface State { + data object Idle: State + data class Loaded(val model: Long, val context: Long, val batch: Long): State + } + + // Enforce only one instance of Llm. + private val _instance: Llm = Llm() + + fun instance(): Llm = _instance + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt new file mode 100644 index 000000000..9da04f7d3 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt @@ -0,0 +1,154 @@ +package com.example.llama + +import android.app.ActivityManager +import android.app.DownloadManager +import android.content.ClipData +import android.content.ClipboardManager +import android.net.Uri +import android.os.Bundle +import android.os.StrictMode +import android.os.StrictMode.VmPolicy +import android.text.format.Formatter +import androidx.activity.ComponentActivity +import androidx.activity.compose.setContent +import androidx.activity.viewModels +import androidx.compose.foundation.layout.Box +import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.Row +import androidx.compose.foundation.layout.fillMaxSize +import androidx.compose.foundation.layout.padding +import androidx.compose.foundation.lazy.LazyColumn +import androidx.compose.foundation.lazy.items +import androidx.compose.foundation.lazy.rememberLazyListState +import androidx.compose.material3.Button +import androidx.compose.material3.LocalContentColor +import androidx.compose.material3.MaterialTheme +import androidx.compose.material3.OutlinedTextField +import androidx.compose.material3.Surface +import androidx.compose.material3.Text +import androidx.compose.runtime.Composable +import androidx.compose.ui.Modifier +import androidx.compose.ui.unit.dp +import androidx.core.content.getSystemService +import com.example.llama.ui.theme.LlamaAndroidTheme +import java.io.File + +class MainActivity( + activityManager: ActivityManager? = null, + downloadManager: DownloadManager? = null, + clipboardManager: ClipboardManager? = null, +): ComponentActivity() { + private val tag: String? = this::class.simpleName + + private val activityManager by lazy { activityManager ?: getSystemService()!! } + private val downloadManager by lazy { downloadManager ?: getSystemService()!! } + private val clipboardManager by lazy { clipboardManager ?: getSystemService()!! } + + private val viewModel: MainViewModel by viewModels() + + // Get a MemoryInfo object for the device's current memory status. + private fun availableMemory(): ActivityManager.MemoryInfo { + return ActivityManager.MemoryInfo().also { memoryInfo -> + activityManager.getMemoryInfo(memoryInfo) + } + } + + override fun onCreate(savedInstanceState: Bundle?) { + super.onCreate(savedInstanceState) + + StrictMode.setVmPolicy( + VmPolicy.Builder(StrictMode.getVmPolicy()) + .detectLeakedClosableObjects() + .build() + ) + + val free = Formatter.formatFileSize(this, availableMemory().availMem) + val total = Formatter.formatFileSize(this, availableMemory().totalMem) + + viewModel.log("Current memory: $free / $total") + viewModel.log("Downloads directory: ${getExternalFilesDir(null)}") + + val extFilesDir = getExternalFilesDir(null) + + val models = listOf( + Downloadable( + "Phi-2 7B (Q4_0, 1.6 GiB)", + Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"), + File(extFilesDir, "phi-2-q4_0.gguf"), + ), + Downloadable( + "TinyLlama 1.1B (f16, 2.2 GiB)", + Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"), + File(extFilesDir, "tinyllama-1.1-f16.gguf"), + ), + Downloadable( + "Phi 2 DPO (Q3_K_M, 1.48 GiB)", + Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"), + File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf") + ), + ) + + setContent { + LlamaAndroidTheme { + // A surface container using the 'background' color from the theme + Surface( + modifier = Modifier.fillMaxSize(), + color = MaterialTheme.colorScheme.background + ) { + MainCompose( + viewModel, + clipboardManager, + downloadManager, + models, + ) + } + + } + } + } +} + +@Composable +fun MainCompose( + viewModel: MainViewModel, + clipboard: ClipboardManager, + dm: DownloadManager, + models: List +) { + Column { + val scrollState = rememberLazyListState() + + Box(modifier = Modifier.weight(1f)) { + LazyColumn(state = scrollState) { + items(viewModel.messages) { + Text( + it, + style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current), + modifier = Modifier.padding(16.dp) + ) + } + } + } + OutlinedTextField( + value = viewModel.message, + onValueChange = { viewModel.updateMessage(it) }, + label = { Text("Message") }, + ) + Row { + Button({ viewModel.send() }) { Text("Send") } + Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") } + Button({ viewModel.clear() }) { Text("Clear") } + Button({ + viewModel.messages.joinToString("\n").let { + clipboard.setPrimaryClip(ClipData.newPlainText("", it)) + } + }) { Text("Copy") } + } + + Column { + for (model in models) { + Downloadable.Button(viewModel, dm, model) + } + } + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt new file mode 100644 index 000000000..be95e2221 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt @@ -0,0 +1,104 @@ +package com.example.llama + +import android.util.Log +import androidx.compose.runtime.getValue +import androidx.compose.runtime.mutableStateOf +import androidx.compose.runtime.setValue +import androidx.lifecycle.ViewModel +import androidx.lifecycle.viewModelScope +import kotlinx.coroutines.flow.catch +import kotlinx.coroutines.launch + +class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() { + companion object { + @JvmStatic + private val NanosPerSecond = 1_000_000_000.0 + } + + private val tag: String? = this::class.simpleName + + var messages by mutableStateOf(listOf("Initializing...")) + private set + + var message by mutableStateOf("") + private set + + override fun onCleared() { + super.onCleared() + + viewModelScope.launch { + try { + llm.unload() + } catch (exc: IllegalStateException) { + messages += exc.message!! + } + } + } + + fun send() { + val text = message + message = "" + + // Add to messages console. + messages += text + messages += "" + + viewModelScope.launch { + llm.send(text) + .catch { + Log.e(tag, "send() failed", it) + messages += it.message!! + } + .collect { messages = messages.dropLast(1) + (messages.last() + it) } + } + } + + fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) { + viewModelScope.launch { + try { + val start = System.nanoTime() + val warmupResult = llm.bench(pp, tg, pl, nr) + val end = System.nanoTime() + + messages += warmupResult + + val warmup = (end - start).toDouble() / NanosPerSecond + messages += "Warm up time: $warmup seconds, please wait..." + + if (warmup > 5.0) { + messages += "Warm up took too long, aborting benchmark" + return@launch + } + + messages += llm.bench(512, 128, 1, 3) + } catch (exc: IllegalStateException) { + Log.e(tag, "bench() failed", exc) + messages += exc.message!! + } + } + } + + fun load(pathToModel: String) { + viewModelScope.launch { + try { + llm.load(pathToModel) + messages += "Loaded $pathToModel" + } catch (exc: IllegalStateException) { + Log.e(tag, "load() failed", exc) + messages += exc.message!! + } + } + } + + fun updateMessage(newMessage: String) { + message = newMessage + } + + fun clear() { + messages = listOf() + } + + fun log(message: String) { + messages += message + } +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt new file mode 100644 index 000000000..40c30e8d9 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt @@ -0,0 +1,11 @@ +package com.example.llama.ui.theme + +import androidx.compose.ui.graphics.Color + +val Purple80 = Color(0xFFD0BCFF) +val PurpleGrey80 = Color(0xFFCCC2DC) +val Pink80 = Color(0xFFEFB8C8) + +val Purple40 = Color(0xFF6650a4) +val PurpleGrey40 = Color(0xFF625b71) +val Pink40 = Color(0xFF7D5260) diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt new file mode 100644 index 000000000..e742220a8 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt @@ -0,0 +1,70 @@ +package com.example.llama.ui.theme + +import android.app.Activity +import android.os.Build +import androidx.compose.foundation.isSystemInDarkTheme +import androidx.compose.material3.MaterialTheme +import androidx.compose.material3.darkColorScheme +import androidx.compose.material3.dynamicDarkColorScheme +import androidx.compose.material3.dynamicLightColorScheme +import androidx.compose.material3.lightColorScheme +import androidx.compose.runtime.Composable +import androidx.compose.runtime.SideEffect +import androidx.compose.ui.graphics.toArgb +import androidx.compose.ui.platform.LocalContext +import androidx.compose.ui.platform.LocalView +import androidx.core.view.WindowCompat + +private val DarkColorScheme = darkColorScheme( + primary = Purple80, + secondary = PurpleGrey80, + tertiary = Pink80 +) + +private val LightColorScheme = lightColorScheme( + primary = Purple40, + secondary = PurpleGrey40, + tertiary = Pink40 + + /* Other default colors to override + background = Color(0xFFFFFBFE), + surface = Color(0xFFFFFBFE), + onPrimary = Color.White, + onSecondary = Color.White, + onTertiary = Color.White, + onBackground = Color(0xFF1C1B1F), + onSurface = Color(0xFF1C1B1F), + */ +) + +@Composable +fun LlamaAndroidTheme( + darkTheme: Boolean = isSystemInDarkTheme(), + // Dynamic color is available on Android 12+ + dynamicColor: Boolean = true, + content: @Composable () -> Unit +) { + val colorScheme = when { + dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> { + val context = LocalContext.current + if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context) + } + + darkTheme -> DarkColorScheme + else -> LightColorScheme + } + val view = LocalView.current + if (!view.isInEditMode) { + SideEffect { + val window = (view.context as Activity).window + window.statusBarColor = colorScheme.primary.toArgb() + WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme + } + } + + MaterialTheme( + colorScheme = colorScheme, + typography = Typography, + content = content + ) +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt new file mode 100644 index 000000000..0b87946ca --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt @@ -0,0 +1,34 @@ +package com.example.llama.ui.theme + +import androidx.compose.material3.Typography +import androidx.compose.ui.text.TextStyle +import androidx.compose.ui.text.font.FontFamily +import androidx.compose.ui.text.font.FontWeight +import androidx.compose.ui.unit.sp + +// Set of Material typography styles to start with +val Typography = Typography( + bodyLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 16.sp, + lineHeight = 24.sp, + letterSpacing = 0.5.sp + ) + /* Other default text styles to override + titleLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 22.sp, + lineHeight = 28.sp, + letterSpacing = 0.sp + ), + labelSmall = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Medium, + fontSize = 11.sp, + lineHeight = 16.sp, + letterSpacing = 0.5.sp + ) + */ +) diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml new file mode 100644 index 000000000..07d5da9cb --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml new file mode 100644 index 000000000..7706ab9e6 --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml new file mode 100644 index 000000000..b3e26b4c6 --- /dev/null +++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml new file mode 100644 index 000000000..b3e26b4c6 --- /dev/null +++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp new file mode 100644 index 000000000..c209e78ec Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp new file mode 100644 index 000000000..b2dfe3d1b Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp new file mode 100644 index 000000000..4f0f1d64e Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp new file mode 100644 index 000000000..62b611da0 Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp new file mode 100644 index 000000000..948a3070f Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp new file mode 100644 index 000000000..1b9a6956b Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp new file mode 100644 index 000000000..28d4b77f9 Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp new file mode 100644 index 000000000..9287f5083 Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp new file mode 100644 index 000000000..aa7d6427e Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp differ diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp new file mode 100644 index 000000000..9126ae37c Binary files /dev/null and b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp differ diff --git a/examples/llama.android/app/src/main/res/values/colors.xml b/examples/llama.android/app/src/main/res/values/colors.xml new file mode 100644 index 000000000..ca1931bca --- /dev/null +++ b/examples/llama.android/app/src/main/res/values/colors.xml @@ -0,0 +1,10 @@ + + + #FFBB86FC + #FF6200EE + #FF3700B3 + #FF03DAC5 + #FF018786 + #FF000000 + #FFFFFFFF + diff --git a/examples/llama.android/app/src/main/res/values/strings.xml b/examples/llama.android/app/src/main/res/values/strings.xml new file mode 100644 index 000000000..7a9d314e2 --- /dev/null +++ b/examples/llama.android/app/src/main/res/values/strings.xml @@ -0,0 +1,3 @@ + + LlamaAndroid + diff --git a/examples/llama.android/app/src/main/res/values/themes.xml b/examples/llama.android/app/src/main/res/values/themes.xml new file mode 100644 index 000000000..8a24fda56 --- /dev/null +++ b/examples/llama.android/app/src/main/res/values/themes.xml @@ -0,0 +1,5 @@ + + + +