Merge branch 'master' into master

2024-03-09 02:53:27 -05:00 · 2024-03-09 02:53:27 -05:00 · a179e55256
commit a179e55256
parent a192910cb6 c2101a2e90
335 changed files with 144195 additions and 19276 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@ -14,7 +14,8 @@ ARG CUDA_DOCKER_ARCH=all
 RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git
-COPY requirements.txt requirements.txt
+COPY requirements.txt   requirements.txt
 COPY requirements       requirements
 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@ -23,7 +23,8 @@ ARG ROCM_DOCKER_ARCH=\
    gfx1101 \
    gfx1102
-COPY requirements.txt requirements.txt
+COPY requirements.txt   requirements.txt
 COPY requirements       requirements
 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@ -5,7 +5,8 @@ FROM ubuntu:$UBUNTU_VERSION as build
 RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git
-COPY requirements.txt requirements.txt
+COPY requirements.txt   requirements.txt
 COPY requirements       requirements
 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@ -0,0 +1,28 @@
 # oneAPI base-kit image tag; declared before the first FROM so both stages can use it.
 ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
 # Build stage: configure and compile the `main` target with SYCL enabled.
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
 # Set to ON (e.g. --build-arg LLAMA_SYCL_F16=ON) to add -DLLAMA_SYCL_F16=ON to the CMake configure step.
 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
    apt-get install -y git
 WORKDIR /app
 COPY . .
 # OPT_SYCL_F16 is only exported when LLAMA_SYCL_F16 is exactly "ON"; otherwise it expands to nothing.
 RUN mkdir build && \
    cd build && \
    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
        echo "LLAMA_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
    fi && \
    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
    cmake --build . --config Release --target main
 # Runtime stage: same base image, but only the compiled binary is copied in.
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
 COPY --from=build /app/build/bin/main /main
 ENV LC_ALL=C.utf8
 ENTRYPOINT [ "/main" ]
--- a/.devops/main-rocm.Dockerfile
+++ b/.devops/main-rocm.Dockerfile
@ -23,7 +23,8 @@ ARG ROCM_DOCKER_ARCH=\
    gfx1101 \
    gfx1102
-COPY requirements.txt requirements.txt
+COPY requirements.txt   requirements.txt
 COPY requirements       requirements
 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
--- a/.devops/main-vulkan.Dockerfile
+++ b/.devops/main-vulkan.Dockerfile
@ -0,0 +1,29 @@
 ARG UBUNTU_VERSION=jammy
 FROM ubuntu:$UBUNTU_VERSION as build
 # Install build tools.
 # apt-get is used throughout: `apt` has no stable CLI and warns when scripted.
 RUN apt-get update && apt-get install -y git build-essential cmake wget
 # Install Vulkan SDK from the LunarG apt repository.
 # The signing key is written to trusted.gpg.d rather than piped into the
 # deprecated `apt-key add`, so a failed download stops the `&&` chain directly.
 # NOTE: the repository list is jammy-specific; overriding UBUNTU_VERSION with
 # another release will not select a matching LunarG repository.
 RUN wget -qO /etc/apt/trusted.gpg.d/lunarg.asc https://packages.lunarg.com/lunarg-signing-key-pub.asc && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt-get update -y && \
    apt-get install -y vulkan-sdk
 # Build it
 WORKDIR /app
 COPY . .
 RUN mkdir build && \
    cd build && \
    cmake .. -DLLAMA_VULKAN=1 && \
    cmake --build . --config Release --target main
 # Clean up: keep only the binary at /main and remove the source/build tree
 WORKDIR /
 RUN cp /app/build/bin/main /main && \
    rm -rf /app
 ENV LC_ALL=C.utf8
 ENTRYPOINT [ "/main" ]
--- a/.devops/nix/apps.nix
+++ b/.devops/nix/apps.nix
@ -0,0 +1,22 @@
 # Exposes a fixed list of binaries from the default package as `apps` entries.
 {
  perSystem =
    { config, lib, ... }:
    {
      apps =
        let
          # The package whose bin/ directory the apps point into.
          inherit (config.packages) default;
          # Executable names expected under ${default}/bin.
          binaries = [
            "llama"
            "llama-embedding"
            "llama-server"
            "quantize"
            "train-text-from-scratch"
          ];
          # Build one app definition for the binary called `name`.
          mkApp = name: {
            type = "app";
            program = "${default}/bin/${name}";
          };
        in
        # Yields { llama = mkApp "llama"; llama-embedding = ...; ... }
        lib.genAttrs binaries mkApp;
    };
 }
--- a/.devops/nix/devshells.nix
+++ b/.devops/nix/devshells.nix
@ -0,0 +1,13 @@
 # Derives two dev shells from every entry in `config.packages`.
 {
  perSystem =
    { config, lib, ... }:
    {
      devShells =
        # For each package `name`, emit `<name>` (the package's passthru.shell)
        # and `<name>-extra` (its passthru.shell-extra), merged into one attrset.
        lib.concatMapAttrs
          (name: package: {
            ${name} = package.passthru.shell;
            ${name + "-extra"} = package.passthru.shell-extra;
          })
          config.packages;
    };
 }
--- a/.devops/nix/docker.nix
+++ b/.devops/nix/docker.nix
@ -0,0 +1,37 @@
 {
  lib,
  dockerTools,
  buildEnv,
  llama-cpp,
  # When true, the image additionally contains coreutils, a /bin/sh and CA
  # certificates (see `contents` below); when false it holds only llama-cpp.
  interactive ? true,
  coreutils,
 }:
 # A tar that can be fed into `docker load`:
 #
 # $ nix build .#llamaPackages.docker
 # $ docker load < result
 # For details and variations cf.
 # - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
 # - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
 # - https://nixery.dev/
 # Approximate (compressed) sizes, at the time of writing, are:
 #
 # .#llamaPackages.docker: 125M;
 # .#llamaPackagesCuda.docker: 537M;
 # .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
 dockerTools.buildLayeredImage {
  # The image is named after the package variant (pname includes the backend suffix).
  name = llama-cpp.pname;
  tag = "latest";
  contents =
    [ llama-cpp ]
    ++ lib.optionals interactive [
      coreutils
      dockerTools.binSh
      dockerTools.caCertificates
    ];
 }
--- a/.devops/nix/jetson-support.nix
+++ b/.devops/nix/jetson-support.nix
@ -0,0 +1,39 @@
 # Per-device package sets for NVIDIA Jetson boards, each built from a nixpkgs
 # instance pinned to a single CUDA capability.
 { inputs, ... }:
 {
  perSystem =
    {
      config,
      system,
      lib,
      pkgsCuda,
      ...
    }:
    {
      legacyPackages =
        let
          # Package-set name -> the CUDA capability passed as `cudaCapabilities`.
          caps.llamaPackagesXavier = "7.2";
          caps.llamaPackagesOrin = "8.7";
          caps.llamaPackagesTX2 = "6.2";
          caps.llamaPackagesNano = "5.3";
          # Import nixpkgs with CUDA enabled for exactly one capability `cap`,
          # reusing the unfree-license predicate from `pkgsCuda`.
          pkgsFor =
            cap:
            import inputs.nixpkgs {
              inherit system;
              config = {
                cudaSupport = true;
                cudaCapabilities = [ cap ];
                cudaEnableForwardCompat = false;
                inherit (pkgsCuda.config) allowUnfreePredicate;
              };
            };
        in
        # One scope (see ./scope.nix) per entry in `caps`.
        builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
      # Only exposed on aarch64-linux.
      # NOTE(review): llamaPackagesTX2 is defined above but has no entry here — confirm intentional.
      packages = lib.optionalAttrs (system == "aarch64-linux") {
        jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
        jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
        jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
      };
    };
 }
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@ -0,0 +1,47 @@
 { inputs, ... }:
 {
  # The _module.args definitions are passed on to modules as arguments. E.g.
  # the module `{ pkgs, ... }: { /* config */ }` implicitly uses
  # `_module.args.pkgs` (defined in this case by flake-parts).
  perSystem =
    { system, ... }:
    {
      _module.args = {
        # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
        # again, the below creates several nixpkgs instances which the
        # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
        #
        # This is currently "slow" and "expensive", on a certain scale.
        # This also isn't "right" in that this hinders dependency injection at
        # the level of flake inputs. This might get removed in the foreseeable
        # future.
        #
        # Note that you can use these expressions without Nix flakes
        # (`pkgs.callPackage ./.devops/nix/scope.nix { }` is the entry point).
        pkgsCuda = import inputs.nixpkgs {
          inherit system;
          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
          # and ucx are built with CUDA support)
          config.cudaSupport = true;
          # Allow a package only if every one of its licenses is either free
          # or one of the two NVIDIA EULAs listed below.
          config.allowUnfreePredicate =
            p:
            builtins.all
              (
                license:
                license.free
                || builtins.elem license.shortName [
                  "CUDA EULA"
                  "cuDNN EULA"
                ]
              )
              # `meta.licenses` when present, otherwise the single `meta.license`.
              (p.meta.licenses or [ p.meta.license ]);
        };
        # Ensure dependencies use ROCm consistently
        pkgsRocm = import inputs.nixpkgs {
          inherit system;
          config.rocmSupport = true;
        };
      };
    };
 }
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@ -0,0 +1,297 @@
 {
  lib,
  glibc,
  config,
  stdenv,
  mkShell,
  cmake,
  ninja,
  pkg-config,
  git,
  python3,
  mpi,
  openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
  cudaPackages,
  darwin,
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
  clblast,
  useBlas ? builtins.all (x: !x) [
    useCuda
    useMetalKit
    useOpenCL
    useRocm
    useVulkan
  ],
  useCuda ? config.cudaSupport,
  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
  useMpi ? false, # Increases the runtime closure size by ~700M
  useOpenCL ? false,
  useRocm ? config.rocmSupport,
  useVulkan ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
  # It's necessary to consistently use backendStdenv when building with CUDA support,
  # otherwise we get libstdc++ errors downstream.
  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
  enableStatic ? effectiveStdenv.hostPlatform.isStatic
 }@inputs:
 let
  inherit (lib)
    cmakeBool
    cmakeFeature
    optionals
    strings
    versionOlder
    ;
  stdenv = throw "Use effectiveStdenv instead";
  # Display names of every enabled backend/feature, always in this order.
  # Feeds both the package-name suffix and the description suffix below.
  suffixes = builtins.concatLists [
    (optionals useBlas [ "BLAS" ])
    (optionals useCuda [ "CUDA" ])
    (optionals useMetalKit [ "MetalKit" ])
    (optionals useMpi [ "MPI" ])
    (optionals useOpenCL [ "OpenCL" ])
    (optionals useRocm [ "ROCm" ])
    (optionals useVulkan [ "Vulkan" ])
  ];
  hasSuffixes = suffixes != [ ];
  # e.g. "-blas" or "-cuda-mpi"; empty when nothing is enabled.
  pnameSuffix =
    if hasSuffixes then "-${strings.concatMapStringsSep "-" strings.toLower suffixes}" else "";
  # e.g. ", accelerated with CUDA, MPI"; empty when nothing is enabled.
  descriptionSuffix =
    if hasSuffixes then ", accelerated with ${strings.concatStringsSep ", " suffixes}" else "";
  # TODO: package the Python in this repository in a Nix-like way.
  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
  # https://peps.python.org/pep-0517/
  llama-python = python3.withPackages (
    ps: [
      ps.numpy
      ps.sentencepiece
    ]
  );
  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
  llama-python-extra = python3.withPackages (
    ps: [
      ps.numpy
      ps.sentencepiece
      ps.tiktoken
      ps.torchWithoutCuda
      ps.transformers
    ]
  );
  # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
  # separately
  darwinBuildInputs =
    with darwin.apple_sdk.frameworks;
    [
      Accelerate
      CoreVideo
      CoreGraphics
    ]
    ++ optionals useMetalKit [ MetalKit ];
  cudaBuildInputs = with cudaPackages; [
    cuda_cccl.dev # <nv/target>
    # A temporary hack for reducing the closure size, remove once cudaPackages
    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
    cuda_cudart.dev
    cuda_cudart.lib
    cuda_cudart.static
    libcublas.dev
    libcublas.lib
    libcublas.static
  ];
  rocmBuildInputs = with rocmPackages; [
    clr
    hipblas
    rocblas
  ];
  vulkanBuildInputs = [
    vulkan-headers
    vulkan-loader
  ];
 in
 effectiveStdenv.mkDerivation (
  finalAttrs: {
    pname = "llama-cpp${pnameSuffix}";
    version = llamaVersion;
    # Note: none of the files discarded here are visible in the sandbox or
    # affect the output hash. This also means they can be modified without
    # triggering a rebuild.
    src = lib.cleanSourceWith {
      # Keep a path only when none of the exclusion predicates below match it.
      filter =
        name: type:
        let
          noneOf = builtins.all (x: !x);
          baseName = baseNameOf name;
        in
        noneOf [
          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
          (lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
          (lib.hasPrefix "." baseName) # Skip hidden files and directories
          (baseName == "flake.lock") # Ignore lock-file changes as well
        ];
      # The repository root, two levels above this file, pre-filtered by cleanSource.
      src = lib.cleanSource ../../.;
    };
    postPatch = ''
      substituteInPlace ./ggml-metal.m \
        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
      # TODO: Package up each Python script or service appropriately.
      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
      # we could make those *.py into setuptools' entrypoints
      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
    '';
    nativeBuildInputs =
      [
        cmake
        ninja
        pkg-config
        git
      ]
      ++ optionals useCuda [
        cudaPackages.cuda_nvcc
        # TODO: Replace with autoAddDriverRunpath
        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
        cudaPackages.autoAddOpenGLRunpathHook
      ]
      ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
        glibc.static
      ];
    buildInputs =
      optionals effectiveStdenv.isDarwin darwinBuildInputs
      ++ optionals useCuda cudaBuildInputs
      ++ optionals useMpi [ mpi ]
      ++ optionals useOpenCL [ clblast ]
      ++ optionals useRocm rocmBuildInputs
      ++ optionals useVulkan vulkanBuildInputs;
    cmakeFlags =
      [
        (cmakeBool "LLAMA_NATIVE" false)
        (cmakeBool "LLAMA_BUILD_SERVER" true)
        (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
        (cmakeBool "LLAMA_BLAS" useBlas)
        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
        (cmakeBool "LLAMA_CUBLAS" useCuda)
        (cmakeBool "LLAMA_HIPBLAS" useRocm)
        (cmakeBool "LLAMA_METAL" useMetalKit)
        (cmakeBool "LLAMA_MPI" useMpi)
        (cmakeBool "LLAMA_VULKAN" useVulkan)
        (cmakeBool "LLAMA_STATIC" enableStatic)
      ]
      ++ optionals useCuda [
        (
          with cudaPackages.flags;
          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
          )
        )
      ]
      ++ optionals useRocm [
        (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
        (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
        # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
        # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
        # and select the line that matches the current nixpkgs version of rocBLAS.
        # Should likely use `rocmPackages.clr.gpuTargets`.
        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
      ]
      ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
      ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
    # if they haven't been added yet.
    postInstall = ''
      mv $out/bin/main $out/bin/llama
      mv $out/bin/server $out/bin/llama-server
      mkdir -p $out/include
      cp $src/llama.h $out/include/
    '';
    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
    passthru = {
      inherit
        useBlas
        useCuda
        useMetalKit
        useMpi
        useOpenCL
        useRocm
        useVulkan
        ;
      shell = mkShell {
        name = "shell-${finalAttrs.finalPackage.name}";
        description = "contains numpy and sentencepiece";
        buildInputs = [ llama-python ];
        inputsFrom = [ finalAttrs.finalPackage ];
        shellHook = ''
          addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
        '';
      };
      # Dev shell with the heavier Python environment (`llama-python-extra`).
      # The description lists every package in that environment, including tiktoken.
      shell-extra = mkShell {
        name = "shell-extra-${finalAttrs.finalPackage.name}";
        description = "contains numpy, sentencepiece, tiktoken, torchWithoutCuda, and transformers";
        buildInputs = [ llama-python-extra ];
        inputsFrom = [ finalAttrs.finalPackage ];
      };
    };
    meta = {
      # Configurations we don't want even the CI to evaluate. Results in the
      # "unsupported platform" messages. This is mostly a no-op, because
      # cudaPackages would've refused to evaluate anyway.
      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
      # Configurations that are known to result in build failures. Can be
      # overridden by importing Nixpkgs with `allowBroken = true`.
      broken = (useMetalKit && !effectiveStdenv.isDarwin);
      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
      homepage = "https://github.com/ggerganov/llama.cpp/";
      license = lib.licenses.mit;
      # Accommodates `nix run` and `lib.getExe`
      mainProgram = "llama";
      # These people might respond, on the best effort basis, if you ping them
      # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
      # Consider adding yourself to this list if you want to ensure this flake
      # stays maintained and you're willing to invest your time. Do not add
      # other people without their consent. Consider removing people after
      # they've been unreachable for long periods of time.
      # Note that lib.maintainers is defined in Nixpkgs, but you may just add
      # an attrset following the same format as in
      # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
      maintainers = with lib.maintainers; [
        philiptaron
        SomeoneSerge
      ];
      # Extend `badPlatforms` instead
      platforms = lib.platforms.all;
    };
  }
 )
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@ -0,0 +1,19 @@
 {
  lib,
  newScope,
  # Placeholder version; forwarded into the scope so package.nix picks it up.
  llamaVersion ? "0.0.0",
 }:
 # We're using `makeScope` instead of just writing out an attrset
 # because it allows users to apply overlays later using `overrideScope'`.
 # Cf. https://noogle.dev/f/lib/makeScope
 lib.makeScope newScope (
  self: {
    inherit llamaVersion;
    # The main package; the image builders below receive it via `self.callPackage`.
    llama-cpp = self.callPackage ./package.nix { };
    # Docker image with docker.nix defaults (interactive = true).
    docker = self.callPackage ./docker.nix { };
    # Docker image without the interactive extras.
    docker-min = self.callPackage ./docker.nix { interactive = false; };
    # Singularity image.
    sif = self.callPackage ./sif.nix { };
  }
 )
--- a/.devops/nix/sif.nix
+++ b/.devops/nix/sif.nix
@ -0,0 +1,27 @@
 # Builds a Singularity image containing llama-cpp.
 {
  lib,
  singularity-tools,
  llama-cpp,
  bashInteractive,
  # When true, bashInteractive is added to the image contents.
  interactive ? false,
 }:
 let
  # `x` when `cond` holds, otherwise 0.
  optionalInt = cond: x: if cond then x else 0;
 in
 singularity-tools.buildImage rec {
  inherit (llama-cpp) name;
  contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
  # These are excessive (but safe) for most variants. Building singularity
  # images requires superuser privileges, so we build them inside a VM in a
  # writable image of pre-determined size.
  #
  # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
  #
  # Expected image sizes:
  # - cpu/blas: 150M,
  # - cuda, all gencodes: 560M,
  # Base size of 4096, plus 16384 more for ROCm builds (presumably MiB — TODO confirm unit).
  diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
  # The build VM gets as much memory as disk.
  memSize = diskSize;
 }
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@ -0,0 +1,32 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG CUDA_VERSION=11.7.1
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image
 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
 # Build stage: compile with the CUDA devel image.
 FROM ${BASE_CUDA_DEV_CONTAINER} as build
 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
 RUN apt-get update && \
    apt-get install -y build-essential git
 WORKDIR /app
 COPY . .
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable cuBLAS
 ENV LLAMA_CUBLAS=1
 RUN make
 # Runtime stage: only the `server` binary is carried over into the CUDA runtime image.
 # NOTE(review): unlike the CPU/Intel/Vulkan server images, LC_ALL is not set here — confirm intentional.
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 COPY --from=build /app/server /server
 ENTRYPOINT [ "/server" ]
--- a/.devops/server-intel.Dockerfile
+++ b/.devops/server-intel.Dockerfile
@ -0,0 +1,28 @@
 # oneAPI base-kit image tag; declared before the first FROM so both stages can use it.
 ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
 # Build stage: configure and compile the `server` target with SYCL enabled.
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
 # Set to ON (e.g. --build-arg LLAMA_SYCL_F16=ON) to add -DLLAMA_SYCL_F16=ON to the CMake configure step.
 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
    apt-get install -y git
 WORKDIR /app
 COPY . .
 # OPT_SYCL_F16 is only exported when LLAMA_SYCL_F16 is exactly "ON"; otherwise it expands to nothing.
 RUN mkdir build && \
    cd build && \
    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
        echo "LLAMA_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
    fi && \
    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
    cmake --build . --config Release --target server
 # Runtime stage: same base image, but only the compiled binary is copied in.
 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
 COPY --from=build /app/build/bin/server /server
 ENV LC_ALL=C.utf8
 ENTRYPOINT [ "/server" ]
--- a/.devops/server-rocm.Dockerfile
+++ b/.devops/server-rocm.Dockerfile
@ -0,0 +1,45 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
 ARG ROCM_VERSION=5.6
 # Target the ROCm build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
 # Single-stage image: the build tree in /app is also where the server runs from.
 FROM ${BASE_ROCM_DEV_CONTAINER} as build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102
 # NOTE(review): Python requirements are installed although the entrypoint is the
 # compiled server binary — confirm they are needed in this image.
 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
 WORKDIR /app
 COPY . .
 # Set the AMD GPU architectures to build for
 ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 # Enable ROCm
 ENV LLAMA_HIPBLAS=1
 # Build with the clang toolchain shipped under /opt/rocm
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
 RUN make
 ENTRYPOINT [ "/app/server" ]
--- a/.devops/server-vulkan.Dockerfile
+++ b/.devops/server-vulkan.Dockerfile
@ -0,0 +1,29 @@
 ARG UBUNTU_VERSION=jammy
 FROM ubuntu:$UBUNTU_VERSION as build
 # Install build tools.
 # apt-get is used throughout: `apt` has no stable CLI and warns when scripted.
 RUN apt-get update && apt-get install -y git build-essential cmake wget
 # Install Vulkan SDK from the LunarG apt repository.
 # The signing key is written to trusted.gpg.d rather than piped into the
 # deprecated `apt-key add`, so a failed download stops the `&&` chain directly.
 # NOTE: the repository list is jammy-specific; overriding UBUNTU_VERSION with
 # another release will not select a matching LunarG repository.
 RUN wget -qO /etc/apt/trusted.gpg.d/lunarg.asc https://packages.lunarg.com/lunarg-signing-key-pub.asc && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt-get update -y && \
    apt-get install -y vulkan-sdk
 # Build it
 WORKDIR /app
 COPY . .
 RUN mkdir build && \
    cd build && \
    cmake .. -DLLAMA_VULKAN=1 && \
    cmake --build . --config Release --target server
 # Clean up: keep only the binary at /server and remove the source/build tree
 WORKDIR /
 RUN cp /app/build/bin/server /server && \
    rm -rf /app
 ENV LC_ALL=C.utf8
 ENTRYPOINT [ "/server" ]
--- a/.devops/server.Dockerfile
+++ b/.devops/server.Dockerfile
@ -0,0 +1,20 @@
 # Ubuntu release used for both the build and the runtime stage.
 ARG UBUNTU_VERSION=22.04
 # Build stage: compile the project with the default `make` target.
 FROM ubuntu:$UBUNTU_VERSION as build
 RUN apt-get update && \
    apt-get install -y build-essential git
 WORKDIR /app
 COPY . .
 RUN make
 # Runtime stage: a clean Ubuntu image that receives only the `server` binary.
 FROM ubuntu:$UBUNTU_VERSION as runtime
 COPY --from=build /app/server /server
 ENV LC_ALL=C.utf8
 ENTRYPOINT [ "/server" ]
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@ -13,6 +13,8 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
    ./main "$@"
 elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
    ./finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
@ -34,6 +36,8 @@ else
    echo "              ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
    echo "              See documentation for finetune for command-line parameters"
    echo "  --all-in-one (-a): Execute --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
    echo "  --server (-s): Run a model on the server"
--- a/.ecrc
+++ b/.ecrc
@ -1,4 +1,5 @@
 {
  "Exclude": ["^\\.gitmodules$"],
  "Disable": {
    "IndentSize": true
  }
--- a/.editorconfig
+++ b/.editorconfig
@ -15,8 +15,14 @@ indent_size = 4
 [Makefile]
 indent_style = tab
 [scripts/*.mk]
 indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
--- a/.flake8
+++ b/.flake8
@ -1,2 +1,3 @@
 [flake8]
 max-line-length = 125
 ignore = W503
--- a/.github/ISSUE_TEMPLATE/bug.md
+++ b/.github/ISSUE_TEMPLATE/bug.md
@ -6,179 +6,6 @@ assignees: ''
 ---
-# Prerequisites
+Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
-Please answer the following questions for yourself before submitting an issue.
+If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
 - [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
 - [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
 - [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
 - [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
 # Expected Behavior
 Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.
 # Current Behavior
 Please provide a detailed written description of what `llama.cpp` did, instead.
 # Environment and Context
 Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
 * Physical (or virtual) hardware you are using, e.g. for Linux:
 `$ lscpu`
 * Operating System, e.g. for Linux:
 `$ uname -a`
 * SDK version, e.g. for Linux:
 ```
 $ python3 --version
 $ make --version
 $ g++ --version
 ```
 # Failure Information (for bugs)
 Please help provide information about the failure / bug.
 # Steps to Reproduce
 Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
 1. step 1
 2. step 2
 3. step 3
 4. etc.
 # Failure Logs
 Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
 Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability.
 Example environment info:
 ```
 llama.cpp$ git log | head -1
 commit 2af23d30434a677c6416812eea52ccc0af65119c
 llama.cpp$ lscpu | egrep "AMD|Flags"
 Vendor ID:                       AuthenticAMD
 Model name:                      AMD Ryzen Threadripper 1950X 16-Core Processor
 Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev
 Virtualization:                  AMD-V
 llama.cpp$ python3 --version
 Python 3.10.9
 llama.cpp$ pip list | egrep "torch|numpy|sentencepiece"
 numpy                         1.24.2
 numpydoc                      1.5.0
 sentencepiece                 0.1.97
 torch                         1.13.1
 torchvision                   0.14.1
 llama.cpp$ make --version | head -1
 GNU Make 4.3
 $ md5sum ./models/65B/ggml-model-q4_0.bin
 dbdd682cce80e2d6e93cefc7449df487  ./models/65B/ggml-model-q4_0.bin
 ```
 Example run with the Linux command [perf](https://www.brendangregg.com/perf.html)
 ```
 llama.cpp$ perf stat ./main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p "Please close your issue when it has been answered."
 main: seed = 1679149377
 llama_model_load: loading model from './models/65B/ggml-model-q4_0.bin' - please wait ...
 llama_model_load: n_vocab = 32000
 llama_model_load: n_ctx   = 512
 llama_model_load: n_embd  = 8192
 llama_model_load: n_mult  = 256
 llama_model_load: n_head  = 64
 llama_model_load: n_layer = 80
 llama_model_load: n_rot   = 128
 llama_model_load: f16     = 2
 llama_model_load: n_ff    = 22016
 llama_model_load: n_parts = 8
 llama_model_load: ggml ctx size = 41477.73 MB
 llama_model_load: memory_size =  2560.00 MB, n_mem = 40960
 llama_model_load: loading model part 1/8 from './models/65B/ggml-model-q4_0.bin'
 llama_model_load: .......................................................................................... done
 llama_model_load: model size =  4869.09 MB / num tensors = 723
 llama_model_load: loading model part 2/8 from './models/65B/ggml-model-q4_0.bin.1'
 llama_model_load: .......................................................................................... done
 llama_model_load: model size =  4869.09 MB / num tensors = 723
 llama_model_load: loading model part 3/8 from './models/65B/ggml-model-q4_0.bin.2'
 llama_model_load: .......................................................................................... done
 llama_model_load: model size =  4869.09 MB / num tensors = 723
 llama_model_load: loading model part 4/8 from './models/65B/ggml-model-q4_0.bin.3'
 llama_model_load: .......................................................................................... done
 llama_model_load: model size =  4869.09 MB / num tensors = 723
 llama_model_load: loading model part 5/8 from './models/65B/ggml-model-q4_0.bin.4'
 llama_model_load: .......................................................................................... done
 llama_model_load: model size =  4869.09 MB / num tensors = 723
 llama_model_load: loading model part 6/8 from './models/65B/ggml-model-q4_0.bin.5'
 llama_model_load: .......................................................................................... done
 llama_model_load: model size =  4869.09 MB / num tensors = 723
 llama_model_load: loading model part 7/8 from './models/65B/ggml-model-q4_0.bin.6'
 llama_model_load: .......................................................................................... done
 llama_model_load: model size =  4869.09 MB / num tensors = 723
 llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.7'
 llama_model_load: .......................................................................................... done
 llama_model_load: model size =  4869.09 MB / num tensors = 723
 system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
 main: prompt: 'Please close your issue when it has been answered.'
 main: number of tokens in prompt = 11
     1 -> ''
 12148 -> 'Please'
  3802 -> ' close'
   596 -> ' your'
  2228 -> ' issue'
   746 -> ' when'
   372 -> ' it'
   756 -> ' has'
  1063 -> ' been'
  7699 -> ' answered'
 29889 -> '.'
 sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
 Please close your issue when it has been answered.
@duncan-donut: I'm trying to figure out what kind of "support" you need for this script and why, exactly? Is there a question about how the code works that hasn't already been addressed in one or more comments below this ticket, or are we talking something else entirely like some sorta bugfixing job because your server setup is different from mine??
 I can understand if your site needs to be running smoothly and you need help with a fix of sorts but there should really be nothing wrong here that the code itself could not handle. And given that I'm getting reports about how it works perfectly well on some other servers, what exactly are we talking? A detailed report will do wonders in helping us get this resolved for ya quickly so please take your time and describe the issue(s) you see as clearly & concisely as possible!!
@duncan-donut: I'm not sure if you have access to cPanel but you could try these instructions. It is worth a shot! Let me know how it goes (or what error message, exactly!) when/if ya give that code a go? [end of text]
 main: mem per token = 71159620 bytes
 main:     load time = 19309.95 ms
 main:   sample time =   168.62 ms
 main:  predict time = 223895.61 ms / 888.47 ms per token
 main:    total time = 246406.42 ms
 Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
        3636882.89 msec task-clock                #   14.677 CPUs utilized
             13509      context-switches          #    3.714 /sec
              2436      cpu-migrations            #    0.670 /sec
          10476679      page-faults               #    2.881 K/sec
    13133115082869      cycles                    #    3.611 GHz                      (16.77%)
       29314462753      stalled-cycles-frontend   #    0.22% frontend cycles idle     (16.76%)
    10294402631459      stalled-cycles-backend    #   78.39% backend cycles idle      (16.74%)
    23479217109614      instructions              #    1.79  insn per cycle
                                                  #    0.44  stalled cycles per insn  (16.76%)
     2353072268027      branches                  #  647.002 M/sec                    (16.77%)
        1998682780      branch-misses             #    0.08% of all branches          (16.76%)
     247.802177522 seconds time elapsed
    3618.573072000 seconds user
      18.491698000 seconds sys
 ```
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -37,6 +37,8 @@ jobs:
      - name: Build
        id: make_build
        env:
            LLAMA_FATAL_WARNINGS: 1
        run: |
          CC=gcc-8 make -j $(nproc)
@ -65,14 +67,14 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake ..
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON
          cmake --build . --config Release -j $(nproc)
      - name: Test
        id: cmake_test
        run: |
          cd build
-          ctest --verbose --timeout 900
+          ctest -L main --verbose --timeout 900
  ubuntu-latest-cmake-sanitizer:
    runs-on: ubuntu-latest
@ -100,14 +102,14 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
      - name: Test
        id: cmake_test
        run: |
          cd build
-          ctest --verbose --timeout 900
+          ctest -L main --verbose --timeout 900
  ubuntu-latest-cmake-mpi:
    runs-on: ubuntu-latest
@ -141,8 +143,115 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest --verbose
+          ctest -L main --verbose
  # Build-only job for the Vulkan backend: installs the Vulkan development
  # package, configures with LLAMA_VULKAN=ON and compiles. No test step is defined.
  ubuntu-22-cmake-vulkan:
    runs-on: ubuntu-22.04
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
      # Toolchain plus the Vulkan headers/loader needed at configure time.
      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libvulkan-dev
      # Out-of-tree Release build using all available cores.
      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake -DLLAMA_VULKAN=ON ..
          cmake --build . --config Release -j $(nproc)
  # Build-only job for the SYCL backend using the Intel oneAPI compilers (icx/icpx).
  # continue-on-error is set, so a failure here does not fail the workflow run.
  ubuntu-22-cmake-sycl:
    runs-on: ubuntu-22.04
    continue-on-error: true
    steps:
      # The repository is checked out once, by the "Clone" step below; the apt
      # steps that precede it work in /tmp and do not need the sources.
      # Register Intel's signing key and apt repository for the oneAPI packages.
      - name: add oneAPI to apt
        shell: bash
        run: |
          cd /tmp
          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
      # -y keeps the install non-interactive regardless of the runner's apt defaults.
      - name: install oneAPI dpcpp compiler
        shell: bash
        run: |
          sudo apt update
          sudo apt install -y intel-oneapi-compiler-dpcpp-cpp
      - name: install oneAPI MKL library
        shell: bash
        run: |
          sudo apt install -y intel-oneapi-mkl-devel
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
      # setvars.sh puts icx/icpx and the oneAPI libraries on the environment
      # for the cmake invocations that follow in the same shell.
      - name: Build
        id: cmake_build
        run: |
          source /opt/intel/oneapi/setvars.sh
          mkdir build
          cd build
          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
          cmake --build . --config Release -j $(nproc)
  # Same as the SYCL build job, but configured with LLAMA_SYCL_F16=ON.
  # continue-on-error is set, so a failure here does not fail the workflow run.
  ubuntu-22-cmake-sycl-fp16:
    runs-on: ubuntu-22.04
    continue-on-error: true
    steps:
      # The repository is checked out once, by the "Clone" step below; the apt
      # steps that precede it work in /tmp and do not need the sources.
      # Register Intel's signing key and apt repository for the oneAPI packages.
      - name: add oneAPI to apt
        shell: bash
        run: |
          cd /tmp
          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
      # -y keeps the install non-interactive regardless of the runner's apt defaults.
      - name: install oneAPI dpcpp compiler
        shell: bash
        run: |
          sudo apt update
          sudo apt install -y intel-oneapi-compiler-dpcpp-cpp
      - name: install oneAPI MKL library
        shell: bash
        run: |
          sudo apt install -y intel-oneapi-mkl-devel
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
      # setvars.sh puts icx/icpx and the oneAPI libraries on the environment
      # for the cmake invocations that follow in the same shell.
      - name: Build
        id: cmake_build
        run: |
          source /opt/intel/oneapi/setvars.sh
          mkdir build
          cd build
          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
          cmake --build . --config Release -j $(nproc)
  # TODO: build with LLAMA_NO_METAL because test-backend-ops fails on "Apple Paravirtual device" and I don't know
  #       how to debug it.
  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
  macOS-latest-make:
    runs-on: macos-latest
@ -159,15 +268,21 @@ jobs:
      - name: Build
        id: make_build
        env:
            LLAMA_FATAL_WARNINGS: 1
        run: |
-          make -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
      - name: Test
        id: make_test
        run: |
-          make tests -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
-          make test -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)
  # TODO: build with LLAMA_METAL=OFF because test-backend-ops fails on "Apple Paravirtual device" and I don't know
  #       how to debug it.
  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
  #       It would be great to fix these.
  macOS-latest-cmake:
    runs-on: macos-latest
@ -188,14 +303,14 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
      - name: Test
        id: cmake_test
        run: |
          cd build
-          ctest --verbose --timeout 900
+          ctest -L main --verbose --timeout 900
  macOS-latest-cmake-ios:
    runs-on: macos-latest
@ -288,7 +403,8 @@ jobs:
      OPENBLAS_VERSION: 0.3.23
      OPENCL_VERSION: 2023.04.17
      CLBLAST_VERSION: 1.6.0
-      SDE_VERSION: 9.21.1-2023-04-24
+      SDE_VERSION: 9.33.0-2024-01-07
      VULKAN_VERSION: 1.3.261.1
    strategy:
      matrix:
@ -305,6 +421,10 @@ jobs:
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
          - build: 'openblas'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'vulkan'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
    steps:
      - name: Clone
@ -313,6 +433,12 @@ jobs:
        with:
          fetch-depth: 0
      - name: Clone Kompute submodule
        id: clone_kompute
        if: ${{ matrix.build == 'kompute' }}
        run: |
          git submodule update --init kompute
      - name: Download OpenCL SDK
        id: get_opencl
        if: ${{ matrix.build == 'clblast' }}
@ -347,6 +473,15 @@ jobs:
          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
      - name: Install Vulkan SDK
        id: get_vulkan
        if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
      - name: Build
        id: cmake_build
        run: |
@ -384,22 +519,23 @@ jobs:
      - name: Test
        id: cmake_test
-        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
+        # not all machines have native AVX-512
        if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
-          ctest -C Release --verbose --timeout 900
+          ctest -L main -C Release --verbose --timeout 900
      - name: Test (Intel SDE)
        id: cmake_test_sde
        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
        run: |
-          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
+          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
          # for some weird reason windows tar doesn't like sde tar.xz
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
          $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
          cd build
-          & $sde -future -- ctest -C Release --verbose --timeout 900
+          & $sde -future -- ctest -L main -C Release --verbose --timeout 900
      - name: Determine tag name
        id: tag
@ -498,6 +634,65 @@ jobs:
          path: |
            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
  # Windows build of the SYCL backend. Every `run` step in this job uses bash
  # (see `defaults`), and the actual work is delegated to two batch scripts.
  windows-latest-cmake-sycl:
    runs-on: windows-latest
    defaults:
      run:
        shell: bash
    env:
      # Both values are passed as arguments to scripts/install-oneapi.bat;
      # presumably the installer URL and the components to install (confirm in that script).
      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
        with:
          # 0 requests the full history rather than a shallow clone.
          fetch-depth: 0
      - name: Install
        run:  scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
      - name: Build
        id: cmake_build
        run:  examples/sycl/win-build-sycl.bat
  # Compiles the llama.swiftui example project for a generic iOS device.
  # Code signing is disabled on the command line, so no signing identity is required.
  ios-xcode-build:
    runs-on: macos-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Build Xcode project
        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
  # Builds the examples/llama.android Gradle project with JDK 17 (Zulu) and the Android SDK.
  android-build:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v3
      - name: Set up JDK
        uses: actions/setup-java@v3
        with:
          java-version: 17
          distribution: zulu
      - name: Setup Android SDK
        uses: android-actions/setup-android@v3
        with:
          # Do not print the accepted SDK licenses in the job log.
          log-accepted-android-sdk-licenses: false
      # --no-daemon: run Gradle in the foreground process for this one-shot build.
      - name: Build
        run: |
          cd examples/llama.android
          ./gradlew build --no-daemon
 #  freeBSD-latest:
 #    runs-on: macos-12
 #    steps:
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -27,15 +27,20 @@ jobs:
    strategy:
      matrix:
        config:
-          # - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          #- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          # - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          #- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          #- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
          #                     have disabled them for now until the reason why
          #                     is understood.
          - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
-          # - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          #- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          # - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
-          # - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          #- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          #- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          #- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
          #- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v3
@ -53,6 +58,36 @@ jobs:
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
      - name: Free Disk Space (Ubuntu)
        uses: jlumbroso/free-disk-space@main
        with:
          # this might remove tools that are actually needed,
          # if set to "true" but frees about 6 GB
          tool-cache: false
          # all of these default to true, but feel free to set to
          # "false" if necessary for your workflow
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: true
          swap-storage: true
      - name: Determine tag name
        id: tag
        shell: bash
        run: |
          BUILD_NUMBER="$(git rev-list --count HEAD)"
          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
          else
            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi
      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@ -1,6 +1,12 @@
 name: EditorConfig Checker
 on:
  workflow_dispatch: # allows manual triggering
    inputs:
      create_release:
        description: 'Create new release'
        required: true
        type: boolean
  push:
    branches:
      - master
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@ -0,0 +1,61 @@
 name: Nix aarch64 builds
 on:
  workflow_dispatch: # allows manual triggering
  schedule:
    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
    # 1.5h instead of minutes with the cold cache).
    #
    # randint(0, 59), randint(0, 23)
    - cron: '26 12 * * *'
  # But also rebuild if we touched any of the Nix expressions:
  push:
    branches:
      - master
    paths: ['**/*.nix', 'flake.lock']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['**/*.nix', 'flake.lock']
 jobs:
  # Evaluates and builds the flake's aarch64-linux outputs on ubuntu-latest, with
  # QEMU installed and aarch64-linux registered as an extra Nix platform.
  # Cachix is configured with the cache name `llama-cpp` to push the results to.
  nix-build-aarch64:
    runs-on: ubuntu-latest
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Install QEMU
      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
      run: |
        sudo apt-get update
        sudo apt-get install -y qemu-user-static qemu-system-aarch64
        sudo usermod -a -G kvm $USER
    - name: Install Nix
      uses: DeterminateSystems/nix-installer-action@v9
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
        extra-conf: |
          extra-platforms = aarch64-linux
          extra-system-features = nixos-test kvm
          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
    - uses: DeterminateSystems/magic-nix-cache-action@v2
      with:
        # This job defines no strategy matrix, so `matrix.cachixName` would expand
        # to an empty string and produce the invalid URL "https://.cachix.org".
        # Name the cache explicitly; it matches the cachix cache configured below.
        upstream-cache: https://llama-cpp.cachix.org
    - name: Set-up cachix to push the results to
      uses: cachix/cachix-action@v13
      with:
        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
        name: llama-cpp
    # Evaluate every aarch64-linux package and print its output paths.
    - name: Show all output paths
      run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.aarch64-linux"
    # Build the aarch64-linux checks, skipping anything already available from a cache.
    - name: Build
      run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --systems aarch64-linux
          --flake
          ".#checks.aarch64-linux"
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@ -0,0 +1,68 @@
 name: Nix CI
 on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
  pull_request:
    types: [opened, synchronize, reopened]
 jobs:
  # Evaluation-only job: lists the flake outputs and evaluates the packages for
  # the runner's own system on each OS in the matrix. Nothing is built here.
  nix-eval:
    strategy:
      # Let the other OS finish even if one of them fails.
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Install Nix
      uses: DeterminateSystems/nix-installer-action@v9
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
        extra-conf: |
          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
    - uses: DeterminateSystems/magic-nix-cache-action@v2
      with:
        # The matrix only defines `os`, so `matrix.cachixName` would expand to an
        # empty string and produce the invalid URL "https://.cachix.org".
        # Name the cache explicitly; it matches the substituter configured above.
        upstream-cache: https://llama-cpp.cachix.org
    - name: List all flake outputs
      run: nix flake show --all-systems
    # builtins.currentSystem resolves to the runner's platform, so each matrix
    # entry evaluates the package set for its own system.
    - name: Show all output paths
      run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
  # Builds the flake checks for the runner's own system on each OS in the matrix.
  # Cachix is configured with the cache name `llama-cpp` to push the results to.
  nix-build:
    strategy:
      # Let the other OS finish even if one of them fails.
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4
    - name: Install Nix
      uses: DeterminateSystems/nix-installer-action@v9
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
        extra-conf: |
          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
    - uses: DeterminateSystems/magic-nix-cache-action@v2
      with:
        # The matrix only defines `os`, so `matrix.cachixName` would expand to an
        # empty string and produce the invalid URL "https://.cachix.org".
        # Name the cache explicitly; it matches the cachix cache configured below.
        upstream-cache: https://llama-cpp.cachix.org
    - name: Set-up cachix to push the results to
      uses: cachix/cachix-action@v13
      with:
        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
        name: llama-cpp
    # Build the checks for the current system, skipping anything already cached.
    - name: Build
      run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --flake
          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
--- a/.github/workflows/nix-flake-update.yml
+++ b/.github/workflows/nix-flake-update.yml
@ -0,0 +1,22 @@
 name: update-flake-lock
 on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
 jobs:
  # Runs the update-flake-lock action, which is configured below to open a pull
  # request titled "nix: update flake.lock", labelled `nix`, with two reviewers requested.
  # NOTE(review): both DeterminateSystems actions track `@main` rather than a
  # pinned release, so their behavior can change between runs.
  lockfile:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@main
      - name: Update flake.lock
        uses: DeterminateSystems/update-flake-lock@main
        with:
          pr-title: "nix: update flake.lock"
          pr-labels: |
            nix
          pr-reviewers: philiptaron,SomeoneSerge
          # A dedicated secret is used instead of the default GITHUB_TOKEN.
          token: ${{ secrets.FLAKE_TOKEN }}
--- a/.github/workflows/nix-publish-flake.yml
+++ b/.github/workflows/nix-publish-flake.yml
@ -0,0 +1,36 @@
 # Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
 name: "Publish a flake to flakestry & flakehub"
 on:
    push:
        tags:
        - "*"
    workflow_dispatch:
        inputs:
            tag:
                description: "The existing tag to publish"
                type: "string"
                required: true
 jobs:
    # Publishes the flake to flakestry. The version is the manually supplied
    # `tag` input when present, otherwise the name of the ref that triggered the run.
    flakestry-publish:
        runs-on: ubuntu-latest
        permissions:
            # id-token: write lets the job request an OIDC token.
            id-token: "write"
            contents: "read"
        steps:
            - uses: flakestry/flakestry-publish@main
              with:
                version: "${{ inputs.tag || github.ref_name }}"
    # Publishes the flake to FlakeHub with public visibility.
    flakehub-publish:
      runs-on: "ubuntu-latest"
      permissions:
        # id-token: write lets the job request an OIDC token.
        id-token: "write"
        contents: "read"
      steps:
        - uses: "actions/checkout@v4"
          with:
            # Check out refs/tags/<tag> when a tag input was supplied; otherwise pass
            # an empty ref so the action falls back to its default.
            ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
        - uses: "DeterminateSystems/nix-installer-action@main"
        - uses: "DeterminateSystems/flakehub-push@main"
          with:
            visibility: "public"
            tag: "${{ inputs.tag }}"
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@ -0,0 +1,31 @@
 name: Python check requirements.txt
 on:
  push:
    paths:
      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
      - 'requirements.txt'
      - 'requirements/*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-check-requirements.yml'
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
      - 'requirements.txt'
      - 'requirements/*.txt'
 jobs:
  # Runs scripts/check-requirements.sh under Python 3.11.
  python-check-requirements:
    runs-on: ubuntu-latest
    name: check-requirements
    steps:
      - name: Check out source repository
        uses: actions/checkout@v3
      - name: Set up Python environment
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML keeps it a string rather than parsing it as a number.
          python-version: "3.11"
      - name: Run check-requirements.sh script
        run:  bash scripts/check-requirements.sh
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@ -16,5 +16,5 @@ jobs:
      - name: flake8 Lint
        uses: py-actions/flake8@v2
        with:
-            ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
+            ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
            exclude: "examples/*,examples/*/**,*/**/__init__.py"
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@ -0,0 +1,92 @@
 # Server build and tests
 name: Server
 on:
  workflow_dispatch: # allows manual triggering
    inputs:
      slow_tests:
        description: 'Run slow tests'
        required: true
        type: boolean
  push:
    branches:
      - master
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
  schedule:
    -  cron: '0 0 * * *'
 jobs:
  # Builds the `server` target inside an ubuntu:latest container and runs the
  # integration tests under examples/server/tests against port 8888.
  server:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # The exclude list removes every Release+sanitizer pair and the include entry
        # adds one Release build with an empty sanitizer, so the effective matrix is
        # Debug x {ADDRESS, THREAD, UNDEFINED} plus a single plain Release build.
        sanitizer: [ADDRESS, THREAD, UNDEFINED]
        build_type: [Debug, Release]
        include:
          - build_type: Release
            sanitizer: ""
        exclude:
          - build_type: Release
            sanitizer: ADDRESS
          - build_type: Release
            sanitizer: THREAD
          - build_type: Release
            sanitizer: UNDEFINED
    container:
      image: ubuntu:latest
      ports:
        - 8888
      options: --cpus 4
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
      # No sudo here: the steps run inside the container declared above.
      - name: Dependencies
        id: depends
        run: |
          apt-get update
          apt-get -y install \
            build-essential \
            git \
            cmake \
            python3-pip \
            wget \
            psmisc \
            language-pack-en
      # NOTE(review): for the Release entry the sanitizer is empty, so the last
      # cmake define expands to `-DLLAMA_SANITIZE_=ON`.
      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake .. \
              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
      - name: Tests dependencies
        id: test_dependencies
        run: |
          pip install -r examples/server/tests/requirements.txt
      - name: Tests
        id: server_integration_tests
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh
      # Runs for scheduled triggers on the Release build, or whenever the manual
      # `slow_tests` input is 'true' (on any matrix entry).
      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }}
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
--- a/.gitignore
+++ b/.gitignore
@ -23,11 +23,13 @@
 .clang-tidy
 .vs/
 .vscode/
 .idea/
 lcov-report/
 gcovr-report/
-build*/
+build*
 cmake-build-*
 out/
 tmp/
@ -43,12 +45,16 @@ models-mnt
 /embedding
 /gguf
 /gguf-llama-simple
 /imatrix
 /infill
 /libllama.so
 /llama-bench
 /llava-cli
 /lookahead
 /lookup
 /main
 /metal
 /passkey
 /perplexity
 /q8dot
 /quantize
@ -85,17 +91,4 @@ examples/jeopardy/results.txt
 poetry.lock
 poetry.toml
-
+nppBackup
 # Test binaries
 tests/test-grammar-parser
 tests/test-llama-grammar
 tests/test-double-float
 tests/test-grad0
 tests/test-opt
 tests/test-quantize-fns
 tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
 tests/test-tokenizer-1-llama
 tests/test-tokenizer-1-bpe
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
 [submodule "kompute"]
 	path = kompute
 	url = https://github.com/nomic-ai/kompute.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,5 +1,6 @@
-cmake_minimum_required(VERSION 3.13)  # for add_link_options
+cmake_minimum_required(VERSION 3.14)  # for add_link_options and implicit target directories.
 project("llama.cpp" C CXX)
 include(CheckIncludeFileCXX)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@ -43,15 +44,20 @@ else()
 endif()
 # general
 option(BUILD_SHARED_LIBS                "build shared libraries"                                OFF)
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
 option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
 option(LLAMA_CCACHE                     "llama: use ccache if available"                        ON)
 # debug
 option(LLAMA_ALL_WARNINGS               "llama: enable all compiler warnings"                   ON)
 option(LLAMA_ALL_WARNINGS_3RD_PARTY     "llama: enable all compiler warnings in 3rd party libs" OFF)
 option(LLAMA_GPROF                      "llama: enable gprof"                                   OFF)
 # build
 option(LLAMA_FATAL_WARNINGS             "llama: enable -Werror flag"                            OFF)
 # sanitizers
 option(LLAMA_SANITIZE_THREAD            "llama: enable thread sanitizer"                        OFF)
 option(LLAMA_SANITIZE_ADDRESS           "llama: enable address sanitizer"                       OFF)
@ -75,6 +81,10 @@ if (NOT MSVC)
    option(LLAMA_F16C                        "llama: enable F16C"                               ${INS_ENB})
 endif()
 if (WIN32)
    set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version")
 endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
 option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
@ -90,42 +100,71 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "llama: max. batch size for using peer access")
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_VULKAN                          "llama: use Vulkan"                                OFF)
 option(LLAMA_VULKAN_CHECK_RESULTS            "llama: run Vulkan op checks"                      OFF)
 option(LLAMA_VULKAN_DEBUG                    "llama: enable Vulkan debug output"                OFF)
 option(LLAMA_VULKAN_VALIDATE                 "llama: enable Vulkan validation"                  OFF)
 option(LLAMA_VULKAN_RUN_TESTS                "llama: run Vulkan tests"                          OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
 option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
 option(LLAMA_METAL_EMBED_LIBRARY             "llama: embed Metal library"                       OFF)
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
 option(LLAMA_CPU_HBM                         "llama: use memkind for CPU HBM"                   OFF)
-option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES             "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER               "llama: build server example"                           ON)
+option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)
 # add perf arguments
 option(LLAMA_PERF                            "llama: enable perf"                               OFF)
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 #
 # Compile flags
 #
-set(CMAKE_CXX_STANDARD 11)
+if (LLAMA_SYCL)
    set(CMAKE_CXX_STANDARD 17)
 else()
    set(CMAKE_CXX_STANDARD 11)
 endif()
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)
 # enable libstdc++ assertions for debug builds
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
 endif()
 if (NOT MSVC)
    if (LLAMA_SANITIZE_THREAD)
        add_compile_options(-fsanitize=thread)
-        link_libraries(-fsanitize=thread)
+        link_libraries     (-fsanitize=thread)
    endif()
    if (LLAMA_SANITIZE_ADDRESS)
        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
-        link_libraries(-fsanitize=address)
+        link_libraries     (-fsanitize=address)
    endif()
    if (LLAMA_SANITIZE_UNDEFINED)
        add_compile_options(-fsanitize=undefined)
-        link_libraries(-fsanitize=undefined)
+        link_libraries     (-fsanitize=undefined)
    endif()
 endif()
@ -144,9 +183,9 @@ if (APPLE AND LLAMA_ACCELERATE)
 endif()
 if (LLAMA_METAL)
-    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
+    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
-    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
    message(STATUS "Metal framework found")
    set(GGML_HEADERS_METAL ggml-metal.h)
@ -161,7 +200,59 @@ if (LLAMA_METAL)
    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
    # copy ggml-metal.metal to bin directory
-    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
+    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
    # Optionally embed the Metal shader source into the binary: an assembly
    # stub is generated at build time that pulls in ggml-metal.metal via
    # `.incbin`, bracketed by the _ggml_metallib_start/_ggml_metallib_end symbols.
    if (LLAMA_METAL_EMBED_LIBRARY)
        # the generated stub is an assembly source, so ASM must be enabled
        enable_language(ASM)
        add_compile_definitions(GGML_METAL_EMBED_LIBRARY)
        set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
        file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
        set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
        # The first echo truncates/creates the file (">"), the rest append (">>"),
        # so the order of the COMMAND lines defines the content of the stub.
        add_custom_command(
            OUTPUT ${EMBED_METALLIB_ASSEMBLY}
            COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
            COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
            COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
            COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
            COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
            COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
            DEPENDS ${METALLIB_SOURCE}
            COMMENT "Generate assembly for embedded Metal library"
        )
        # add the generated stub to the Metal source list
        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
    endif()
    if (LLAMA_METAL_SHADER_DEBUG)
        # custom command to do the following:
        #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
        #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
        #
        # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
        #       disabling fast math is needed in order to pass tests/test-backend-ops
        # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
        # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
        #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
        set(XC_FLAGS -fno-fast-math -fno-inline -g)
        if (LLAMA_QKK_64)
            set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
        endif()
        add_custom_command(
            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
            COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
            DEPENDS ggml-metal.metal
            COMMENT "Compiling Metal kernels"
        )
        add_custom_target(
            ggml-metal ALL
            DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
        )
    endif()
    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
        ${FOUNDATION_LIBRARY}
@ -190,7 +281,11 @@ if (LLAMA_BLAS)
            if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
                pkg_check_modules(DepBLAS REQUIRED blas)
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
-                pkg_check_modules(DepBLAS REQUIRED openblas)
+                # As of OpenBLAS v0.3.22, the 64-bit variant's pkg-config file is named openblas64.pc
                pkg_check_modules(DepBLAS openblas64)
                if (NOT DepBLAS_FOUND)
                    pkg_check_modules(DepBLAS REQUIRED openblas)
                endif()
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
                pkg_check_modules(DepBLAS REQUIRED blis)
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
@ -229,14 +324,17 @@ if (LLAMA_BLAS)
        endif()
        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
        add_compile_options(${BLAS_LINKER_FLAGS})
        add_compile_definitions(GGML_USE_OPENBLAS)
        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
            add_compile_definitions(GGML_BLAS_USE_MKL)
        endif()
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${BLAS_LIBRARIES})
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
    else()
        message(WARNING "BLAS not found, please refer to "
        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@ -261,9 +359,6 @@ if (LLAMA_CUBLAS)
        set(GGML_SOURCES_CUDA ggml-cuda.cu)
        add_compile_definitions(GGML_USE_CUBLAS)
 #        if (LLAMA_CUDA_CUBLAS)
 #            add_compile_definitions(GGML_CUDA_CUBLAS)
 #        endif()
        if (LLAMA_CUDA_FORCE_DMMV)
            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
        endif()
@ -282,11 +377,18 @@ if (LLAMA_CUBLAS)
        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
        if (LLAMA_STATIC)
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            if (WIN32)
                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
            else ()
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
            endif()
        else()
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        # 52 == lowest CUDA 12 standard
        # 60 == f16 CUDA intrinsics
@ -311,15 +413,20 @@ if (LLAMA_MPI)
    find_package(MPI)
    if (MPI_C_FOUND)
        message(STATUS "MPI found")
        set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
+        set(GGML_SOURCES_MPI ggml-mpi.c)
        add_compile_definitions(GGML_USE_MPI)
        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
        if (NOT MSVC)
            add_compile_options(-Wno-cast-qual)
        endif()
        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${MPI_C_LIBRARIES})
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
        # Even if you're only using the C header, C++ programs may bring in MPI
        # C++ functions, so more linkage is needed
        if (MPI_CXX_FOUND)
@ -346,100 +453,394 @@ if (LLAMA_CLBLAST)
    endif()
 endif()
 if (LLAMA_VULKAN)
    # The Vulkan backend is enabled only when the SDK can be located;
    # otherwise a warning is printed and nothing else changes.
    find_package(Vulkan)
    if (NOT Vulkan_FOUND)
        message(WARNING "Vulkan not found")
    else()
        message(STATUS "Vulkan found")

        set(GGML_HEADERS_VULKAN ggml-vulkan.h)
        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)

        add_compile_definitions(GGML_USE_VULKAN)

        # Every LLAMA_VULKAN_<X> option forwards to the GGML_VULKAN_<X> define.
        foreach(vk_toggle CHECK_RESULTS DEBUG VALIDATE RUN_TESTS)
            if (LLAMA_VULKAN_${vk_toggle})
                add_compile_definitions(GGML_VULKAN_${vk_toggle})
            endif()
        endforeach()

        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan)
    endif()
 endif()
 if (LLAMA_HIPBLAS)
    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
    endif()
    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
    endif()
-    find_package(hip)
+    find_package(hip     REQUIRED)
-    find_package(hipblas)
+    find_package(hipblas REQUIRED)
-    find_package(rocblas)
+    find_package(rocblas REQUIRED)
-    if (${hipblas_FOUND} AND ${hip_FOUND})
+    message(STATUS "HIP and hipBLAS found")
        message(STATUS "HIP and hipBLAS found")
        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
        if (BUILD_SHARED_LIBS)
            set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
        endif()
        if (LLAMA_CUDA_FORCE_DMMV)
            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
        endif()
        if (LLAMA_CUDA_FORCE_MMQ)
            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
        endif()
        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
        target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
        target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
-        if (LLAMA_STATIC)
+    set(GGML_HEADERS_ROCM ggml-cuda.h)
-            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
+    set(GGML_SOURCES_ROCM ggml-cuda.cu)
-        endif()
+
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
+    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
    if (LLAMA_HIP_UMA)
        add_compile_definitions(GGML_HIP_UMA)
    endif()
    if (LLAMA_CUDA_FORCE_DMMV)
        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
    endif()
    if (LLAMA_CUDA_FORCE_MMQ)
        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
    endif()
    add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
    set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
    if (LLAMA_STATIC)
        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
    endif()
    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
 endif()
 if (LLAMA_SYCL)
    if ( NOT DEFINED ENV{ONEAPI_ROOT})
        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
    endif()
    #todo: AOT
    find_package(IntelSYCL REQUIRED)
    message(STATUS "SYCL found")
    add_compile_definitions(GGML_USE_SYCL)
    if (LLAMA_SYCL_F16)
        add_compile_definitions(GGML_SYCL_F16)
    endif()
    add_compile_options(-I./) #include DPCT
    add_compile_options(-I/${SYCL_INCLUDE_DIR})
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
    set(GGML_HEADERS_SYCL ggml-sycl.h)
    set(GGML_SOURCES_SYCL ggml-sycl.cpp)
    if (WIN32)
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
    else()
-        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
    endif()
 endif()
 if (LLAMA_KOMPUTE)
    add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
    find_package(Vulkan COMPONENTS glslc REQUIRED)
    find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
    if (NOT glslc_executable)
        message(FATAL_ERROR "glslc not found")
    endif()
    # compile_shader(SOURCES <shader>...)
    #
    # For every listed shader source this registers two custom commands:
    #   1. glslc compiles <source> into <name>.spv
    #   2. the .spv is dumped with `xxd -i` into a header named
    #      shader<name>.h (".comp.spv" replaced by ".h"), wrapped in an include
    #      guard and the namespaces kp::shader_data.
    # Nothing is built by this function itself; the generated headers only get
    # produced when some target depends on them.
    function(compile_shader)
        set(options)
        set(oneValueArgs)
        set(multiValueArgs SOURCES)
        cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
        foreach(source ${compile_shader_SOURCES})
            get_filename_component(filename ${source} NAME)
            set(spv_file ${filename}.spv)
            # Step 1: GLSL -> SPIR-V. The shared .comp files are listed as
            # dependencies of every shader, so editing any of them re-runs glslc.
            add_custom_command(
                OUTPUT ${spv_file}
                DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
                ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
                COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
                COMMENT "Compiling ${source} to ${spv_file}"
                )

            # Derive the header name and its include-guard macro:
            #   op_add.comp.spv -> shaderop_add.h -> SHADEROP_ADD_H
            get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
            set(FILE_NAME "shader${RAW_FILE_NAME}")
            string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
            string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
            string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
            set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
            message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
            # Step 2: SPIR-V -> C header. The two branches differ only in where
            # the xxd executable is looked up: Visual Studio generators use a
            # per-configuration bin/$<CONFIG>/ directory, everything else bin/.
            # NOTE(review): `xxd` in DEPENDS is presumably a target defined
            # elsewhere in the project - not visible in this part of the file.
            if(CMAKE_GENERATOR MATCHES "Visual Studio")
                add_custom_command(
                    OUTPUT ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                    DEPENDS ${spv_file} xxd
                    COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
                    )
            else()
                add_custom_command(
                    OUTPUT ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
                    COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
                    DEPENDS ${spv_file} xxd
                    COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
                    )
            endif()
        endforeach()
    endfunction()
    if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
        message(STATUS "Kompute found")
        set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
        add_subdirectory(kompute)
        # Compile our shaders
        compile_shader(SOURCES
            kompute-shaders/op_scale.comp
            kompute-shaders/op_scale_8.comp
            kompute-shaders/op_add.comp
            kompute-shaders/op_addrow.comp
            kompute-shaders/op_mul.comp
            kompute-shaders/op_silu.comp
            kompute-shaders/op_relu.comp
            kompute-shaders/op_gelu.comp
            kompute-shaders/op_softmax.comp
            kompute-shaders/op_norm.comp
            kompute-shaders/op_rmsnorm.comp
            kompute-shaders/op_diagmask.comp
            kompute-shaders/op_mul_mat_mat_f32.comp
            kompute-shaders/op_mul_mat_f16.comp
            kompute-shaders/op_mul_mat_q8_0.comp
            kompute-shaders/op_mul_mat_q4_0.comp
            kompute-shaders/op_mul_mat_q4_1.comp
            kompute-shaders/op_mul_mat_q6_k.comp
            kompute-shaders/op_getrows_f16.comp
            kompute-shaders/op_getrows_q4_0.comp
            kompute-shaders/op_getrows_q4_1.comp
            kompute-shaders/op_getrows_q6_k.comp
            kompute-shaders/op_rope_f16.comp
            kompute-shaders/op_rope_f32.comp
            kompute-shaders/op_cpy_f16_f16.comp
            kompute-shaders/op_cpy_f16_f32.comp
            kompute-shaders/op_cpy_f32_f16.comp
            kompute-shaders/op_cpy_f32_f32.comp
        )
        # Create a custom target for our generated shaders
        add_custom_target(generated_shaders DEPENDS
            shaderop_scale.h
            shaderop_scale_8.h
            shaderop_add.h
            shaderop_addrow.h
            shaderop_mul.h
            shaderop_silu.h
            shaderop_relu.h
            shaderop_gelu.h
            shaderop_softmax.h
            shaderop_norm.h
            shaderop_rmsnorm.h
            shaderop_diagmask.h
            shaderop_mul_mat_mat_f32.h
            shaderop_mul_mat_f16.h
            shaderop_mul_mat_q8_0.h
            shaderop_mul_mat_q4_0.h
            shaderop_mul_mat_q4_1.h
            shaderop_mul_mat_q6_k.h
            shaderop_getrows_f16.h
            shaderop_getrows_q4_0.h
            shaderop_getrows_q4_1.h
            shaderop_getrows_q6_k.h
            shaderop_rope_f16.h
            shaderop_rope_f32.h
            shaderop_cpy_f16_f16.h
            shaderop_cpy_f16_f32.h
            shaderop_cpy_f32_f16.h
            shaderop_cpy_f32_f32.h
        )
        # Create a custom command that depends on the generated_shaders
        add_custom_command(
            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
            COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
            DEPENDS generated_shaders
            COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
        )
        # Add the stamp to the main sources to ensure dependency tracking
        set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
        set(GGML_HEADERS_KOMPUTE ggml-kompute.h   ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
        add_compile_definitions(GGML_USE_KOMPUTE)
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
    else()
        message(WARNING "Kompute not found")
    endif()
 endif()
 # Optional CPU high-bandwidth-memory support through the memkind library.
 if (LLAMA_CPU_HBM)
    # NOTE(review): the REQUIRED keyword of find_library needs CMake >= 3.18,
    #               while this file declares 3.14 as its minimum - confirm.
    find_library(memkind memkind REQUIRED)
    add_compile_definitions(GGML_USE_CPU_HBM)
    # NOTE(review): assumes the `ggml` target already exists at this point;
    #               its definition is not visible in this part of the file.
    target_link_libraries(ggml PUBLIC memkind)
 endif()
 # Optional GGML_PERF define for the whole build.
 if (LLAMA_PERF)
    add_compile_definitions(GGML_PERF)
 endif()
 # get_flags(<compiler id> <compiler version>)
 #
 # Computes the extra warning flags that only make sense for one compiler
 # family (and, for some flags, a minimum version of it). The result is handed
 # back to the caller's scope in two variables:
 #   GF_C_FLAGS   - extra flags for C sources
 #   GF_CXX_FLAGS - extra flags for C++ sources
 # Compilers other than GNU / Clang / AppleClang yield empty results.
 function(get_flags CCID CCVER)
    set(c_extra   "")
    set(cxx_extra "")

    if (CCID STREQUAL "GNU")
        list(APPEND c_extra   -Wdouble-promotion)
        list(APPEND cxx_extra -Wno-array-bounds)

        # flags below only exist in newer GCC releases
        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
            list(APPEND cxx_extra -Wno-format-truncation)
        endif()
        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
            list(APPEND cxx_extra -Wextra-semi)
        endif()
    elseif (CCID MATCHES "Clang")
        # covers both "Clang" and "AppleClang"
        list(APPEND c_extra   -Wunreachable-code-break -Wunreachable-code-return)
        list(APPEND cxx_extra -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)

        # -Wdouble-promotion is gated on a minimum version per Clang flavour
        if (
            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
        )
            list(APPEND c_extra -Wdouble-promotion)
        endif()
    endif()

    set(GF_C_FLAGS   ${c_extra}   PARENT_SCOPE)
    set(GF_CXX_FLAGS ${cxx_extra} PARENT_SCOPE)
 endfunction()
 if (LLAMA_FATAL_WARNINGS)
    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        list(APPEND C_FLAGS   -Werror)
        list(APPEND CXX_FLAGS -Werror)
    elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
        add_compile_options(/WX)
    endif()
 endif()
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
-        set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+        list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
+        list(APPEND C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-        set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+                                  -Werror=implicit-int -Werror=implicit-function-declaration)
-        set(host_cxx_flags "")
+        list(APPEND CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)
-        if (CMAKE_C_COMPILER_ID MATCHES "Clang")
+        list(APPEND C_FLAGS   ${WARNING_FLAGS})
-            set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
+        list(APPEND CXX_FLAGS ${WARNING_FLAGS})
            set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)
-            if (
+        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
                (CMAKE_C_COMPILER_ID STREQUAL "Clang"      AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
                (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0)
            )
                set(c_flags ${c_flags} -Wdouble-promotion)
            endif()
        elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
            set(c_flags ${c_flags} -Wdouble-promotion)
            set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)
-            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
+        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
-                set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
+                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
            endif()
            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
                set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
            endif()
        endif()
    else()
        # todo : msvc
        set(C_FLAGS   "")
        set(CXX_FLAGS "")
    endif()
 endif()
 set(CUDA_CXX_FLAGS "")
 if (LLAMA_CUBLAS)
    set(CUDA_FLAGS -use_fast_math)
    if (LLAMA_FATAL_WARNINGS)
        list(APPEND CUDA_FLAGS -Werror all-warnings)
    endif()
-    set(c_flags   ${c_flags}   ${warning_flags})
+    if (LLAMA_ALL_WARNINGS AND NOT MSVC)
-    set(cxx_flags ${cxx_flags} ${warning_flags})
+        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
-    add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
-                        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
+            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
-                        "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
+        endif()
        execute_process(
            COMMAND ${NVCC_CMD} -Xcompiler --version
            OUTPUT_VARIABLE CUDA_CCFULLVER
            ERROR_QUIET
        )
        if (NOT CUDA_CCFULLVER MATCHES clang)
            set(CUDA_CCID "GNU")
            execute_process(
                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
                OUTPUT_VARIABLE CUDA_CCVER
                ERROR_QUIET
            )
        else()
            if (CUDA_CCFULLVER MATCHES Apple)
                set(CUDA_CCID "AppleClang")
            else()
                set(CUDA_CCID "Clang")
            endif()
            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
        endif()
        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
        get_flags(${CUDA_CCID} ${CUDA_CCVER})
        list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
    endif()
    if (NOT MSVC)
        list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
    endif()
 endif()
 if (NOT MSVC)
    set(cuda_flags -Wno-pedantic)
 endif()
 set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
 list(JOIN host_cxx_flags " " cuda_host_flags)  # pass host compiler flags as a single argument
 if (NOT cuda_host_flags STREQUAL "")
    set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
 endif()
 add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
 if (WIN32)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
@ -458,11 +859,24 @@ if (LLAMA_LTO)
    endif()
 endif()
 # Route compilations through ccache when the tool is installed.
 if (LLAMA_CCACHE)
    find_program(LLAMA_CCACHE_FOUND ccache)
    if (NOT LLAMA_CCACHE_FOUND)
        message(STATUS "Warning: ccache not found - consider installing it for faster compilation or disable this warning with LLAMA_CCACHE=OFF")
    else()
        # prefix every compile rule with `ccache`
        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
        # NOTE(review): per the CMake docs set(ENV{...}) only affects the running
        #               CMake process - verify the setting reaches build-time ccache.
        set(ENV{CCACHE_SLOPPINESS} time_macros)
        message(STATUS "ccache found, compilation results will be cached. Disable with LLAMA_CCACHE=OFF.")
    endif()
 endif()
 # this version of Apple ld64 is buggy
 #
 # The C compiler driver is invoked with "-Wl,-v"; whatever it writes to
 # stderr is captured in `output` (stdout is discarded) and searched for the
 # affected linker version string.
 execute_process(
    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
    ERROR_VARIABLE output
    OUTPUT_QUIET
 )
 # The backslash must be doubled: inside a quoted CMake argument "\." collapses
 # to a bare ".", which the regex engine would treat as "any character".
 if (output MATCHES "dyld-1015\\.7")
    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
 endif()
@ -472,10 +886,10 @@ endif()
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
 if (MSVC)
-  string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
+    string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
-  message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
+    message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
 else ()
-  set(CMAKE_GENERATOR_PLATFORM_LWR "")
+    set(CMAKE_GENERATOR_PLATFORM_LWR "")
 endif ()
 if (NOT MSVC)
@ -490,33 +904,55 @@ if (NOT MSVC)
    endif()
 endif()
-if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
+set(ARCH_FLAGS "")
 if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
    (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
     CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
    message(STATUS "ARM detected")
    if (MSVC)
        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
        add_compile_definitions(__ARM_NEON)
        add_compile_definitions(__ARM_FEATURE_FMA)
-        add_compile_definitions(__ARM_FEATURE_DOTPROD)
+
-        # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
+        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
-        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
        if (GGML_COMPILER_SUPPORT_DOTPROD)
            add_compile_definitions(__ARM_FEATURE_DOTPROD)
        endif ()
        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
        endif ()
        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
    else()
        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
-            add_compile_options(-mfp16-format=ieee)
+            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
            # Raspberry Pi 1, Zero
-            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
+            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
-            # Raspberry Pi 2
+            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
-            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+                # Android armeabi-v7a
                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
            else()
                # Raspberry Pi 2
                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
            endif()
        endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
            # Android arm64-v8a
            # Raspberry Pi 3, 4, Zero 2 (32-bit)
-            add_compile_options(-mno-unaligned-access)
+            list(APPEND ARCH_FLAGS -mno-unaligned-access)
        endif()
    endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
+elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
    message(STATUS "x86 detected")
    if (MSVC)
        # instruction set detection for MSVC only
@ -524,8 +960,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
            include(cmake/FindSIMD.cmake)
        endif ()
        if (LLAMA_AVX512)
-            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
+            list(APPEND ARCH_FLAGS /arch:AVX512)
            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
            # MSVC has no compile-time flags enabling specific
            # AVX512 extensions, neither it defines the
            # macros corresponding to the extensions.
@ -539,51 +974,66 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
        elseif (LLAMA_AVX2)
-            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
+            list(APPEND ARCH_FLAGS /arch:AVX2)
            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
        elseif (LLAMA_AVX)
-            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
+            list(APPEND ARCH_FLAGS /arch:AVX)
            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
        endif()
    else()
        if (LLAMA_NATIVE)
-            add_compile_options(-march=native)
+            list(APPEND ARCH_FLAGS -march=native)
        endif()
        if (LLAMA_F16C)
-            add_compile_options(-mf16c)
+            list(APPEND ARCH_FLAGS -mf16c)
        endif()
        if (LLAMA_FMA)
-            add_compile_options(-mfma)
+            list(APPEND ARCH_FLAGS -mfma)
        endif()
        if (LLAMA_AVX)
-            add_compile_options(-mavx)
+            list(APPEND ARCH_FLAGS -mavx)
        endif()
        if (LLAMA_AVX2)
-            add_compile_options(-mavx2)
+            list(APPEND ARCH_FLAGS -mavx2)
        endif()
        if (LLAMA_AVX512)
-            add_compile_options(-mavx512f)
+            list(APPEND ARCH_FLAGS -mavx512f)
-            add_compile_options(-mavx512bw)
+            list(APPEND ARCH_FLAGS -mavx512bw)
        endif()
        if (LLAMA_AVX512_VBMI)
-            add_compile_options(-mavx512vbmi)
+            list(APPEND ARCH_FLAGS -mavx512vbmi)
        endif()
        if (LLAMA_AVX512_VNNI)
-            add_compile_options(-mavx512vnni)
+            list(APPEND ARCH_FLAGS -mavx512vnni)
        endif()
    endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
    message(STATUS "PowerPC detected")
    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-        add_compile_options(-mcpu=powerpc64le)
+        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
    else()
-        add_compile_options(-mcpu=native -mtune=native)
+        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
         #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10 (MMA), and query for big-endian systems (ppc64/le/be)
    endif()
 else()
    message(STATUS "Unknown architecture")
 endif()
 # Apply the architecture-specific flags collected in ARCH_FLAGS.
 # The generator expressions restrict them to C and C++ translation units, so they
 # are not handed directly to compilers for other languages.
 add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
 # For CUDA sources the same flags are forwarded to the host compiler: they are
 # appended to CUDA_CXX_FLAGS, joined into one space-separated string and passed
 # behind a single -Xcompiler. CUDA_CXX_FLAGS and CUDA_FLAGS may already hold
 # entries added earlier in the file (not fully visible in this chunk).
 if (LLAMA_CUBLAS)
    list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
    list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
    # skip -Xcompiler entirely when there is nothing to forward
    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
    endif()
    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
 endif()
 if (MINGW)
    # Target Windows 8 for PrefetchVirtualMemory
    add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
 endif()
 #
 # POSIX conformance
 #
@ -639,11 +1089,6 @@ endif()
 # ggml
 if (GGML_USE_CPU_HBM)
    add_definitions(-DGGML_USE_CPU_HBM)
    find_library(memkind memkind REQUIRED)
 endif()
 add_library(ggml OBJECT
            ggml.c
            ggml.h
@ -653,21 +1098,24 @@ add_library(ggml OBJECT
            ggml-backend.h
            ggml-quants.c
            ggml-quants.h
-            ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_CUDA}    ${GGML_HEADERS_CUDA}
-            ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
+            ${GGML_SOURCES_OPENCL}  ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_METAL}   ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_MPI}     ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_EXTRA}   ${GGML_HEADERS_EXTRA}
            ${GGML_SOURCES_SYCL}    ${GGML_HEADERS_SYCL}
            ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
            ${GGML_SOURCES_VULKAN}  ${GGML_HEADERS_VULKAN}
            ${GGML_SOURCES_ROCM}    ${GGML_HEADERS_ROCM}
            )
 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
-target_compile_features(ggml PUBLIC c_std_11) # don't bump
+target_compile_features   (ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 if (GGML_USE_CPU_HBM)
    target_link_libraries(ggml PUBLIC memkind)
 endif()
 add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
 if (BUILD_SHARED_LIBS)
    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
    add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
@ -683,7 +1131,8 @@ add_library(llama
            )
 target_include_directories(llama PUBLIC .)
-target_compile_features(llama PUBLIC cxx_std_11) # don't bump
+target_compile_features   (llama PUBLIC cxx_std_11) # don't bump
 target_link_libraries(llama PRIVATE
    ggml
    ${LLAMA_EXTRA_LIBS}
@ -733,8 +1182,8 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
-set(GGML_PUBLIC_HEADERS "ggml.h"
+set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
-        "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
+        "${GGML_HEADERS_CUDA}"  "${GGML_HEADERS_OPENCL}"
        "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
--- a/474
+++ b/474
@ -1,14 +1,15 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
-	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
 # Binaries only useful for tests
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
 	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@ -25,20 +26,6 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 ifeq '' '$(findstring clang,$(shell $(CC) --version))'
 	CC_IS_GCC=1
 	CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
 else
 	CC_IS_CLANG=1
 	ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))'
 		CC_IS_LLVM_CLANG=1
 	else
 		CC_IS_APPLE_CLANG=1
 	endif
 	CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
 				| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
 endif
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@ -56,10 +43,6 @@ ifeq ($(UNAME_S),Darwin)
 	endif
 endif
 ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
 BUILD_TARGETS += metal
 endif
 default: $(BUILD_TARGETS)
 test: $(TEST_TARGETS)
@ -78,7 +61,7 @@ test: $(TEST_TARGETS)
 			./$$test_target; \
 		fi; \
 		if [ $$? -ne 0 ]; then \
-			printf 'Test $$test_target FAILED!\n\n' $$test_target; \
+			printf 'Test %s FAILED!\n\n' $$test_target; \
 			failures=$$(( failures + 1 )); \
 		else \
 			printf 'Test %s passed.\n\n' $$test_target; \
@ -114,20 +97,34 @@ endif
 #
 # keep standard at C11 and C++11
-MK_CPPFLAGS = -I. -Icommon
+MK_CPPFLAGS  = -I. -Icommon
-MK_CFLAGS   = -std=c11   -fPIC
+MK_CFLAGS    = -std=c11   -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CXXFLAGS  = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
-MK_CFLAGS        += -Ofast
+MK_CFLAGS     += -Ofast
-MK_HOST_CXXFLAGS += -Ofast
+HOST_CXXFLAGS += -Ofast
-MK_CUDA_CXXFLAGS += -O3
+MK_NVCCFLAGS  += -O3
 else
-MK_CFLAGS        += -O3
+MK_CFLAGS     += -O3
-MK_CXXFLAGS      += -O3
+MK_CXXFLAGS   += -O3
 MK_NVCCFLAGS  += -O3
 endif
 # Wrap the C and C++ compilers with ccache when it is installed.
 # Define LLAMA_NO_CCACHE to skip the lookup and build without it.
 ifndef LLAMA_NO_CCACHE
 # immediate assignment: the lookup runs once; CCACHE ends up empty when `which` prints nothing
 CCACHE := $(shell which ccache)
 ifdef CCACHE
 # exported so that the ccache processes started from recipes inherit the setting
 export CCACHE_SLOPPINESS = time_macros
 $(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
 # ':=' is required here: CC/CXX refer to their own previous value
 CC    := $(CCACHE) $(CC)
 CXX   := $(CCACHE) $(CXX)
 else
 $(info I ccache not found. Consider installing it for faster compilation.)
 endif # CCACHE
 endif # LLAMA_NO_CCACHE
 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
 # posix_memalign came in POSIX.1-2001 / SUSv3
@ -174,6 +171,10 @@ ifdef LLAMA_DEBUG
 	MK_CFLAGS   += -O0 -g
 	MK_CXXFLAGS += -O0 -g
 	MK_LDFLAGS  += -g
 	ifeq ($(UNAME_S),Linux)
 		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 	endif
 else
 	MK_CPPFLAGS += -DNDEBUG
 endif
@ -215,28 +216,9 @@ MK_CFLAGS    += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
 				-Werror=implicit-function-declaration
 MK_CXXFLAGS  += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
-ifeq ($(CC_IS_CLANG), 1)
+ifeq ($(LLAMA_FATAL_WARNINGS),1)
-	# clang options
+	MK_CFLAGS   += -Werror
-	MK_CFLAGS        += -Wunreachable-code-break -Wunreachable-code-return
+	MK_CXXFLAGS += -Werror
 	MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
 	ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
 		MK_CFLAGS += -Wdouble-promotion
 	endif
 	ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
 		MK_CFLAGS += -Wdouble-promotion
 	endif
 else
 	# gcc options
 	MK_CFLAGS        += -Wdouble-promotion
 	MK_HOST_CXXFLAGS += -Wno-array-bounds
 	ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
 		MK_HOST_CXXFLAGS += -Wno-format-truncation
 	endif
 	ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
 		MK_HOST_CXXFLAGS += -Wextra-semi
 	endif
 endif
 # this version of Apple ld64 is buggy
@ -289,8 +271,8 @@ ifndef RISCV
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
-	MK_CFLAGS   += -march=native -mtune=native
+	MK_CFLAGS     += -march=native -mtune=native
-	MK_HOST_CXXFLAGS += -march=native -mtune=native
+	HOST_CXXFLAGS += -march=native -mtune=native
 	# Usage AVX-only
 	#MK_CFLAGS   += -mfma -mf16c -mavx
@ -301,19 +283,31 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	#MK_CXXFLAGS += -mssse3
 endif
 # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
 # https://github.com/ggerganov/llama.cpp/issues/2922
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
 	# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
 	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
 	# https://github.com/ggerganov/llama.cpp/issues/2922
 	MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
 	MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
 	# Target Windows 8 for PrefetchVirtualMemory
 	MK_CPPFLAGS += -D_WIN32_WINNT=0x602
 endif
 # 64-bit ARM hosts (UNAME_M starts with "aarch64"):
 #   - always tune for the build machine with -mcpu=native
 #   - when the `jetson_release` tool reports a TX2 module, switch both compilers to the
 #     aarch64-unknown-linux-gnu toolchain and set JETSON_EOL_MODULE_DETECT, which the
 #     CUDA section of this Makefile checks
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 	# Apple M1, M2, etc.
 	# Raspberry Pi 3, 4, Zero 2 (64-bit)
 	# Nvidia Jetson
 	MK_CFLAGS   += -mcpu=native
 	MK_CXXFLAGS += -mcpu=native
 	# ':=' runs the probe exactly once and leaves the variable empty when the tool is
 	# absent; with a recursive '=' the ifdef below would always be true because ifdef
 	# does not expand the value. stderr is dropped so that machines without
 	# jetson_release do not print a "command not found" error on every make run.
 	JETSON_RELEASE_INFO := $(shell jetson_release 2>/dev/null)
 	ifdef JETSON_RELEASE_INFO
 		ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),)
 			JETSON_EOL_MODULE_DETECT = 1
 			CC = aarch64-unknown-linux-gnu-gcc
 			# make variables are case-sensitive: this must be CXX (not cxx) to take effect
 			CXX = aarch64-unknown-linux-gnu-g++
 		endif
 	endif
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
@ -387,64 +381,80 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS
 ifdef LLAMA_CUBLAS
-	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	ifneq ('', '$(wildcard /opt/cuda)')
-	MK_LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+		CUDA_PATH ?= /opt/cuda
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
 	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
-	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
+	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
 endif # LLAMA_FATAL_WARNINGS
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
 endif # LLAMA_DEBUG
 ifdef LLAMA_CUDA_NVCC
-	NVCC = $(LLAMA_CUDA_NVCC)
+	NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
 else
-	NVCC = nvcc
+	NVCC = $(CCACHE) nvcc
 endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
-	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
-else ifdef CUDA_POWER_ARCH
+else ifndef CUDA_POWER_ARCH
-	NVCCFLAGS +=
+	MK_NVCCFLAGS += -arch=native
 else
 	NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
-	NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
+	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_FORCE_MMQ
-	NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
-	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
-	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
+	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
 ifdef LLAMA_CUDA_MMV_Y
-	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 else ifdef LLAMA_CUDA_DMMV_Y
-	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
+	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
 else
-	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
+	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_MMV_Y
 ifdef LLAMA_CUDA_F16
-	NVCCFLAGS += -DGGML_CUDA_F16
+	MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_F16
 ifdef LLAMA_CUDA_DMMV_F16
-	NVCCFLAGS += -DGGML_CUDA_F16
+	MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_DMMV_F16
 ifdef LLAMA_CUDA_KQUANTS_ITER
-	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else
-	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
+	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
 ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-	NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
+	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
 else
-	NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 #ifdef LLAMA_CUDA_CUBLAS
-#	NVCCFLAGS += -DGGML_CUDA_CUBLAS
+#	MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
 #endif # LLAMA_CUDA_CUBLAS
 ifdef LLAMA_CUDA_CCBIN
-	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) -c $< -o $@
+ifdef JETSON_EOL_MODULE_DETECT
 	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endif # JETSON_EOL_MODULE_DETECT
 endif # LLAMA_CUBLAS
 ifdef LLAMA_CLBLAST
@ -465,14 +475,48 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_CLBLAST
 ifdef LLAMA_VULKAN
 	MK_CPPFLAGS  += -DGGML_USE_VULKAN
 	MK_LDFLAGS += -lvulkan
 	OBJS    += ggml-vulkan.o
 ifdef LLAMA_VULKAN_CHECK_RESULTS
 	MK_CPPFLAGS  += -DGGML_VULKAN_CHECK_RESULTS
 endif
 ifdef LLAMA_VULKAN_DEBUG
 	MK_CPPFLAGS  += -DGGML_VULKAN_DEBUG
 endif
 ifdef LLAMA_VULKAN_VALIDATE
 	MK_CPPFLAGS  += -DGGML_VULKAN_VALIDATE
 endif
 ifdef LLAMA_VULKAN_RUN_TESTS
 	MK_CPPFLAGS  += -DGGML_VULKAN_RUN_TESTS
 endif
 ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_VULKAN
 ifdef LLAMA_HIPBLAS
-	ROCM_PATH	?= /opt/rocm
+
-	HIPCC	    ?= $(ROCM_PATH)/bin/hipcc
+	ifeq ($(wildcard /opt/rocm),)
-	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+		ROCM_PATH	?= /usr
 		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
 	else
 		ROCM_PATH	?= /opt/rocm
 		GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	endif
 	HIPCC                   ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
 	LLAMA_CUDA_DMMV_X       ?= 32
 	LLAMA_CUDA_MMV_Y        ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
 	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
 ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
 	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	MK_LDFLAGS	+= -lhipblas -lamdhip64 -lrocblas
 	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
@ -494,11 +538,29 @@ ifdef LLAMA_METAL
 ifdef LLAMA_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
 ifdef LLAMA_METAL_EMBED_LIBRARY
 	MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
 	OBJS        += ggml-metal-embed.o
 endif
 endif # LLAMA_METAL
 # Metal backend objects (only when LLAMA_METAL is defined).
 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
 ifdef LLAMA_METAL_EMBED_LIBRARY
 # Embed the Metal shader source into an object file.
 # The recipe writes a small assembly stub to a temporary file (path from `mktemp`,
 # stored in the make variable TEMP_ASSEMBLY). The stub places the contents of
 # ggml-metal.metal, via .incbin, in section __DATA,__ggml_metallib between the global
 # symbols _ggml_metallib_start and _ggml_metallib_end, and is then assembled into $@.
 # NOTE(review): the temporary file is removed by the last recipe line only, so it
 # is left behind when the assembler step fails.
 ggml-metal-embed.o: ggml-metal.metal
 	@echo "Embedding Metal library"
 	$(eval TEMP_ASSEMBLY=$(shell mktemp))
 	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
 	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
 	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
 	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
 	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
 	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
 	@$(AS) $(TEMP_ASSEMBLY) -o $@
 	@rm -f ${TEMP_ASSEMBLY}
 endif
 endif # LLAMA_METAL
 ifdef LLAMA_MPI
@ -506,16 +568,23 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
-# combine build flags with cmdline overrides
+GF_CC := $(CC)
-override CFLAGS        := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
+include scripts/get-flags.mk
 override CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
 override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
 override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
 override LDFLAGS       := $(MK_LDFLAGS) $(LDFLAGS)
-# save CXXFLAGS before we add host-only options
+# combine build flags with cmdline overrides
-NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
+override CPPFLAGS  := $(MK_CPPFLAGS) $(CPPFLAGS)
-override CXXFLAGS += $(HOST_CXXFLAGS)
+override CFLAGS    := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
 BASE_CXXFLAGS      := $(MK_CXXFLAGS) $(CXXFLAGS)
 override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
 override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
 # identify CUDA host compiler
 #
 # GF_CC is pointed at an nvcc command line (stderr silenced, a ".c" input and a
 # trailing -Xcompiler) and scripts/get-flags.mk is included again with that setting.
 # NOTE(review): get-flags.mk is not visible here; presumably it appends a version
 # query to $(GF_CC) to find out which host compiler nvcc drives and sets
 # GF_CXXFLAGS accordingly - confirm against the script.
 ifdef LLAMA_CUBLAS
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 # host-side C++ flags used for CUDA sources: the base C++ flags plus the flags
 # returned by the probe, with pedantic warnings switched off
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 #
 # Print build information
@ -529,8 +598,19 @@ $(info I CFLAGS:    $(CFLAGS))
 $(info I CXXFLAGS:  $(CXXFLAGS))
 $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS:   $(LDFLAGS))
-$(info I CC:        $(shell $(CC) --version | head -n 1))
+$(info I CC:        $(shell $(CC)   --version | head -n 1))
-$(info I CXX:       $(shell $(CXX) --version | head -n 1))
+$(info I CXX:       $(shell $(CXX)  --version | head -n 1))
 # CUDA build information and toolkit version guard.
 # For toolkits older than 11.7 the build is aborted unless the target architecture is
 # given through CUDA_DOCKER_ARCH (builds with CUDA_POWER_ARCH defined are exempt).
 ifdef LLAMA_CUBLAS
 $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
 # extract "<major>.<minor>" following the word "release" in the nvcc banner
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 # awk prints 1 when the comparison holds, 0 otherwise
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
 $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # CUDA_VERSION < 11.7
 endif # LLAMA_CUBLAS
 $(info )
 #
@ -575,92 +655,139 @@ train.o: common/train.cpp common/train.h
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
 	ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
 clean:
-	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 	find examples pocs -type f -name "*.o" -delete
 #
 # Examples
 #
 # $< is the first prerequisite, i.e. the source file.
 # Explicitly compile this to an object file so that it can be cached with ccache.
 # The source file is then filtered out from $^ (the list of all prerequisites) and the object file is added instead.
 # Helper function that replaces .c, .cpp, and .cu file endings with .o:
 GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
 main: examples/main/main.cpp                                  ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo
 infill: examples/infill/infill.cpp                            ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 simple: examples/simple/simple.cpp                            ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 tokenize: examples/tokenize/tokenize.cpp                      ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 batched: examples/batched/batched.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 batched-bench: examples/batched-bench/batched-bench.cpp       build-info.o ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 quantize: examples/quantize/quantize.cpp                      build-info.o ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 quantize-stats: examples/quantize-stats/quantize-stats.cpp    build-info.o ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 perplexity: examples/perplexity/perplexity.cpp                ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 imatrix: examples/imatrix/imatrix.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 embedding: examples/embedding/embedding.cpp                   ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
-gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
+gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 # Build libllava.a from the llava example sources.
 # Only the first prerequisite ($<, examples/llava/llava.cpp) is compiled: the
 # recipe invokes the compiler with -c and writes the result directly to $@.
 # NOTE(review): because of -c the file named libllava.a is a single object
 # file rather than an ar archive, and clip.cpp is listed as a prerequisite but
 # is not compiled by this recipe -- confirm this is intended.
 # -Wno-cast-qual disables the cast-qual warning for this compile.
 libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
 llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp  -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-ifdef LLAMA_METAL
+lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-metal: examples/metal/metal.cpp ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-endif
+
 # Build the `lookup` example in two steps:
 #   1. compile the first prerequisite ($<, lookup.cpp) into the file named by
 #      GET_OBJ_FILE (helper defined outside this excerpt; presumably maps a
 #      source path to its object-file path -- verify at its definition);
 #   2. link that object together with the remaining prerequisites, with
 #      headers (%.h) and the source itself filtered out of $^.
 lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 # Build the `passkey` example: compile passkey.cpp ($<) to the object path
 # returned by GET_OBJ_FILE, then link it with every other prerequisite that is
 # neither a header (%.h) nor the source file itself.
 passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
@ -668,7 +795,7 @@ swift: examples/batched.swift
 endif
 common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
-	@sh scripts/build-info.sh $(CC) > $@.tmp
+	@sh scripts/build-info.sh "$(CC)" > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
 		mv $@.tmp $@; \
 	else \
@ -685,7 +812,8 @@ build-info.o: common/build-info.cpp
 tests: $(TEST_TARGETS)
 benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 run-benchmark-matmult: benchmark-matmult
 	./$@
@ -693,46 +821,80 @@ run-benchmark-matmult: benchmark-matmult
 .PHONY: run-benchmark-matmult swift
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 # Build tests/test-rope. Its prerequisites are only the test source, ggml.o
 # and $(OBJS); llama.o and $(COMMON_DEPS) are not listed for this target.
 # The source is compiled separately first, then its object is linked with the
 # remaining prerequisites (headers and the source filtered out of $^).
 tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 # Compile tests/test-c.c to an object file with the C compiler ($(CC) and
 # $(CFLAGS)); no link step is performed for this target.
 # llama.h is a prerequisite, so changing it triggers a rebuild, but it is
 # filtered out of $^ and never passed on the compile command line.
 tests/test-c.o: tests/test-c.c llama.h
 	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
 # Build tests/test-backend-ops from its source plus ggml.o and $(OBJS) only.
 # Step 1 compiles the source ($<) to the object named by GET_OBJ_FILE;
 # step 2 links that object with the non-header, non-source prerequisites.
 tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 # Build tests/test-model-load-cancel.
 # The test source ($<) is compiled to its own object first. tests/get-model.cpp
 # is a second source prerequisite: it is neither a header nor $<, so it is not
 # filtered out and is passed as a .cpp file on the final compile-and-link
 # command alongside the object files.
 tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 # Build tests/test-autorelease.
 # Same two-step shape as the other test targets: compile $< to its object,
 # then link. tests/get-model.cpp remains in the filtered prerequisite list and
 # is therefore handed to the final command as a source file.
 tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 # Build tests/test-chat-template against ggml.o, llama.o, $(COMMON_DEPS) and
 # $(OBJS): compile the test source ($<) to its object, then link that object
 # with the remaining prerequisites after dropping headers and the source.
 tests/test-chat-template: tests/test-chat-template.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
--- a/Package.swift
+++ b/Package.swift
@ -2,33 +2,14 @@
 import PackageDescription
 #if arch(arm) || arch(arm64)
 let platforms: [SupportedPlatform]? = [
    .macOS(.v12),
    .iOS(.v14),
    .watchOS(.v4),
    .tvOS(.v14)
 ]
 let exclude: [String] = []
 let resources: [Resource] = [
    .process("ggml-metal.metal")
 ]
 let additionalSources: [String] = ["ggml-metal.m"]
 let additionalSettings: [CSetting] = [
    .unsafeFlags(["-fno-objc-arc"]),
    .define("GGML_USE_METAL")
 ]
 #else
 let platforms: [SupportedPlatform]? = nil
 let exclude: [String] = ["ggml-metal.metal"]
 let resources: [Resource] = []
 let additionalSources: [String] = []
 let additionalSettings: [CSetting] = []
 #endif
 let package = Package(
    name: "llama",
-    platforms: platforms,
+    platforms: [
        .macOS(.v12),
        .iOS(.v14),
        .watchOS(.v4),
        .tvOS(.v14)
    ],
    products: [
        .library(name: "llama", targets: ["llama"]),
    ],
@ -36,25 +17,40 @@ let package = Package(
        .target(
            name: "llama",
            path: ".",
-            exclude: exclude,
+            exclude: [
               "cmake",
               "examples",
               "scripts",
               "models",
               "tests",
               "CMakeLists.txt",
               "ggml-cuda.cu",
               "ggml-cuda.h",
               "Makefile"
            ],
            sources: [
                "ggml.c",
                "llama.cpp",
                "ggml-alloc.c",
                "ggml-backend.c",
                "ggml-quants.c",
-            ] + additionalSources,
+                "ggml-metal.m",
-            resources: resources,
+            ],
            resources: [
                .process("ggml-metal.metal")
            ],
            publicHeadersPath: "spm-headers",
            cSettings: [
                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE")
+                .define("GGML_USE_ACCELERATE"),
                .unsafeFlags(["-fno-objc-arc"]),
                .define("GGML_USE_METAL"),
                // NOTE: NEW_LAPACK requires iOS version 16.4+
                // We should consider adding this in the future when we drop support for iOS 14
                // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
                // .define("ACCELERATE_NEW_LAPACK"),
                // .define("ACCELERATE_LAPACK_ILP64")
-            ] + additionalSettings,
+            ],
            linkerSettings: [
                .linkedFramework("Accelerate")
            ]
--- a/README-sycl.md
+++ b/README-sycl.md
@ -0,0 +1,515 @@
 # llama.cpp for SYCL
 - [Background](#background)
 - [News](#news)
 - [OS](#os)
 - [Intel GPU](#intel-gpu)
 - [Docker](#docker)
 - [Linux](#linux)
 - [Windows](#windows)
 - [Environment Variable](#environment-variable)
 - [Known Issue](#known-issue)
 - [Q&A](#qa)
 - [Todo](#todo)
 ## Background
 SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
 oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
 Intel uses SYCL as a direct programming language to support CPUs, GPUs and FPGAs.
 To avoid reinventing the wheel, this code follows other code paths in llama.cpp (such as OpenBLAS, cuBLAS and CLBlast). We used the open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) to migrate the code to SYCL.
 llama.cpp for SYCL is used to support Intel GPUs.
 For Intel CPUs, we recommend using llama.cpp for x86 (Intel MKL build).
 ## News
 - 2024.3
  - Support multiple cards: **--split-mode**: [none|layer]; [row] is not supported yet (under development).
  - Support assigning the main GPU with **--main-gpu**, replacing $GGML_SYCL_DEVICE.
  - Support detecting all GPUs with level-zero and same top **Max compute units**.
  - Support OPs
    - hardsigmoid
    - hardswish
    - pool2d
 - 2024.1
  - Create SYCL backend for Intel GPU.
  - Support Windows build
 ## OS
 |OS|Status|Verified|
 |-|-|-|
 |Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
 |Windows|Support|Windows 11|
 ## Intel GPU
 ### Verified
 |Intel GPU| Status | Verified Model|
 |-|-|-|
 |Intel Data Center Max Series| Support| Max 1550|
 |Intel Data Center Flex Series| Support| Flex 170|
 |Intel Arc Series| Support| Arc 770, 730M|
 |Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
 |Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
 Note: If the iGPU has fewer than 80 EUs (Execution Units), inference will be too slow to be usable.
 ### Memory
 Memory is a limiting factor when running LLMs on GPUs.
 When llama.cpp runs, it prints a log line showing the memory allocated on the GPU, so you can see how much memory your case needs. For example: `llm_load_tensors:            buffer size =  3577.56 MiB`.
 For an iGPU, please make sure enough host memory is available to be shared. For llama-2-7b.Q4_0, 8GB+ of host memory is recommended.
 For a dGPU, please make sure the device memory is sufficient. For llama-2-7b.Q4_0, 4GB+ of device memory is recommended.
 ## Docker
 Note:
 - Only docker on Linux is tested. Docker on WSL may not work.
 - You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that)
 ### Build the image
 You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
 ```sh
 # For F16:
 #docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
 # Or, for F32:
 docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
 # Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
 ```
 ### Run
 ```sh
 # Firstly, find all the DRI cards:
 ls -la /dev/dri
 # Then, pick the card that you want to use.
 # For example with "/dev/dri/card1"
 docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
 ```
 ## Linux
 ### Setup Environment
 1. Install Intel GPU driver.
 a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
 Note: for iGPU, please install the client GPU driver.
 b. Add user to group: video, render.
 ```sh
 sudo usermod -aG render username
 sudo usermod -aG video username
 ```
 Note: re-login to enable it.
 c. Check
 ```sh
 sudo apt install clinfo
 sudo clinfo -l
 ```
 Output (example):
 ```
 Platform #0: Intel(R) OpenCL Graphics
 `-- Device #0: Intel(R) Arc(TM) A770 Graphics
 Platform #0: Intel(R) OpenCL HD Graphics
 `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
 ```
 2. Install Intel® oneAPI Base toolkit.
 a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
 We recommend installing to the default folder: **/opt/intel/oneapi**.
 The following guide uses the default folder as an example. If you use another folder, please adjust the paths in the guide accordingly.
 b. Check
 ```sh
 source /opt/intel/oneapi/setvars.sh
 sycl-ls
 ```
 There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
 Output (example):
 ```
 [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
 [opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
 [opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
 [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
 ```
 3. Build locally:
 Note:
 - You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
 - By default, all binaries are built, which takes more time. To reduce the build time, we recommend building **example/main** only.
 ```sh
 mkdir -p build
 cd build
 source /opt/intel/oneapi/setvars.sh
 # For FP16:
 #cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
 # Or, for FP32:
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 # Build example/main only
 #cmake --build . --config Release --target main
 # Or, build all binary
 cmake --build . --config Release -v
 cd ..
 ```
 or
 ```sh
 ./examples/sycl/build.sh
 ```
 ### Run
 1. Put the model file in the **models** folder
 You can download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as an example.
 2. Enable oneAPI running environment
 ```
 source /opt/intel/oneapi/setvars.sh
 ```
 3. List device ID
 Run without parameter:
 ```sh
 ./build/bin/ls-sycl-device
 # or running the "main" executable and look at the output log:
 ./build/bin/main
 ```
 Check the ID in startup log, like:
 ```
 found 4 SYCL devices:
  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
    max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
    max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
 ```
 |Attribute|Note|
 |-|-|
 |compute capability 1.3|Level-zero running time, recommended |
 |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
 4. Set device ID and execute llama.cpp
 Set device ID = 0 by **GGML_SYCL_DEVICE=0**
 ```sh
 GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
 ```
 or run by script:
 ```sh
 ./examples/sycl/run_llama2.sh
 ```
 Note:
 - By default, mmap is used to read the model file. In some cases, this causes a hang. We recommend using the **--no-mmap** parameter to disable mmap() and avoid this issue.
 5. Check the device ID in output
 Like:
 ```
 Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 ```
 ## Windows
 ### Setup Environment
 1. Install Intel GPU driver.
 Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
 Note: **The driver is mandatory for compute function**.
 2. Install Visual Studio.
 Please install [Visual Studio](https://visualstudio.microsoft.com/), which affects how the oneAPI environment is enabled on Windows.
 3. Install Intel® oneAPI Base toolkit.
 a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
 We recommend installing to the default folder: **C:\Program Files (x86)\Intel\oneAPI**.
 The following guide uses the default folder as an example. If you use another folder, please adjust the paths in the guide accordingly.
 b. Enable oneAPI running environment:
 - In Search, input 'oneAPI'.
 Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
 - In Run:
 In CMD:
 ```
 "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
 ```
 c. Check GPU
 In oneAPI command line:
 ```
 sycl-ls
 ```
 There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
 Output (example):
 ```
 [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
 [opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
 [opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO  [31.0.101.5186]
 [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
 ```
 4. Install cmake & make
 a. Download & install cmake for Windows: https://cmake.org/download/
 b. Download & install mingw-w64 make for Windows provided by w64devkit
 - Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
 - Extract `w64devkit` on your pc.
 - Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`.
 ### Build locally:
 In oneAPI command line window:
 ```
 mkdir -p build
 cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
 ::  for FP16
 ::  faster for long-prompt inference
 ::  cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
 ::  for FP32
 cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
 ::  build example/main only
 ::  make main
 ::  build all binary
 make -j
 cd ..
 ```
 or
 ```
 .\examples\sycl\win-build-sycl.bat
 ```
 Note:
 - By default, all binaries are built, which takes more time. To reduce the build time, we recommend building **example/main** only.
 ### Run
 1. Put the model file in the **models** folder
 You can download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as an example.
 2. Enable oneAPI running environment
 - In Search, input 'oneAPI'.
 Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
 - In Run:
 In CMD:
 ```
 "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
 ```
 3. List device ID
 Run without parameter:
 ```
 build\bin\ls-sycl-device.exe
 or
 build\bin\main.exe
 ```
 Check the ID in startup log, like:
 ```
 found 4 SYCL devices:
  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
    max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
    max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
 ```
 |Attribute|Note|
 |-|-|
 |compute capability 1.3|Level-zero running time, recommended |
 |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
 4. Set device ID and execute llama.cpp
 Set device ID = 0 by **set GGML_SYCL_DEVICE=0**
 ```
 set GGML_SYCL_DEVICE=0
 build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
 ```
 or run by script:
 ```
 .\examples\sycl\win-run-llama2.bat
 ```
 Note:
 - By default, mmap is used to read the model file. In some cases, this causes a hang. We recommend using the **--no-mmap** parameter to disable mmap() and avoid this issue.
 5. Check the device ID in output
 Like:
 ```
 Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 ```
 ## Environment Variable
 #### Build
 |Name|Value|Function|
 |-|-|-|
 |LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
 |LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, do not set it.|
 |CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
 |CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path|
 #### Running
 |Name|Value|Function|
 |-|-|-|
 |GGML_SYCL_DEVICE|0 (default) or 1|Set the device ID to use. Check the device IDs in the default running output|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
 |ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|
 ## Known Issue
 - Hang during startup
  llama.cpp uses mmap by default to read the model file and copy it to the GPU. On some systems, memcpy behaves abnormally and blocks.
  Solution: add **--no-mmap** or **--mmap 0**.
 - Split-mode: [row] is not supported
  It is under development.
 ## Q&A
 - Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
  The oneAPI running environment has not been enabled.
  Install the oneAPI Base Toolkit and enable it with: `source /opt/intel/oneapi/setvars.sh`.
 - In Windows, there is no result and no error.
  The oneAPI running environment has not been enabled.
 - Compile errors occur.
  Remove the **build** folder and try again.
 - I can **not** see **[ext_oneapi_level_zero:gpu:0]** after installing the GPU driver in Linux.
  Please run **sudo sycl-ls**.
  If you see it in result, please add video/render group to your ID:
  ```
  sudo usermod -aG render username
  sudo usermod -aG video username
  ```
  Then **relogin**.
  If you do not see it, please check the GPU driver installation steps again.
 ## Todo
 - Support multiple cards.
--- a/README.md
+++ b/README.md
@ -6,11 +6,21 @@
 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
-Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
+Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
 ### Recent API changes
 - [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
 ### Hot topics
- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
+- The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761))
 - Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
 - Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
 - Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
 ----
@ -26,17 +36,14 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
        <li><a href="#get-the-code">Get the Code</a></li>
        <li><a href="#build">Build</a></li>
        <li><a href="#blas-build">BLAS Build</a></li>
-        <li><a href="#prepare-data--run">Prepare Data & Run</a></li>
+        <li><a href="#prepare-and-quantize">Prepare and Quantize</a></li>
        <li><a href="#run-the-quantized-model">Run the quantized model</a></li>
        <li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
        <li><a href="#quantization">Quantization</a></li>
        <li><a href="#interactive-mode">Interactive mode</a></li>
        <li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
-        <li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
+        <li><a href="#instruct-mode">Instruct mode</a></li>
-        <li><a href="#using-openllama">Using OpenLLaMA</a></li>
+        <li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
        <li><a href="#using-gpt4all">Using GPT4All</a></li>
        <li><a href="#using-pygmalion-7b--metharme-7b">Using Pygmalion 7B & Metharme 7B</a></li>
        <li><a href="#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data">Obtaining the Facebook LLaMA original model and Stanford Alpaca model data</a></li>
        <li><a href="#verifying-the-model-files">Verifying the model files</a></li>
        <li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
        <li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
        <li><a href="#android">Android</a></li>
@ -51,18 +58,20 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 ## Description
-The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook
+The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
 variety of hardware - locally and in the cloud.
- Plain C/C++ implementation without dependencies
+- Plain C/C++ implementation without any dependencies
- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
+- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2 and AVX512 support for x86 architectures
- Mixed F16 / F32 precision
+- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
- CUDA, Metal and OpenCL GPU backend support
+- Vulkan, SYCL, and (partial) OpenCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
-The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
+Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
-Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
+improved significantly thanks to many contributions. It is the main playground for developing new features for the
-as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
+[ggml](https://github.com/ggerganov/ggml) library.
 **Supported platforms:**
@ -70,56 +79,101 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] Linux
 - [X] Windows (via CMake)
 - [X] Docker
 - [X] FreeBSD
 **Supported models:**
 Typically finetunes of the base models below are supported as well.
 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
 - [X] Falcon
 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
 - [X] [Pygmalion/Metharme](#using-pygmalion-7b--metharme-7b)
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
 - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
 - [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
 - [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
 - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586)
+- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
 - [X] [StableLM models](https://huggingface.co/stabilityai)
 - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
 - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
 - [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
 - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
 - [x] [GPT-2](https://huggingface.co/gpt2)
 - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
 - [x] [Mamba](https://github.com/state-spaces/mamba)
 **Multimodal models:**
 - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
 - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
 - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 **HTTP server**
 [llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
 **Bindings:**
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
+- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 **UI:**
 Unless otherwise noted these projects are open-source with permissive licensing:
 - [iohub/collama](https://github.com/iohub/coLLaMA)
 - [janhq/jan](https://github.com/janhq/jan) (AGPL)
 - [nat/openplayground](https://github.com/nat/openplayground)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
+- [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
 - [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
 - [ollama/ollama](https://github.com/ollama/ollama)
 - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL)
 - [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
 - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
 - [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
 - [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
 - [semperai/amica](https://github.com/semperai/amica)
 - [withcatai/catai](https://github.com/withcatai/catai)
 - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
 - [Msty](https://msty.app) (proprietary)
 - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
 ---
 Here is a typical run using LLaMA v2 13B on M2 Ultra:
-```java
+```
 $ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
 I llama.cpp build info:
 I UNAME_S:  Darwin
@ -203,7 +257,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
 ## Usage
-Here are the end-to-end binary build and model conversion steps for the LLaMA-7B model.
+Here are the end-to-end binary build and model conversion steps for most supported models.
 ### Get the Code
@ -264,7 +318,7 @@ In order to build llama.cpp you have three different options.
        sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
            opencl clblast openblas
-            gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
+        gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
        ```
    **Notes:** With these packages you can build llama.cpp with OPENBLAS and
@ -320,7 +374,7 @@ mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ### BLAS Build
-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
 - #### Accelerate Framework:
@ -364,20 +418,37 @@ Building the program with BLAS support may lead to some performance improvements
  Check [BLIS.md](docs/BLIS.md) for more information.
- #### Intel MKL
+- #### SYCL
  SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
-  By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
+  llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
-  ```bash
+  For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
-  mkdir build
+
-  cd build
+- #### Intel oneMKL
-  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+  Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
-  cmake --build . --config Release
+
-  ```
+  - Using manual oneAPI installation:
    By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DLLAMA_BLAS=ON` to cmake, the MKL version of BLAS will automatically be selected. Otherwise, please install oneAPI and follow the steps below:
      ```bash
      mkdir build
      cd build
      source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
      cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
      cmake --build . --config Release
      ```
  - Using oneAPI docker image:
    If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
  Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
 - #### cuBLAS
  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
  For Jetson users: if you have a Jetson Orin, you can try this: [Official Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an older model (Nano/TX2), some additional operations are needed before compiling.
  - Using `make`:
    ```bash
    make LLAMA_CUBLAS=1
@ -415,25 +486,34 @@ Building the program with BLAS support may lead to some performance improvements
    ```bash
    make LLAMA_HIPBLAS=1
    ```
-  - Using `CMake` for Linux:
+  - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
    ```bash
-    mkdir build
+    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
-    cd build
+        cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
+        && cmake --build build -- -j 16
    cmake --build .
    ```
-  - Using `CMake` for Windows:
+    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
    However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
  - Using `make` (example for target gfx1030, build with 16 CPU threads):
    ```bash
    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
    ```
  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
    ```bash
    set PATH=%HIP_PATH%\bin;%PATH%
    mkdir build
    cd build
    cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
    cmake --build .
    ```
    Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
    Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
-  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
+  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
  | Option                  | Legal values           | Default | Description |
@ -546,34 +626,87 @@ Building the program with BLAS support may lead to some performance improvements
  You can get a list of platforms and devices from the `clinfo -l` command, etc.
-### Prepare Data & Run
+- #### Vulkan
  **With docker**:
  You don't need to install Vulkan SDK. It will be installed inside the container.
  ```sh
  # Build the image
  docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
  # Then, use it:
  docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
  ```
  **Without docker**:
  Firstly, you need to make sure you have installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
  For example, on Ubuntu 22.04 (jammy), use the command below:
  ```bash
  wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
  wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
  apt update -y
  apt-get install -y vulkan-sdk
  # To verify the installation, use the command below:
  vulkaninfo
  ```
  Alternatively, your package manager might be able to provide the appropriate libraries. For example, on Ubuntu 22.04 you can install `libvulkan-dev` instead.
  Then, build llama.cpp using the cmake command below:
  ```bash
  mkdir -p build
  cd build
  cmake .. -DLLAMA_VULKAN=1
  cmake --build . --config Release
  # Test the output binary (with "-ngl 33" to offload all layers to GPU)
  ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
  # You should see in the output, ggml_vulkan detected your GPU. For example:
  # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
  ```
 ### Prepare and Quantize
 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
 ```bash
-# obtain the original LLaMA model weights and place them in ./models
+# obtain the official LLaMA model weights and place them in ./models
 ls ./models
-65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
+llama-2-7b tokenizer_checklist.chk tokenizer.model
-  # [Optional] for models using BPE tokenizers
+# [Optional] for models using BPE tokenizers
-  ls ./models
+ls ./models
-  65B 30B 13B 7B vocab.json
+<folder containing weights and tokenizer json> vocab.json
 # [Optional] for PyTorch .bin models like Mistral-7B
 ls ./models
 <folder containing weights and tokenizer json>
 # install Python dependencies
 python3 -m pip install -r requirements.txt
-# convert the 7B model to ggml FP16 format
+# convert the model to ggml FP16 format
-python3 convert.py models/7B/
+python3 convert.py models/mymodel/
-  # [Optional] for models using BPE tokenizers
+# [Optional] for models using BPE tokenizers
-  python convert.py models/7B/ --vocabtype bpe
+python convert.py models/mymodel/ --vocab-type bpe
-# quantize the model to 4-bits (using q4_0 method)
+# quantize the model to 4-bits (using Q4_K_M method)
-./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
+./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
-# update the gguf filetype to current if older version is unsupported by another application
+# update the gguf filetype to current version if older version is now unsupported
-./quantize ./models/7B/ggml-model-q4_0.gguf ./models/7B/ggml-model-q4_0-v2.gguf COPY
+./quantize ./models/mymodel/ggml-model-Q4_K_M.gguf ./models/mymodel/ggml-model-Q4_K_M-v2.gguf COPY
 ```
 ### Run the quantized model
-# run the inference
+```bash
-./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
+# start inference on a gguf model
 ./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 128
 ```
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@ -594,7 +727,7 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
 As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
-| Model | Original size | Quantized size (4-bit) |
+| Model | Original size | Quantized size (Q4_0) |
 |------:|--------------:|-----------------------:|
 |    7B |         13 GB |                 3.9 GB |
 |   13B |         24 GB |                 7.8 GB |
@ -621,9 +754,21 @@ Several quantization methods are supported. They differ in the resulting model d
 |   13B | bits/weight  |   16.0 |    4.5 |    5.0 |    5.5 |    6.0 |    8.5 |
 - [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
- recent k-quants improvements
+- recent k-quants improvements and new i-quants
  - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
  - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
  - [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773)
  - [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856)
  - [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861)
  - [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872)
  - [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897)
  - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
  - [#4957 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
  - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
  - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
  - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
  - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
  - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
 ### Perplexity (measuring model quality)
@ -635,7 +780,7 @@ The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 thread
 #### How to run
-1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+1. Download/extract: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
 2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
 3. Output:
 ```
@ -648,7 +793,7 @@ And after 4.45 hours, you will have the final perplexity.
 ### Interactive mode
 If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
-In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
+In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
 Here is an example of a few-shot interaction, invoked with the command
@ -698,9 +843,9 @@ The `grammars/` folder contains a handful of sample grammars. To write your own,
 For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
-### Instruction mode with Alpaca
+### Instruct mode
-1. First, download the `ggml` Alpaca model into the `./models` folder
+1. First, download and place the `ggml` model into the `./models` folder
 2. Run the `main` tool like this:
 ```
@ -712,7 +857,7 @@ Sample run:
 ```
 == Running in interactive mode. ==
 - Press Ctrl+C to interject at any time.
- - Press Return to return control to LLaMa.
+ - Press Return to return control to LLaMA.
 - If you want to submit another line, end your input in '\'.
 Below is an instruction that describes a task. Write a response that appropriately completes the request.
@ -726,50 +871,6 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 >
 ```
 ### Using [OpenLLaMA](https://github.com/openlm-research/open_llama)
 OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights.
 - Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face.
 - Convert the model to ggml FP16 format using `python convert.py <path to OpenLLaMA directory>`
 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
 *Note: these instructions are likely obsoleted by the GGUF update*
 - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
 - Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
 - Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
 - It is distributed in the old `ggml` format which is now obsoleted
 - You have to convert it to the new format using `convert.py`:
 ```bash
 python3 convert.py models/gpt4all-7B/gpt4all-lora-quantized.bin
 ```
 - You can now use the newly generated `models/gpt4all-7B/ggml-model-q4_0.bin` model in exactly the same way as all other models
 - The newer GPT4All-J model is not yet supported!
 ### Using Pygmalion 7B & Metharme 7B
 - Obtain the [LLaMA weights](#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data)
 - Obtain the [Pygmalion 7B](https://huggingface.co/PygmalionAI/pygmalion-7b/) or [Metharme 7B](https://huggingface.co/PygmalionAI/metharme-7b) XOR encoded weights
 - Convert the LLaMA model with [the latest HF convert script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py)
 - Merge the XOR files with the converted LLaMA weights by running the [xor_codec](https://huggingface.co/PygmalionAI/pygmalion-7b/blob/main/xor_codec.py) script
 - Convert to `ggml` format using the `convert.py` script in this repo:
 ```bash
 python3 convert.py pygmalion-7b/ --outtype q4_1
 ```
 > The Pygmalion 7B & Metharme 7B weights are saved in [bfloat16](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format) precision. If you wish to convert to `ggml` without quantizing, please specify the `--outtype` as `f32` instead of `f16`.
 ### Obtaining the Facebook LLaMA original model and Stanford Alpaca model data
 - **Under no circumstances should IPFS, magnet links, or any other links to model downloads be shared anywhere in this repository, including in issues, discussions, or pull requests. They will be immediately deleted.**
 - The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
 - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
 ### Obtaining and using the Facebook LLaMA 2 model
 - Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
@ -781,20 +882,6 @@ python3 convert.py pygmalion-7b/ --outtype q4_1
  - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF)
  - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF)
 ### Verifying the model files
 Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
 - The following python script will verify if you have all possible latest files in your self-installed `./models` subdirectory:
 ```bash
 # run the verification script
 ./scripts/verify-checksum-models.py
 ```
 - On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory:
    - On Linux: `sha256sum --ignore-missing -c SHA256SUMS`
    - On macOS: `shasum -a 256 --ignore-missing -c SHA256SUMS`
 ### Seminal papers and background on the models
 If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
@ -879,19 +966,22 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
 * Create a folder to store big models & intermediate files (ex. /llama/models)
 #### Images
-We have two Docker images available for this project:
+We have three Docker images available for this project:
 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
 3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
 Additionally, there are the following images, similar to the above:
 - `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.
 #### Usage
@ -915,6 +1005,12 @@ or with a light image:
 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 or with a server image:
 ```bash
 docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
 ```
 ### Docker With CUDA
 Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@ -924,6 +1020,7 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
 ```bash
 docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
 docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
 docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
 ```
 You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
@ -937,6 +1034,7 @@ The resulting images, are essentially the same as the non-CUDA images:
 1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
 2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
 3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
 #### Usage
@ -945,6 +1043,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
 ```bash
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```
 ### Contributing
@ -964,6 +1063,8 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
 - Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`
 ### Docs
--- a/40
+++ b/40
@ -1,40 +0,0 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d  models/7B/consolidated.00.pth
 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847  models/7B/ggml-model-f16.bin
 ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf  models/7B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_1.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265  models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808  models/13B/ggml-model-f16.bin
 fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5  models/13B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_1.bin
 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f  models/13B/params.json
 e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/consolidated.00.pth
 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff  models/30B/consolidated.01.pth
 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378  models/30B/consolidated.02.pth
 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b  models/30B/consolidated.03.pth
 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37  models/30B/ggml-model-f16.bin
 d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d  models/30B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q5_1.bin
 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb  models/30B/params.json
 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe  models/65B/consolidated.00.pth
 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde  models/65B/consolidated.01.pth
 e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770  models/65B/consolidated.02.pth
 73176ffb426b40482f2aa67ae1217ef79fbbd1fff5482bae5060cdc5a24ab70e  models/65B/consolidated.03.pth
 882e6431d0b08a8bc66261a0d3607da21cbaeafa96a24e7e59777632dbdac225  models/65B/consolidated.04.pth
 a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78  models/65B/consolidated.05.pth
 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b  models/65B/consolidated.06.pth
 d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638  models/65B/consolidated.07.pth
 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0  models/65B/ggml-model-f16.bin
 cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92  models/65B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q5_1.bin
 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b  models/65B/params.json
 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347  models/tokenizer.model
--- a/build.zig
+++ b/build.zig
@ -123,6 +123,7 @@ pub fn build(b: *std.build.Builder) !void {
    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
    const train = make.obj("train", "common/train.cpp");
    const clip = make.obj("clip", "examples/llava/clip.cpp");
    const llava = make.obj("llava", "examples/llava/llava.cpp");
    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
@ -131,7 +132,7 @@ pub fn build(b: *std.build.Builder) !void {
    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
--- a/ci/README.md
+++ b/ci/README.md
@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt
 # with CUDA support
 GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 # with SYCL support
 source /opt/intel/oneapi/setvars.sh
 GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 ```
--- a/ci/run.sh
+++ b/ci/run.sh
@ -10,6 +10,9 @@
 # # with CUDA support
 # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
 # # with SYCL support
 # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@ -22,14 +25,33 @@ mkdir -p "$2"
 OUT=$(realpath "$1")
 MNT=$(realpath "$2")
+# Remove results left over from a previous run. Only the variable is quoted:
+# a glob inside double quotes is not expanded, so "$OUT/*.log" would match nothing.
-rm -v $OUT/*.log
+rm -f "$OUT"/*.log
-rm -v $OUT/*.exit
+rm -f "$OUT"/*.exit
-rm -v $OUT/*.md
+rm -f "$OUT"/*.md
 sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
 fi
 if [ ! -z ${GG_BUILD_CUDA} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
 fi
 # SYCL build: refuse to continue unless ONEAPI_ROOT is set, then append the
 # SYCL-specific definitions (icx/icpx compilers, F16 enabled) to CMAKE_EXTRA.
 if [ ! -z ${GG_BUILD_SYCL} ]; then
    if [ -z ${ONEAPI_ROOT} ]; then
        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
        echo "source /opt/intel/oneapi/setvars.sh"
        exit 1
    fi
    # every cache definition needs its own leading -D, otherwise cmake does not treat it as one
    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
 fi
 ## helpers
 # download a file if it does not exist or if it is outdated
@ -81,10 +103,10 @@ function gg_run_ctest_debug {
    set -e
-    (time cmake -DCMAKE_BUILD_TYPE=Debug ..     ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                               ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j                                          ) 2>&1 | tee -a $OUT/${ci}-make.log
-    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    set +e
 }
@ -109,13 +131,13 @@ function gg_run_ctest_release {
    set -e
-    (time cmake -DCMAKE_BUILD_TYPE=Release ..   ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                               ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
    if [ -z ${GG_BUILD_LOW_PERF} ]; then
-        (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    else
-        (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    fi
    set +e
@ -131,6 +153,61 @@ function gg_sum_ctest_release {
    gg_printf '```\n'
 }
 # Print the path of the model file to use for the model-based ctest runs.
 # Checks the open-llama 3B-v2 f16 GGUF under $MNT first, then the 7B-v2 one.
 # The chosen path is written to stdout without a trailing newline; if neither
 # file qualifies, an error goes to stderr and the function exits with status 1.
 function gg_get_model {
    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
    # -s is true only when the file exists and is not empty
    if [[ -s $gguf_3b ]]; then
        echo -n "$gguf_3b"
    elif [[ -s $gguf_7b ]]; then
        echo -n "$gguf_7b"
    else
        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
        exit 1
    fi
 }
 # Run the ctest tests labelled "model" inside build-ci-debug, handing them the
 # model path from gg_get_model through LLAMACPP_TEST_MODELFILE.
 # Output (stdout and stderr) is appended to $OUT/${ci}-ctest.log.
 function gg_run_ctest_with_model_debug {
    cd ${SRC}
    # assignment is separate from `local` so $? reflects gg_get_model, not `local`
    local model; model=$(gg_get_model)
    cd build-ci-debug
    set -e
    # NOTE(review): with a leading assignment bash does not parse `time` as its reserved
    # word, so this presumably runs an external `time` binary - confirm it exists on CI hosts
    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
    set +e
    cd ..
 }
 # Run the ctest tests labelled "model" inside build-ci-release, handing them the
 # model path from gg_get_model through LLAMACPP_TEST_MODELFILE.
 # Output (stdout and stderr) is appended to $OUT/${ci}-ctest.log.
 function gg_run_ctest_with_model_release {
    cd ${SRC}
    # assignment is separate from `local` so $? reflects gg_get_model, not `local`
    local model; model=$(gg_get_model)
    cd build-ci-release
    set -e
    # NOTE(review): with a leading assignment bash does not parse `time` as its reserved
    # word, so this presumably runs an external `time` binary - confirm it exists on CI hosts
    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
    set +e
    cd ..
 }
 # Emit the markdown summary for the debug model-based ctest run via gg_printf:
 # a heading with ${ci}, the status read from $OUT/${ci}.exit, and the contents
 # of $OUT/${ci}-ctest.log inside a fenced block.
 function gg_sum_ctest_with_model_debug {
    gg_printf '### %s\n\n' "${ci}"
    gg_printf 'Runs ctest with model files in debug mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
 }
 # Emit the markdown summary for the release model-based ctest run via gg_printf:
 # a heading with ${ci}, the status read from $OUT/${ci}.exit, and the contents
 # of $OUT/${ci}-ctest.log inside a fenced block.
 function gg_sum_ctest_with_model_release {
    gg_printf '### %s\n\n' "${ci}"
    gg_printf 'Runs ctest with model files in release mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
 }
 # open_llama_3b_v2
 function gg_run_open_llama_3b_v2 {
@ -143,7 +220,7 @@ function gg_run_open_llama_3b_v2 {
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
-    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
@ -154,8 +231,8 @@ function gg_run_open_llama_3b_v2 {
    set -e
-    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                              ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
    python3 ../convert.py ${path_models}
@ -196,17 +273,19 @@ function gg_run_open_llama_3b_v2 {
    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
@ -235,6 +314,8 @@ function gg_run_open_llama_3b_v2 {
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
    # lora
    function compare_ppl {
        qnt="$1"
@ -263,20 +344,19 @@ function gg_run_open_llama_3b_v2 {
    python3 ../convert-lora-to-ggml.py ${path_lora}
    # f16
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
    # q8_0
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
    compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
    # q8_0 + f16 lora-base
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
    compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
    set +e
 }
@ -286,6 +366,7 @@ function gg_sum_open_llama_3b_v2 {
    gg_printf 'OpenLLaMA 3B-v2:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
@ -321,7 +402,7 @@ function gg_run_open_llama_7b_v2 {
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
-    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
    path_models="../models-mnt/open-llama/7B-v2"
@ -331,8 +412,8 @@ function gg_run_open_llama_7b_v2 {
    set -e
-    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                              ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
    python3 ../convert.py ${path_models}
@ -385,6 +466,8 @@ function gg_run_open_llama_7b_v2 {
    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    function check_ppl {
@ -412,6 +495,8 @@ function gg_run_open_llama_7b_v2 {
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
    # lora
    function compare_ppl {
        qnt="$1"
@ -463,6 +548,7 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf 'OpenLLaMA 7B-v2:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
@ -483,17 +569,69 @@ function gg_sum_open_llama_7b_v2 {
    #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }
 # bge-small
 # Download the BGE-small (bge-small-en-v1.5) files, do a fresh Release build in
 # build-ci-release, convert the model to GGUF, quantize it to q8_0 and run the
 # embedding example on both the f16 and the q8_0 model.
 # Logs are appended to $OUT/${ci}-{cmake,make,tg-f16,tg-q8_0}.log.
 function gg_run_embd_bge_small {
    cd ${SRC}
    # each file is fetched once (config.json was previously requested twice)
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json
    gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json
    # relative to the build directory entered below
    path_models="../models-mnt/bge-small"
    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
    # from here on any failing step aborts the run
    set -e
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
    python3 ../convert-hf-to-gguf.py ${path_models}
    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
    (time ./bin/embedding --model ${model_f16}  -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    set +e
 }
 # Emit the markdown summary for the BGE-small embedding run via gg_printf:
 # a heading with ${ci}, the status read from $OUT/${ci}.exit, and the f16 and
 # q8_0 embedding logs, each inside a fenced block.
 function gg_sum_embd_bge_small {
    gg_printf '### %s\n\n' "${ci}"
    gg_printf 'BGE Small (BERT):\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
 }
 ## main
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
    rm -rf ${SRC}/models-mnt
    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
    ln -sfn ${mnt_models} ${SRC}/models-mnt
-    python3 -m pip install -r ${SRC}/requirements.txt
+    # Create a fresh python3 venv and enter it
-    python3 -m pip install --editable gguf-py
+    python3 -m venv "$MNT/venv"
    source "$MNT/venv/bin/activate"
    pip install -r ${SRC}/requirements.txt --disable-pip-version-check
    pip install --editable gguf-py --disable-pip-version-check
 fi
 ret=0
@ -502,12 +640,16 @@ test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small
    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
        if [ -z ${GG_BUILD_CUDA} ]; then
            test $ret -eq 0 && gg_run open_llama_3b_v2
        else
            test $ret -eq 0 && gg_run open_llama_7b_v2
        fi
        test $ret -eq 0 && gg_run ctest_with_model_debug
        test $ret -eq 0 && gg_run ctest_with_model_release
    fi
 fi
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -11,10 +11,20 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
    if(NOT IS_DIRECTORY "${GIT_DIR}")
        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
-        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+        string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS)
        if (SLASH_POS EQUAL 0)
            set(GIT_DIR "${REAL_GIT_DIR}")
        else()
            set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
        endif()
    endif()
-    set(GIT_INDEX "${GIT_DIR}/index")
+    if(EXISTS "${GIT_DIR}/index")
        set(GIT_INDEX "${GIT_DIR}/index")
    else()
        message(WARNING "Git index not found in git repository.")
        set(GIT_INDEX "")
    endif()
 else()
    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
    set(GIT_INDEX "")
@ -26,7 +36,7 @@ add_custom_command(
    COMMENT "Generating build details from Git"
    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
    VERBATIM
@ -60,4 +70,4 @@ endif()
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama build_info)
+target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
--- a/common/common.cpp
+++ b/common/common.cpp
@ -42,6 +42,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 #if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
 #define GGML_USE_CUBLAS_SYCL
 #endif
 #if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
 #define GGML_USE_CUBLAS_SYCL_VULKAN
 #endif
 int32_t get_num_physical_cores() {
 #ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
@ -167,6 +175,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            if (params.n_threads_batch <= 0) {
                params.n_threads_batch = std::thread::hardware_concurrency();
            }
        } else if (arg == "-td" || arg == "--threads-draft") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_threads_draft = std::stoi(argv[i]);
            if (params.n_threads_draft <= 0) {
                params.n_threads_draft = std::thread::hardware_concurrency();
            }
        } else if (arg == "-tbd" || arg == "--threads-batch-draft") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_threads_batch_draft = std::stoi(argv[i]);
            if (params.n_threads_batch_draft <= 0) {
                params.n_threads_batch_draft = std::thread::hardware_concurrency();
            }
        } else if (arg == "-p" || arg == "--prompt") {
            if (++i >= argc) {
                invalid_param = true;
@ -185,6 +211,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.prompt_cache_all = true;
        } else if (arg == "--prompt-cache-ro") {
            params.prompt_cache_ro = true;
        } else if (arg == "-bf" || arg == "--binary-file") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            std::ifstream file(argv[i], std::ios::binary);
            if (!file) {
                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
                invalid_param = true;
                break;
            }
            // store the external file name in params
            params.prompt_file = argv[i];
            std::ostringstream ss;
            ss << file.rdbuf();
            params.prompt = ss.str();
            fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
        } else if (arg == "-f" || arg == "--file") {
            if (++i >= argc) {
                invalid_param = true;
@ -220,6 +263,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_ctx = std::stoi(argv[i]);
        } else if (arg == "--grp-attn-n" || arg == "-gan") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.grp_attn_n = std::stoi(argv[i]);
        } else if (arg == "--grp-attn-w" || arg == "-gaw") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.grp_attn_w = std::stoi(argv[i]);
        } else if (arg == "--rope-freq-base") {
            if (++i >= argc) {
                invalid_param = true;
@ -238,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            std::string value(argv[i]);
-            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
+            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
-            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
+            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
-            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
+            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
            else { invalid_param = true; break; }
        } else if (arg == "--rope-scale") {
            if (++i >= argc) {
@ -278,8 +335,35 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.yarn_beta_slow = std::stof(argv[i]);
-        } else if (arg == "--memory-f32") {
+        } else if (arg == "--pooling") {
-            params.memory_f16 = false;
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            std::string value(argv[i]);
            /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
            else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
            else if (value == "cls")  { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
            else { invalid_param = true; break; }
        } else if (arg == "--defrag-thold" || arg == "-dt") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.defrag_thold = std::stof(argv[i]);
        } else if (arg == "--samplers") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            const auto sampler_names = string_split(argv[i], ';');
            sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
        } else if (arg == "--sampling-seq") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
        } else if (arg == "--top-p") {
            if (++i >= argc) {
                invalid_param = true;
@ -336,6 +420,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            sparams.penalty_present = std::stof(argv[i]);
        } else if (arg == "--dynatemp-range") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            sparams.dynatemp_range = std::stof(argv[i]);
        } else if (arg == "--dynatemp-exp") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            sparams.dynatemp_exponent = std::stof(argv[i]);
        } else if (arg == "--mirostat") {
            if (++i >= argc) {
                invalid_param = true;
@ -417,12 +513,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_sequences = std::stoi(argv[i]);
        } else if (arg == "--p-accept" || arg == "-pa") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.p_accept = std::stof(argv[i]);
        } else if (arg == "--p-split" || arg == "-ps") {
            if (++i >= argc) {
                invalid_param = true;
@ -452,7 +542,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
+            params.lora_adapter.emplace_back(argv[i], 1.0f);
            params.use_mmap = false;
        } else if (arg == "--lora-scaled") {
            if (++i >= argc) {
@ -464,7 +554,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
+            params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
            params.use_mmap = false;
        } else if (arg == "--lora-base") {
            if (++i >= argc) {
@ -498,6 +588,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.infill = true;
        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
            params.dump_kv_cache = true;
        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
            params.no_kv_offload = true;
        } else if (arg == "-ctk" || arg == "--cache-type-k") {
            params.cache_type_k = argv[++i];
        } else if (arg == "-ctv" || arg == "--cache-type-v") {
            params.cache_type_v = argv[++i];
        } else if (arg == "--multiline-input") {
            params.multiline_input = true;
        } else if (arg == "--simple-io") {
@ -513,75 +609,101 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
            params.n_gpu_layers = std::stoi(argv[i]);
-#else
+            if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
+            }
        } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
            params.n_gpu_layers_draft = std::stoi(argv[i]);
-#else
+            if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
+            }
        } else if (arg == "--main-gpu" || arg == "-mg") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
 #ifdef GGML_USE_CUBLAS
            params.main_gpu = std::stoi(argv[i]);
-#else
+#ifndef GGML_USE_CUBLAS_SYCL
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
-#endif
+#endif // GGML_USE_CUBLAS_SYCL
        } else if (arg == "--split-mode" || arg == "-sm") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            std::string arg_next = argv[i];
            if (arg_next == "none") {
                params.split_mode = LLAMA_SPLIT_MODE_NONE;
            } else if (arg_next == "layer") {
                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
            } else if (arg_next == "row") {
 #ifdef GGML_USE_SYCL
                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
                exit(1);
 #endif // GGML_USE_SYCL
                params.split_mode = LLAMA_SPLIT_MODE_ROW;
            } else {
                invalid_param = true;
                break;
            }
 #ifndef GGML_USE_CUBLAS_SYCL
            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
 #endif // GGML_USE_CUBLAS_SYCL
        } else if (arg == "--tensor-split" || arg == "-ts") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
 #ifdef GGML_USE_CUBLAS
            std::string arg_next = argv[i];
            // split string by , and /
            const std::regex regex{R"([,/]+)"};
            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
            std::vector<std::string> split_arg{it, {}};
-            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+            if (split_arg.size() >= llama_max_devices()) {
-
+                invalid_param = true;
-            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                break;
            }
            for (size_t i = 0; i < llama_max_devices(); ++i) {
                if (i < split_arg.size()) {
                    params.tensor_split[i] = std::stof(split_arg[i]);
                } else {
                    params.tensor_split[i] = 0.0f;
                }
            }
-#else
+#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUBLAS_SYCL
        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
 #ifdef GGML_USE_CUBLAS
            params.mul_mat_q = false;
 #else
            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
 #endif // GGML_USE_CUBLAS
        } else if (arg == "--no-mmap") {
            params.use_mmap = false;
        } else if (arg == "--numa") {
-            params.numa = true;
+            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            std::string value(argv[i]);
            /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
            else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
            else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
            else { invalid_param = true; break; }
        } else if (arg == "--verbose-prompt") {
            params.verbose_prompt = true;
        } else if (arg == "--no-display-prompt") {
            params.display_prompt = false;
        } else if (arg == "-r" || arg == "--reverse-prompt") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.antiprompt.push_back(argv[i]);
+            params.antiprompt.emplace_back(argv[i]);
        } else if (arg == "-ld" || arg == "--logdir") {
            if (++i >= argc) {
                invalid_param = true;
@ -592,6 +714,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
                params.logdir += DIRECTORY_SEPARATOR;
            }
        } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.logits_file = argv[i];
        } else if (arg == "--perplexity" || arg == "--all-logits") {
            params.logits_all = true;
        } else if (arg == "--ppl-stride") {
@ -600,6 +728,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.ppl_stride = std::stoi(argv[i]);
        } else if (arg == "-ptc" || arg == "--print-token-count") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_print = std::stoi(argv[i]);
        } else if (arg == "--ppl-output-type") {
            if (++i >= argc) {
                invalid_param = true;
@ -614,6 +748,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.hellaswag_tasks = std::stoi(argv[i]);
        } else if (arg == "--winogrande") {
            params.winogrande = true;
        } else if (arg == "--winogrande-tasks") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.winogrande_tasks = std::stoi(argv[i]);
        } else if (arg == "--multiple-choice") {
            params.multiple_choice = true;
        } else if (arg == "--multiple-choice-tasks") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.multiple_choice_tasks = std::stoi(argv[i]);
        } else if (arg == "--kl-divergence") {
            params.kl_divergence = true;
        } else if (arg == "--ignore-eos") {
            params.ignore_eos = true;
        } else if (arg == "--no-penalize-nl") {
@ -640,6 +792,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        } else if (arg == "-h" || arg == "--help") {
            return false;
        } else if (arg == "--version") {
            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
        } else if (arg == "--in-prefix-bos") {
@ -678,6 +834,47 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                std::istreambuf_iterator<char>(),
                std::back_inserter(sparams.grammar)
            );
        } else if (arg == "--override-kv") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            char * sep = strchr(argv[i], '=');
            if (sep == nullptr || sep - argv[i] >= 128) {
                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
                invalid_param = true;
                break;
            }
            struct llama_model_kv_override kvo;
            std::strncpy(kvo.key, argv[i], sep - argv[i]);
            kvo.key[sep - argv[i]] = 0;
            sep++;
            if (strncmp(sep, "int:", 4) == 0) {
                sep += 4;
                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
                kvo.int_value = std::atol(sep);
            } else if (strncmp(sep, "float:", 6) == 0) {
                sep += 6;
                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
                kvo.float_value = std::atof(sep);
            } else if (strncmp(sep, "bool:", 5) == 0) {
                sep += 5;
                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
                if (std::strcmp(sep, "true") == 0) {
                    kvo.bool_value = true;
                } else if (std::strcmp(sep, "false") == 0) {
                    kvo.bool_value = false;
                } else {
                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
                    invalid_param = true;
                    break;
                }
            } else {
                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
                invalid_param = true;
                break;
            }
            params.kv_overrides.push_back(kvo);
 #ifndef LOG_DISABLE_LOGS
        // Parse args for logging parameters
        } else if ( log_param_single_parse( argv[i] ) ) {
@ -721,17 +918,31 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        }
    }
    if (!params.kv_overrides.empty()) {
        params.kv_overrides.emplace_back();
        params.kv_overrides.back().key[0] = 0;
    }
    return true;
 }
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    const llama_sampling_params & sparams = params.sparams;
    std::string sampler_type_chars;
    std::string sampler_type_names;
    for (const auto sampler_type : sparams.samplers_sequence) {
        sampler_type_chars += static_cast<char>(sampler_type);
        sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
    }
    sampler_type_names.pop_back();
    printf("\n");
    printf("usage: %s [options]\n", argv[0]);
    printf("\n");
    printf("options:\n");
    printf("  -h, --help            show this help message and exit\n");
    printf("  --version             show version and build info\n");
    printf("  -i, --interactive     run in interactive mode\n");
    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
@ -745,6 +956,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
    printf("  -tb N, --threads-batch N\n");
    printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
    printf("  -td N, --threads-draft N");
    printf("                        number of threads to use during generation (default: same as --threads)\n");
    printf("  -tbd N, --threads-batch-draft N\n");
    printf("                        number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
    printf("  -p PROMPT, --prompt PROMPT\n");
    printf("                        prompt to start generation with (default: empty)\n");
    printf("  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
@ -758,9 +973,14 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
    printf("  -f FNAME, --file FNAME\n");
    printf("                        prompt file to start generation.\n");
    printf("  -bf FNAME, --binary-file FNAME\n");
    printf("                        binary file containing multiple choice tasks.\n");
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\'\n");
    printf("                        (default: %s)\n", sampler_type_names.c_str());
    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
@ -770,6 +990,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
    printf("  --dynatemp-range N    dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
    printf("  --dynatemp-exp N      dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
    printf("  --mirostat N          use Mirostat sampling.\n");
    printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
@ -796,50 +1018,71 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --yarn-attn-factor N  YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
    printf("  --yarn-beta-slow N    YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
    printf("  --yarn-beta-fast N    YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
    printf("  --pooling {none,mean,cls}\n");
    printf("                        pooling type for embeddings, use model default if unspecified\n");
    printf("  -dt N, --defrag-thold N\n");
    printf("                        KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    printf("  --no-penalize-nl      do not penalize newline token\n");
    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
    printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
-    printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
+    printf("  --all-logits          return logits for all tokens in the batch (default: disabled)\n");
    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
    printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
    printf("  --winogrande          compute Winogrande score over random tasks from datafile supplied with -f\n");
    printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
    printf("  --multiple-choice     compute multiple choice score over random tasks from datafile supplied with -f\n");
    printf("  --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
    printf("  --kl-divergence       computes KL-divergence to logits provided via --kl-divergence-base\n");
    printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
    printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
    printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
    printf("  -np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
    printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
    printf("  -pa N, --p-accept N   speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
    printf("  -ps N, --p-split N    speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
    printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
    printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
-    if (llama_mlock_supported()) {
+    if (llama_supports_mlock()) {
        printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
-    if (llama_mmap_supported()) {
+    if (llama_supports_mmap()) {
        printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
-    printf("  --numa                attempt optimizations that help on some NUMA systems\n");
+    printf("  --numa TYPE           attempt optimizations that help on some NUMA systems\n");
    printf("                          - distribute: spread execution evenly over all nodes\n");
    printf("                          - isolate: only spawn threads on CPUs on the node that execution started on\n");
    printf("                          - numactl: use the CPU map provided by numactl\n");
    printf("                        if run without this previously, it is recommended to drop the system page cache before using this\n");
    printf("                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+    if (llama_supports_gpu_offload()) {
-    printf("  -ngl N, --n-gpu-layers N\n");
+        printf("  -ngl N, --n-gpu-layers N\n");
-    printf("                        number of layers to store in VRAM\n");
+        printf("                        number of layers to store in VRAM\n");
-    printf("  -ngld N, --n-gpu-layers-draft N\n");
+        printf("  -ngld N, --n-gpu-layers-draft N\n");
-    printf("                        number of layers to store in VRAM for the draft model\n");
+        printf("                        number of layers to store in VRAM for the draft model\n");
-    printf("  -ts SPLIT --tensor-split SPLIT\n");
+        printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
-    printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+        printf("                        how to split the model across multiple GPUs, one of:\n");
-    printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+        printf("                          - none: use one GPU only\n");
-#ifdef GGML_USE_CUBLAS
+        printf("                          - layer (default): split layers and KV across GPUs\n");
-    printf("  -nommq, --no-mul-mat-q\n");
+        printf("                          - row: split rows across GPUs\n");
-    printf("                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
+        printf("  -ts SPLIT, --tensor-split SPLIT\n");
-    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
+        printf("                        fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
-#endif // GGML_USE_CUBLAS
+        printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
-#endif
+        printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
-    printf("  --verbose-prompt      print prompt before generation\n");
+    }
    printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
    printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
    printf("  -gan N, --grp-attn-n N\n");
    printf("                        group-attention factor (default: %d)\n", params.grp_attn_n);
    printf("  -gaw N, --grp-attn-w N\n");
    printf("                        group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
    printf("  -dkvc, --dump-kv-cache\n");
    printf("                        verbose print of the KV cache\n");
    printf("  -nkvo, --no-kv-offload\n");
    printf("                        disable KV offload\n");
    printf("  -ctk TYPE, --cache-type-k TYPE\n");
    printf("                        KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
    printf("  -ctv TYPE, --cache-type-v TYPE\n");
    printf("                        KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
    printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
    printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
@ -847,9 +1090,14 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -m FNAME, --model FNAME\n");
    printf("                        model path (default: %s)\n", params.model.c_str());
    printf("  -md FNAME, --model-draft FNAME\n");
-    printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
+    printf("                        draft model for speculative decoding\n");
    printf("  -ld LOGDIR, --logdir LOGDIR\n");
    printf("                        path under which to save YAML logs (no logging if unset)\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("  -ptc N, --print-token-count N\n");
    printf("                        print token count every N tokens (default: %d)\n", params.n_print);
    printf("\n");
 #ifndef LOG_DISABLE_LOGS
    log_print_usage();
@ -886,6 +1134,104 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
    GGML_UNREACHABLE();
 }
 //
 // String utils
 //
 // Splits `input` at every occurrence of `separator`.
 // Empty fields are kept, so the result always has (number of separators + 1) entries;
 // an empty input therefore yields a single empty string.
 std::vector<std::string> string_split(std::string input, char separator) {
    std::vector<std::string> parts;
    // walk the string with a moving offset instead of re-copying the remaining tail after each separator
    size_t part_start = 0;
    size_t separator_pos = input.find(separator);
    while (separator_pos != std::string::npos) {
        parts.emplace_back(input.substr(part_start, separator_pos - part_start));
        part_start = separator_pos + 1;
        separator_pos = input.find(separator, part_start);
    }
    // whatever follows the last separator (possibly nothing) is the final field
    parts.emplace_back(input.substr(part_start));
    return parts;
 }
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
        {"top_k",       llama_sampler_type::TOP_K},
        {"top_p",       llama_sampler_type::TOP_P},
        {"typical_p",   llama_sampler_type::TYPICAL_P},
        {"min_p",       llama_sampler_type::MIN_P},
        {"tfs_z",       llama_sampler_type::TFS_Z},
        {"temperature", llama_sampler_type::TEMPERATURE}
    };
    // since samplers names are written multiple ways
    // make it ready for both system names and input names
    std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
        {"top-k",       llama_sampler_type::TOP_K},
        {"top-p",       llama_sampler_type::TOP_P},
        {"nucleus",     llama_sampler_type::TOP_P},
        {"typical-p",   llama_sampler_type::TYPICAL_P},
        {"typical",     llama_sampler_type::TYPICAL_P},
        {"min-p",       llama_sampler_type::MIN_P},
        {"tfs-z",       llama_sampler_type::TFS_Z},
        {"tfs",         llama_sampler_type::TFS_Z},
        {"temp",        llama_sampler_type::TEMPERATURE}
    };
    std::vector<llama_sampler_type> sampler_types;
    sampler_types.reserve(names.size());
    for (const auto & name : names)
    {
        auto sampler_item = sampler_canonical_name_map.find(name);
        if (sampler_item != sampler_canonical_name_map.end())
        {
            sampler_types.push_back(sampler_item->second);
        }
        else
        {
            if (allow_alt_names)
            {
                sampler_item = sampler_alt_name_map.find(name);
                if (sampler_item != sampler_alt_name_map.end())
                {
                    sampler_types.push_back(sampler_item->second);
                }
            }
        }
    }
    return sampler_types;
 }
 std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
    std::unordered_map<char, llama_sampler_type> sampler_name_map {
        {'k', llama_sampler_type::TOP_K},
        {'p', llama_sampler_type::TOP_P},
        {'y', llama_sampler_type::TYPICAL_P},
        {'m', llama_sampler_type::MIN_P},
        {'f', llama_sampler_type::TFS_Z},
        {'t', llama_sampler_type::TEMPERATURE}
    };
    std::vector<llama_sampler_type> sampler_types;
    sampler_types.reserve(names_string.size());
    for (const auto & c : names_string) {
        const auto sampler_item = sampler_name_map.find(c);
        if (sampler_item != sampler_name_map.end()) {
            sampler_types.push_back(sampler_item->second);
        }
    }
    return sampler_types;
 }
 // Returns the canonical textual name of a sampler type ("top_k", "top_p", ...).
 // Any value without a case below maps to an empty string.
 std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
    const char * name = "";
    switch (sampler_type) {
        case llama_sampler_type::TOP_K:       name = "top_k";       break;
        case llama_sampler_type::TOP_P:       name = "top_p";       break;
        case llama_sampler_type::TYPICAL_P:   name = "typical_p";   break;
        case llama_sampler_type::MIN_P:       name = "min_p";       break;
        case llama_sampler_type::TFS_Z:       name = "tfs_z";       break;
        case llama_sampler_type::TEMPERATURE: name = "temperature"; break;
        default : break;
    }
    return name;
 }
 //
 // Model utils
 //
@ -897,25 +1243,57 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
    mparams.main_gpu        = params.main_gpu;
    mparams.split_mode      = params.split_mode;
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
        mparams.kv_overrides = params.kv_overrides.data();
    }
    return mparams;
 }
 // Maps a KV cache type name to the matching ggml_type.
 // Accepted names: f32, f16, q8_0, q4_0, q4_1, q5_0, q5_1 (exact, case-sensitive match).
 // Throws std::runtime_error for any other string.
 static ggml_type kv_cache_type_from_str(const std::string & s) {
    static const struct {
        const char * name;
        ggml_type    type;
    } known_types[] = {
        { "f32",  GGML_TYPE_F32  },
        { "f16",  GGML_TYPE_F16  },
        { "q8_0", GGML_TYPE_Q8_0 },
        { "q4_0", GGML_TYPE_Q4_0 },
        { "q4_1", GGML_TYPE_Q4_1 },
        { "q5_0", GGML_TYPE_Q5_0 },
        { "q5_1", GGML_TYPE_Q5_1 },
    };
    for (const auto & candidate : known_types) {
        if (s == candidate.name) {
            return candidate.type;
        }
    }
    throw std::runtime_error("Invalid cache type: " + s);
 }
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    auto cparams = llama_context_default_params();
    cparams.n_ctx             = params.n_ctx;
    cparams.n_batch           = params.n_batch;
    cparams.n_parallel        = params.n_parallel;
    cparams.n_threads         = params.n_threads;
    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    cparams.mul_mat_q         = params.mul_mat_q;
    cparams.seed              = params.seed;
    cparams.f16_kv            = params.memory_f16;
    cparams.logits_all        = params.logits_all;
-    cparams.embedding         = params.embedding;
+    cparams.embeddings        = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
    cparams.rope_freq_base    = params.rope_freq_base;
    cparams.rope_freq_scale   = params.rope_freq_scale;
@ -924,6 +1302,12 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.yarn_beta_fast    = params.yarn_beta_fast;
    cparams.yarn_beta_slow    = params.yarn_beta_slow;
    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
    cparams.pooling_type      = params.pooling_type;
    cparams.defrag_thold      = params.defrag_thold;
    cparams.offload_kqv       = !params.no_kv_offload;
    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
    return cparams;
 }
@ -1244,13 +1628,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "build_number: %d\n",        LLAMA_BUILD_NUMBER);
    fprintf(stream, "cpu_has_arm_fma: %s\n",     ggml_cpu_has_arm_fma()     ? "true" : "false");
    fprintf(stream, "cpu_has_avx: %s\n",         ggml_cpu_has_avx()         ? "true" : "false");
    fprintf(stream, "cpu_has_avx_vnni: %s\n",    ggml_cpu_has_avx_vnni()    ? "true" : "false");
    fprintf(stream, "cpu_has_avx2: %s\n",        ggml_cpu_has_avx2()        ? "true" : "false");
    fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
    fprintf(stream, "cpu_has_cublas: %s\n",      ggml_cpu_has_cublas()      ? "true" : "false");
    fprintf(stream, "cpu_has_vulkan: %s\n",      ggml_cpu_has_vulkan()      ? "true" : "false");
    fprintf(stream, "cpu_has_clblast: %s\n",     ggml_cpu_has_clblast()     ? "true" : "false");
    fprintf(stream, "cpu_has_kompute: %s\n",     ggml_cpu_has_kompute()     ? "true" : "false");
    fprintf(stream, "cpu_has_fma: %s\n",         ggml_cpu_has_fma()         ? "true" : "false");
    fprintf(stream, "cpu_has_gpublas: %s\n",     ggml_cpu_has_gpublas()     ? "true" : "false");
    fprintf(stream, "cpu_has_neon: %s\n",        ggml_cpu_has_neon()        ? "true" : "false");
@ -1260,6 +1646,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
    fprintf(stream, "cpu_has_sse3: %s\n",        ggml_cpu_has_sse3()        ? "true" : "false");
    fprintf(stream, "cpu_has_vsx: %s\n",         ggml_cpu_has_vsx()         ? "true" : "false");
    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
 #ifdef NDEBUG
    fprintf(stream, "debug: false\n");
@ -1336,7 +1723,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    }
    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
+    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
@ -1348,9 +1735,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
    fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
    fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
    fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
@ -1375,21 +1760,22 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
+    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
-    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
+    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
    dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
 }
 //
@ -1434,7 +1820,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
            if (cs_curr[j] < 0) { continue; }
            if (seqs.find(cs_curr[j]) == seqs.end()) {
                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                seqs[cs_curr[j]] = seqs.size();
+                const size_t sz = seqs.size();
                seqs[cs_curr[j]] = sz;
            }
        }
        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
--- a/common/common.h
+++ b/common/common.h
@ -43,34 +43,43 @@ extern char const *LLAMA_BUILD_TARGET;
 int32_t get_num_physical_cores();
 struct gpt_params {
-    uint32_t seed                           = -1;    // RNG seed
+    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
-    int32_t n_threads                       = get_num_physical_cores();
+    int32_t n_threads             = get_num_physical_cores();
-    int32_t n_threads_batch                 = -1;    // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_threads_draft       = -1;
-    int32_t n_predict                       = -1;    // new tokens to predict
+    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_ctx                           = 512;   // context size
+    int32_t n_threads_batch_draft = -1;
-    int32_t n_batch                         = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_predict             = -1;    // new tokens to predict
-    int32_t n_keep                          = 0;     // number of tokens to keep from initial prompt
+    int32_t n_ctx                 = 512;   // context size
-    int32_t n_draft                         = 16;    // number of tokens to draft during speculative decoding
+    int32_t n_batch               = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_chunks                        = -1;    // max number of chunks to process (-1 = unlimited)
+    int32_t n_keep                = 0;     // number of tokens to keep from initial prompt
-    int32_t n_parallel                      = 1;     // number of parallel sequences to decode
+    int32_t n_draft               = 5;     // number of tokens to draft during speculative decoding
-    int32_t n_sequences                     = 1;     // number of sequences to decode
+    int32_t n_chunks              = -1;    // max number of chunks to process (-1 = unlimited)
-    float   p_accept                        = 0.5f;  // speculative decoding accept probability
+    int32_t n_parallel            = 1;     // number of parallel sequences to decode
-    float   p_split                         = 0.1f;  // speculative decoding split probability
+    int32_t n_sequences           = 1;     // number of sequences to decode
-    int32_t n_gpu_layers                    = -1;    // number of layers to store in VRAM (-1 - use default)
+    float   p_split               = 0.1f;  // speculative decoding split probability
-    int32_t n_gpu_layers_draft              = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
+    int32_t n_gpu_layers          = -1;    // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu                        = 0;     // the GPU that is used for scratch and small tensors
+    int32_t n_gpu_layers_draft    = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
-    float   tensor_split[LLAMA_MAX_DEVICES] = {0};   // how split tensors should be distributed across GPUs
+    llama_split_mode split_mode   = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
-    int32_t n_beams                         = 0;     // if non-zero then use beam search of given width.
+    int32_t main_gpu              = 0;     // the GPU that is used for scratch and small tensors
-    float   rope_freq_base                  = 0.0f;  // RoPE base frequency
+    float   tensor_split[128]     = {0};   // how split tensors should be distributed across GPUs
-    float   rope_freq_scale                 = 0.0f;  // RoPE frequency scaling factor
+    int32_t n_beams               = 0;     // if non-zero then use beam search of given width.
-    float   yarn_ext_factor                 = -1.0f; // YaRN extrapolation mix factor
+    int32_t grp_attn_n            = 1;     // group-attention factor
-    float   yarn_attn_factor                = 1.0f;  // YaRN magnitude scaling factor
+    int32_t grp_attn_w            = 512;   // group-attention width
-    float   yarn_beta_fast                  = 32.0f; // YaRN low correction dim
+    int32_t n_print               = -1;    // print token count every n tokens (-1 = disabled)
-    float   yarn_beta_slow                  = 1.0f;  // YaRN high correction dim
+    float   rope_freq_base        = 0.0f;  // RoPE base frequency
-    int32_t yarn_orig_ctx                   = 0;     // YaRN original context length
+    float   rope_freq_scale       = 0.0f;  // RoPE frequency scaling factor
-    int8_t  rope_scaling_type               = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
+    float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
-                                                                              //       pinging @cebtenzzre
+    float   yarn_attn_factor      = 1.0f;  // YaRN magnitude scaling factor
    float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
    float   yarn_beta_slow        = 1.0f;  // YaRN high correction dim
    int32_t yarn_orig_ctx         = 0;     // YaRN original context length
    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    // sampling parameters
    struct llama_sampling_params sparams;
@ -85,6 +94,9 @@ struct gpt_params {
    std::string input_suffix      = "";  // string to suffix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir            = "";  // directory in which to save YAML log files
    std::string logits_file       = "";  // file for saving *all* logits
    std::vector<llama_model_kv_override> kv_overrides;
    // TODO: avoid tuple, use struct
    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
@ -97,8 +109,14 @@ struct gpt_params {
    bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
-    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
+    bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
+    size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
    bool   kl_divergence   = false; // compute KL-divergence
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
@ -119,14 +137,18 @@ struct gpt_params {
    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool numa              = false; // attempt optimizations that help on some NUMA systems
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
    bool infill            = false; // use infill mode
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
    std::string cache_type_k = "f16"; // KV cache data type for the K
    std::string cache_type_v = "f16"; // KV cache data type for the V
    // multimodal models (see examples/llava)
    std::string mmproj = ""; // path to multimodal projector
-    std::string image = ""; // path to an image file
+    std::string image  = ""; // path to an image file
 };
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
@ -141,6 +163,15 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 void process_escapes(std::string& input);
 //
 // String utils
 //
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);
 std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
 //
 // Model utils
 //
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@ -190,7 +190,7 @@ namespace grammar_parser {
                pos = parse_space(pos + 1, is_nested);
            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
                if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
+                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
                }
                // apply transformation to previous symbol (last_sym_start to end) according to
--- a/common/log.h
+++ b/common/log.h
@ -61,13 +61,13 @@
 //  #define LOG_TARGET stderr
 //  #include "log.h"
 //
-//  The log target can also be redirected to a diffrent function
+//  The log target can also be redirected to a different function
 //  like so:
 //
-//  #define LOG_TARGET log_handler_diffrent()
+//  #define LOG_TARGET log_handler_different()
 //  #include "log.h"
 //
-//  FILE* log_handler_diffrent()
+//  FILE* log_handler_different()
 //  {
 //      return stderr;
 //  }
@ -297,7 +297,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 #ifndef _MSC_VER
    #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
 #else
-    #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
+    #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
 #endif
 // Main TEE macro.
@ -311,7 +311,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 #ifndef _MSC_VER
    #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
 #else
-    #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
+    #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
 #endif
 // LOG macro variants with auto endline.
@ -319,8 +319,8 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
    #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
    #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
 #else
-    #define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
+    #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
-    #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
+    #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
 #endif
 // INTERNAL, DO NOT USE
@ -421,7 +421,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS
 // Disables logs entirely at runtime.
 //  Makes LOG() and LOG_TEE() produce no output,
-//  untill enabled back.
+//  until enabled back.
 #define log_disable() log_disable_impl()
 // INTERNAL, DO NOT USE
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -13,6 +13,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
        // will be empty (default) if there are parse errors
        if (result->parsed_grammar.rules.empty()) {
            fprintf(stderr, "%s: failed to parse grammar\n", __func__);
            delete result;
            return nullptr;
        }
@ -99,21 +100,70 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
    return std::string(result);
 }
-llama_token llama_sampling_sample(
+std::string llama_sampling_order_print(const llama_sampling_params & params) {
    // Builds a one-line, human-readable description of the sampling pipeline order,
    // in the form "CFG -> Penalties -> <sampler> -> <sampler> ... ".
    // Every stage is followed by a single space, so the returned string ends with one.
    std::string result = "CFG -> Penalties ";
    if (params.mirostat == 0) {
        // mirostat disabled: list the configured samplers in the order they appear in samplers_sequence
        for (auto sampler_type : params.samplers_sequence) {
            const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
            // sampler types without a textual name come back empty and are left out
            if (!sampler_type_name.empty()) {
                result += "-> " + sampler_type_name + " ";
            }
        }
    } else {
        // any non-zero mirostat setting is reported as a single "mirostat" stage
        result += "-> mirostat ";
    }
    return result;
 }
 // no reasons to expose this function in header
 static void sampler_queue(
                   struct llama_context * ctx_main,
            const llama_sampling_params & params,
                 llama_token_data_array & cur_p,
                                 size_t   min_keep) {
    const float         temp              = params.temp;
    const float         dynatemp_range    = params.dynatemp_range;
    const float         dynatemp_exponent = params.dynatemp_exponent;
    const int32_t       top_k             = params.top_k;
    const float         top_p             = params.top_p;
    const float         min_p             = params.min_p;
    const float         tfs_z             = params.tfs_z;
    const float         typical_p         = params.typical_p;
    const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
    for (auto sampler_type : samplers_sequence) {
        switch (sampler_type) {
            case llama_sampler_type::TOP_K    : llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
            case llama_sampler_type::TFS_Z    : llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
            case llama_sampler_type::TYPICAL_P: llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
            case llama_sampler_type::TOP_P    : llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
            case llama_sampler_type::MIN_P    : llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
            case llama_sampler_type::TEMPERATURE:
                if (dynatemp_range > 0) {
                    float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
                    float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
                } else {
                    llama_sample_temp(ctx_main, &cur_p, temp);
                }
                break;
            default : break;
        }
    }
 }
 static llama_token llama_sampling_sample_impl(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
-                  const int idx) {
+                  const int idx,
                  bool is_resampling) {  // Add a parameter to indicate if we are resampling
    const llama_sampling_params & params = ctx_sampling->params;
    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
    const float   temp            = params.temp;
    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
    const float   top_p           = params.top_p;
    const float   min_p           = params.min_p;
    const float   tfs_z           = params.tfs_z;
    const float   typical_p       = params.typical_p;
    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
    const float   penalty_repeat  = params.penalty_repeat;
    const float   penalty_freq    = params.penalty_freq;
@ -128,13 +178,27 @@ llama_token llama_sampling_sample(
    llama_token id = 0;
    // Get a pointer to the logits
    float * logits = llama_get_logits_ith(ctx_main, idx);
    // Declare original_logits at the beginning of the function scope
    std::vector<float> original_logits;
    if (!is_resampling) {
        // Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this.
        original_logits = std::vector<float>(logits, logits + llama_n_vocab(llama_get_model(ctx_main)));
    }
    // apply params.logit_bias map
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
        logits[it->first] += it->second;
    }
    if (ctx_cfg) {
        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
    }
    cur.clear();
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@ -143,17 +207,15 @@ llama_token llama_sampling_sample(
    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
    if (ctx_cfg) {
        llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale);
    }
    // apply penalties
-    if (!prev.empty()) {
+    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
    if (penalty_tokens_used_size) {
        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
        llama_sample_repetition_penalties(ctx_main, &cur_p,
-                prev.data() + prev.size() - penalty_last_n,
+                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
-                penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
+                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
        if (!penalize_nl) {
            for (size_t idx = 0; idx < cur_p.size; idx++) {
@ -165,7 +227,8 @@ llama_token llama_sampling_sample(
        }
    }
-    if (ctx_sampling->grammar != NULL) {
+    // If we are in the resampling phase, apply grammar checks before sampling logic
    if (is_resampling && ctx_sampling->grammar != NULL) {
        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
    }
@ -186,14 +249,9 @@ llama_token llama_sampling_sample(
            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
        } else {
            // temperature sampling
-            size_t min_keep = std::max(1, params.n_probs);
+            size_t min_keep = std::max(1, params.min_keep);
-            llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep);
+            sampler_queue(ctx_main, params, cur_p, min_keep);
            llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
            llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
            llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
            llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep);
            llama_sample_temp     (ctx_main, &cur_p, temp);
            id = llama_sample_token(ctx_main, &cur_p);
@ -208,13 +266,123 @@ llama_token llama_sampling_sample(
            //    }
            //}
-            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
+            //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
        }
    }
    if (ctx_sampling->grammar != NULL && !is_resampling) {
        // Create an array with a single token data element for the sampled id
        llama_token_data single_token_data = {id, logits[id], 0.0f};
        llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
        // Apply grammar constraints to the single token
        llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
        // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
        bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
        // If the token is not valid according to the grammar, perform resampling
        if (!is_valid) {
            LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
            // Restore logits from the copy
            std::copy(original_logits.begin(), original_logits.end(), logits);
            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true);  // Pass true for is_resampling
        }
    }
    return id;
 }
 // Build the candidate token distribution for the logits at position `idx`
 // without sampling a token from it.
 //
 // Steps, in order: add params.logit_bias to the logits, apply guidance when
 // ctx_cfg is non-null, copy one entry per vocab id into ctx_sampling->cur,
 // apply the repetition/frequency/presence penalties over the most recent
 // penalty tokens, apply the grammar (if one is set), then call
 // llama_sample_softmax.
 //
 // The returned array points into ctx_sampling->cur, so it is only usable
 // while that container is left unmodified.
 static llama_token_data_array llama_sample_probability_distribution_impl(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
                  const int idx) {
    const llama_sampling_params & params = ctx_sampling->params;
    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
    // a negative penalty_last_n falls back to params.n_prev
    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
    const float   penalty_repeat  = params.penalty_repeat;
    const float   penalty_freq    = params.penalty_freq;
    const float   penalty_present = params.penalty_present;
    const bool    penalize_nl     = params.penalize_nl;
    auto & prev = ctx_sampling->prev;
    auto & cur  = ctx_sampling->cur;
    // NOTE: the buffer returned by llama_get_logits_ith is modified in place below
    float * logits = llama_get_logits_ith(ctx_main, idx);
    // apply params.logit_bias map
    for (const auto & bias : params.logit_bias) {
        logits[bias.first] += bias.second;
    }
    if (ctx_cfg) {
        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
    }
    // rebuild the candidate list from the (biased / guided) logits
    cur.clear();
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }
    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
    // apply penalties
    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
    // "> 0" rather than "!= 0": a negative count must never reach the pointer arithmetic below
    if (penalty_tokens_used_size > 0) {
        const llama_token nl_token = llama_token_nl(llama_get_model(ctx_main));
        // remember the newline logit so it can be restored after the penalties
        const float nl_logit = logits[nl_token];
        llama_sample_repetition_penalties(ctx_main, &cur_p,
                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
        if (!penalize_nl) {
            // undo whatever the penalties did to the newline token
            for (size_t i = 0; i < cur_p.size; i++) {
                if (cur_p.data[i].id == nl_token) {
                    cur_p.data[i].logit = nl_logit;
                    break;
                }
            }
        }
    }
    // apply grammar checks
    if (ctx_sampling->grammar != NULL) {
        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
    }
    llama_sample_softmax(ctx_main, &cur_p);
    return cur_p;
 }
 // Public sampling entry point: forwards to llama_sampling_sample_impl with the
 // resampling flag cleared and returns the token id it produces.
 llama_token llama_sampling_sample(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
                  const int idx) {
    // callers always start outside the resampling phase
    const bool is_resampling = false;
    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, is_resampling);
 }
 // Public wrapper around llama_sample_probability_distribution_impl: returns the
 // candidate array that function builds for position `idx`, unchanged.
 llama_token_data_array llama_sampling_probability_distribution(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
                  const int idx) {
    llama_token_data_array candidates =
        llama_sample_probability_distribution_impl(ctx_sampling, ctx_main, ctx_cfg, idx);
    return candidates;
 }
 void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
--- a/common/sampling.h
+++ b/common/sampling.h
@ -8,24 +8,46 @@
 #include <vector>
 #include <unordered_map>
 // sampler types
 // The underlying type is char and every enumerator is assigned a distinct
 // single-character value. The enumerator names mirror the tunables in
 // llama_sampling_params (top_k, top_p, min_p, tfs_z, typical_p, temp), and
 // llama_sampling_params::samplers_sequence holds a list of these values.
 enum class llama_sampler_type : char {
    TOP_K       = 'k',
    TOP_P       = 'p',
    MIN_P       = 'm',
    TFS_Z       = 'f',
    TYPICAL_P   = 'y',
    TEMPERATURE = 't'
 };
 // sampling parameters
 typedef struct llama_sampling_params {
-    int32_t n_prev            = 64;    // number of previous tokens to remember
+    int32_t     n_prev                = 64;       // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t     n_probs               = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t top_k             = 40;    // <= 0 to use vocab size
+    int32_t     min_keep              = 0;        // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    float   top_p             = 0.95f; // 1.0 = disabled
+    int32_t     top_k                 = 40;       // <= 0 to use vocab size
-    float   min_p             = 0.05f; // 0.0 = disabled
+    float       top_p                 = 0.95f;    // 1.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
+    float       min_p                 = 0.05f;    // 0.0 = disabled
-    float   typical_p         = 1.00f; // 1.0 = disabled
+    float       tfs_z                 = 1.00f;    // 1.0 = disabled
-    float   temp              = 0.80f; // 1.0 = disabled
+    float       typical_p             = 1.00f;    // 1.0 = disabled
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float   penalty_repeat    = 1.10f; // 1.0 = disabled
+    float       dynatemp_range        = 0.00f;    // 0.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
+    float       dynatemp_exponent     = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
-    float   penalty_present   = 0.00f; // 0.0 = disabled
+    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float       penalty_repeat        = 1.10f;    // 1.0 = disabled
-    float   mirostat_tau      = 5.00f; // target entropy
+    float       penalty_freq          = 0.00f;    // 0.0 = disabled
-    float   mirostat_eta      = 0.10f; // learning rate
+    float       penalty_present       = 0.00f;    // 0.0 = disabled
-    bool    penalize_nl       = true;  // consider newlines as a repeatable token
+    int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float       mirostat_tau          = 5.00f;    // target entropy
    float       mirostat_eta          = 0.10f;    // learning rate
    bool        penalize_nl           = true;     // consider newlines as a repeatable token
    std::vector<llama_sampler_type> samplers_sequence = {
        llama_sampler_type::TOP_K,
        llama_sampler_type::TFS_Z,
        llama_sampler_type::TYPICAL_P,
        llama_sampler_type::TOP_P,
        llama_sampler_type::MIN_P,
        llama_sampler_type::TEMPERATURE
    };
    std::string grammar;  // optional BNF-like grammar to constrain sampling
@ -35,6 +57,9 @@ typedef struct llama_sampling_params {
    float       cfg_scale     = 1.f; // how strong is guidance
    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
    std::vector<llama_token> penalty_prompt_tokens;
    bool                     use_penalty_prompt_tokens = false;
 } llama_sampling_params;
 // general sampler context
@ -80,6 +105,9 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
 // Print sampling parameters into a string
 std::string llama_sampling_print(const llama_sampling_params & params);
 // Print sampling order into a string
 std::string llama_sampling_order_print(const llama_sampling_params & params);
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
@ -103,6 +131,13 @@ llama_token llama_sampling_sample(
        struct llama_context * ctx_cfg,
        int idx = 0);
 // returns the full candidate token distribution for the logits at position idx
 // (after logit bias, guidance, penalties and grammar), normalized with softmax
 llama_token_data_array llama_sampling_probability_distribution(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
        struct llama_context * ctx_cfg,
        int idx = 0);
 void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
--- a/common/train.cpp
+++ b/common/train.cpp
@ -31,7 +31,7 @@ struct train_state  * init_train_state() {
    state->opt = new struct ggml_opt_context;
    state->opt->ctx = NULL;
-    state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
+    state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
    state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
    state->opt->loss_after = 0.0f;
@ -71,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)
 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
    float scale = 1.0f; // xavier
-    switch (tensor->n_dims) {
+    switch (ggml_n_dims(tensor)) {
        case 1:
            scale /= sqrtf((float) tensor->ne[0]);
            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
@ -119,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct
 }
 struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
-    switch (tensor->n_dims) {
+    switch (ggml_n_dims(tensor)) {
        case 1:
            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
@ -183,25 +183,27 @@ float fclamp(const float v, const float min, const float max) {
 }
 // Asserts (GGML_ASSERT) that `tensor` has extents exactly (ne0, 1, 1, 1).
 // The four ne[] comparisons fully determine the shape, so no separate
 // dimension-count check is made; in particular the n_dims struct field is
 // not read (the rest of this file queries ggml_n_dims() instead).
 void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == 1);
    GGML_ASSERT(tensor->ne[2] == 1);
    GGML_ASSERT(tensor->ne[3] == 1);
 }
 // Asserts (GGML_ASSERT) that `tensor` has extents exactly (ne0, ne1, 1, 1).
 // The four ne[] comparisons fully determine the shape, so no separate
 // dimension-count check is made; in particular the n_dims struct field is
 // not read (the rest of this file queries ggml_n_dims() instead).
 void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == 1);
    GGML_ASSERT(tensor->ne[3] == 1);
 }
 // Asserts (GGML_ASSERT) that `tensor` has extents exactly (ne0, ne1, ne2, 1).
 // The four ne[] comparisons fully determine the shape, so no separate
 // dimension-count check is made; in particular the n_dims struct field is
 // not read (the rest of this file queries ggml_n_dims() instead).
 void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
    GGML_ASSERT(tensor->ne[3] == 1);
 }
 void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
    GGML_ASSERT(tensor->n_dims == 4);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
@ -225,8 +227,8 @@ int64_t get_example_targets_batch(
    bool                   sample_random_offsets
 ) {
    GGML_ASSERT(samples_count > 0);
-    GGML_ASSERT(tokens_input->n_dims  == 2);
+    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(target_probs->n_dims  == 3);
+    GGML_ASSERT(ggml_is_3d(target_probs));
    int64_t n_vocab  = target_probs->ne[0];
    int64_t n_tokens = tokens_input->ne[0];
    int64_t n_batch  = tokens_input->ne[1];
@ -554,7 +556,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
    std::string opt_type;
    GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE);
    if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) {
-        opt->params.type = GGML_OPT_ADAM;
+        opt->params.type = GGML_OPT_TYPE_ADAM;
        GGUF_GET_KEY(fctx, opt->adam.fx_best,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS);
        GGUF_GET_KEY(fctx, opt->adam.fx_prev,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS);
@ -566,7 +568,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
        copy_tensor_by_name(opt->adam.v,  f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS);
        copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES);
    } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) {
-        opt->params.type = GGML_OPT_LBFGS;
+        opt->params.type = GGML_OPT_TYPE_LBFGS;
        GGUF_GET_KEY(fctx, opt->params.lbfgs.m,         gguf_get_val_u32, GGUF_TYPE_UINT32,  true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT);
        GGUF_GET_KEY(fctx, opt->lbfgs.fx_best,          gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS);
@ -601,7 +603,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
    gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized);
    switch (opt->params.type) {
-        case GGML_OPT_ADAM:
+        case GGML_OPT_TYPE_ADAM:
            {
                gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM);
                gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS,            opt->adam.fx_best);
@ -620,7 +622,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
                    gguf_add_tensor(fctx, opt->adam.pf);
                }
            } break;
-        case GGML_OPT_LBFGS:
+        case GGML_OPT_TYPE_LBFGS:
            {
                gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS);
                gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m);
@ -1105,7 +1107,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
    fprintf(stderr, "  --sample-start STR         Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str());
    fprintf(stderr, "  --include-sample-start     Include the sample start in the samples. (default off)\n");
    fprintf(stderr, "  --escape                   process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stderr, "  --overlapping-samples      Samples my overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
+    fprintf(stderr, "  --overlapping-samples      Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
    fprintf(stderr, "  --fill-with-next-samples   Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n");
    fprintf(stderr, "  --separate-with-eos        When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : "");
    fprintf(stderr, "  --separate-with-bos        When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : "");
@ -1361,12 +1363,12 @@ bool consume_common_train_arg(
                *invalid_param = true;
                return true;
            }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+            if (llama_supports_gpu_offload()) {
-            params->n_gpu_layers = std::stoi(argv[i]);
+                params->n_gpu_layers = std::stoi(argv[i]);
-#else
+            } else {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
+            }
    } else if (arg == "-h" || arg == "--help") {
        params->print_usage = true;
        return true;
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@ -2,6 +2,7 @@
 from __future__ import annotations
 import argparse
 import os
 import struct
 import sys
 from enum import IntEnum
@ -9,7 +10,6 @@ from pathlib import Path
 import numpy as np
 import os
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@ -371,15 +371,11 @@ def handle_metadata(cfg, hp):
        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
    else:
        raise ValueError('Unable to load metadata')
-    vocab = convert.load_vocab(
+    vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
-        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
+    vocab_factory = convert.VocabFactory(vocab_path)
-        cfg.vocabtype)
+    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
    # FIXME: Respect cfg.vocab_dir?
    svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
                               load_merges = cfg.vocabtype == 'bpe',
                               n_vocab = vocab.vocab_size)
    convert.check_vocab_size(params, vocab)
-    return (params, vocab, svocab)
+    return params, vocab, special_vocab
 def handle_args():
@ -402,8 +398,8 @@ def handle_args():
                        help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
    parser.add_argument("--vocab-dir", type=Path,
                        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
-    parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
+    parser.add_argument("--vocabtype", default="spm,hfft",
-                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
+                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
    return parser.parse_args()
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@ -3,51 +3,21 @@ from __future__ import annotations
 import json
 import os
 import re
 import struct
 import sys
 from pathlib import Path
 from typing import Any, BinaryIO, Sequence
 import numpy as np
 import torch
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
 # Maps the HuggingFace per-layer module path (the part between "layers.<N>."
 # and ".lora_<A|B>.weight") to the ggml tensor base name.
 HF_SUBLAYER_TO_GGML = {
    "self_attn.q_proj": "attn_q",
    "self_attn.k_proj": "attn_k",
    "self_attn.v_proj": "attn_v",
    "self_attn.o_proj": "attn_output",
    "mlp.gate_proj": "ffn_gate",
    "mlp.down_proj": "ffn_down",
    "mlp.up_proj": "ffn_up",
    "input_layernorm": "attn_norm",
    "post_attention_layernorm": "ffn_norm",
 }
 def translate_tensor_name(t: str) -> str:
    """Translate a PEFT LoRA tensor name into its ggml LoRA tensor name.

    ``t`` must look like ``...layers.<N>.<sub-layer>.lora_<A|B>.weight`` where
    ``<sub-layer>`` is a key of ``HF_SUBLAYER_TO_GGML``. The sub-layer may be a
    single component (``input_layernorm``) or two dotted components
    (``self_attn.q_proj``).

    Returns ``blk.<N>.<ggml name>.weight.lora<A|B>``.

    On an unrecognized tensor name or sub-layer, prints an error and exits the
    process with status 1.
    """
    # The second component of the sub-layer is optional so that the
    # single-component table entries (the layernorm modules) can match too.
    match = re.match(r".*layers\.(\d+)\.(\w+(?:\.\w+)?)\.lora_(A|B)\.weight", t)
    if match is None:
        print(f"Error: unrecognized tensor {t}")
        sys.exit(1)
    nn, sub_layer, lora_type = match.groups()
    sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
    if sub_layer_renamed is None:
        print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
        sys.exit(1)
    return f"blk.{nn}.{sub_layer_renamed}.weight.lora{lora_type}"
 def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
@ -61,9 +31,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(struct.pack("i", int(params["lora_alpha"])))
-def write_tensor_header(
+def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
 ) -> None:
    sname = name.encode("utf-8")
    fout.write(
        struct.pack(
@ -78,60 +46,103 @@ def write_tensor_header(
    fout.seek((fout.tell() + 31) & -32)
-if len(sys.argv) != 2:
+if __name__ == '__main__':
-    print(f"Usage: python {sys.argv[0]} <path>")
+    if len(sys.argv) < 2:
-    print(
+        print(f"Usage: python {sys.argv[0]} <path> [arch]")
-        "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
+        print(
-    )
+            "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
-    sys.exit(1)
+        )
        print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
        sys.exit(1)
-input_json = os.path.join(sys.argv[1], "adapter_config.json")
+    input_json = os.path.join(sys.argv[1], "adapter_config.json")
-input_model = os.path.join(sys.argv[1], "adapter_model.bin")
+    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
-output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
+    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
-model = torch.load(input_model, map_location="cpu")
+    if os.path.exists(input_model):
        model = torch.load(input_model, map_location="cpu")
    else:
        input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
        # lazy import load_file only if lora is in safetensors format.
        from safetensors.torch import load_file
        model = load_file(input_model, device="cpu")
-with open(input_json, "r") as f:
+    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
    params = json.load(f)
-if params["peft_type"] != "LORA":
+    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
-    print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
+        print(f"Error: unsupported architecture {arch_name}")
-    sys.exit(1)
+        sys.exit(1)
-if params["fan_in_fan_out"] is True:
+    arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
-    print("Error: param fan_in_fan_out is not supported")
+    name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
    sys.exit(1)
-if params["bias"] is not None and params["bias"] != "none":
+    with open(input_json, "r") as f:
-    print("Error: param bias is not supported")
+        params = json.load(f)
    sys.exit(1)
-# TODO: these seem to be layers that have been trained but without lora.
+    if params["peft_type"] != "LORA":
-# doesn't seem widely used but eventually should be supported
+        print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
-if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
+        sys.exit(1)
    print("Error: param modules_to_save is not supported")
    sys.exit(1)
-with open(output_path, "wb") as fout:
+    if params["fan_in_fan_out"] is True:
-    fout.truncate()
+        print("Error: param fan_in_fan_out is not supported")
        sys.exit(1)
-    write_file_header(fout, params)
+    if params["bias"] is not None and params["bias"] != "none":
-    for k, v in model.items():
+        print("Error: param bias is not supported")
-        if k.endswith(".default.weight"):
+        sys.exit(1)
-            k = k.replace(".default.weight", ".weight")
+
-        if k in ["llama_proj.weight", "llama_proj.bias"]:
+    # TODO: these seem to be layers that have been trained but without lora.
-            continue
+    # doesn't seem widely used but eventually should be supported
-        if k.endswith("lora_A.weight"):
+    if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
-            if v.dtype != torch.float16 and v.dtype != torch.float32:
+        print("Error: param modules_to_save is not supported")
        sys.exit(1)
    with open(output_path, "wb") as fout:
        fout.truncate()
        write_file_header(fout, params)
        for k, v in model.items():
            orig_k = k
            if k.endswith(".default.weight"):
                k = k.replace(".default.weight", ".weight")
            if k in ["llama_proj.weight", "llama_proj.bias"]:
                continue
            if k.endswith("lora_A.weight"):
                if v.dtype != torch.float16 and v.dtype != torch.float32:
                    v = v.float()
                v = v.T
            else:
                v = v.float()
            v = v.T
        else:
            v = v.float()
-        t = v.detach().numpy()
+            t = v.detach().numpy()
        tname = translate_tensor_name(k)
        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
        write_tensor_header(fout, tname, t.shape, t.dtype)
        t.tofile(fout)
-print(f"Converted {input_json} and {input_model} to {output_path}")
+            prefix = "base_model.model."
            if k.startswith(prefix):
                k = k[len(prefix) :]
            lora_suffixes = (".lora_A.weight", ".lora_B.weight")
            if k.endswith(lora_suffixes):
                suffix = k[-len(lora_suffixes[0]):]
                k = k[: -len(lora_suffixes[0])]
            else:
                print(f"Error: unrecognized tensor name {orig_k}")
                sys.exit(1)
            tname = name_map.get_name(k)
            if tname is None:
                print(f"Error: could not map tensor name {orig_k}")
                print(" Note: the arch parameter must be specified if the model is not llama")
                sys.exit(1)
            if suffix == ".lora_A.weight":
                tname += ".weight.loraA"
            elif suffix == ".lora_B.weight":
                tname += ".weight.loraB"
            else:
                assert False
            print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
            write_tensor_header(fout, tname, t.shape, t.dtype)
            t.tofile(fout)
    print(f"Converted {input_json} and {input_model} to {output_path}")
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@ -1,10 +1,13 @@
-import torch
+#!/usr/bin/env python3
 import os
 from pprint import pprint
 import sys
 import argparse
 import os
 import sys
 from pathlib import Path
 from pprint import pprint
 import torch
 from sentencepiece import SentencePieceProcessor
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@ -68,7 +71,7 @@ def main():
    persimmon_model = torch.load(args.ckpt_path)
    hparams = persimmon_model['args']
    pprint(hparams)
-    tensors = {}
+    tensors: dict[str, torch.Tensor] = {}
    _flatten_dict(persimmon_model['model'], tensors, None)
    arch = gguf.MODEL_ARCH.PERSIMMON
@ -85,7 +88,8 @@ def main():
    gguf_writer.add_embedding_length(hidden_size)
    gguf_writer.add_block_count(block_count)
    gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
-    gguf_writer.add_rope_dimension_count(hidden_size // head_count)
+    # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
    gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
    gguf_writer.add_head_count(head_count)
    gguf_writer.add_head_count_kv(head_count_kv)
    gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
--- a/convert.py
+++ b/convert.py
@ -10,6 +10,7 @@ import itertools
 import json
 import math
 import mmap
 import os
 import pickle
 import re
 import signal
@ -26,7 +27,6 @@ from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
 import numpy as np
 from sentencepiece import SentencePieceProcessor
 import os
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@ -42,6 +42,7 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
 ARCH = gguf.MODEL_ARCH.LLAMA
 DEFAULT_CONCURRENCY = 8
 #
 # data types
 #
@ -62,10 +63,10 @@ class UnquantizedDataType(DataType):
    pass
-DT_F16  = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
+DT_F16  = UnquantizedDataType('F16',  dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
-DT_F32  = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
+DT_F32  = UnquantizedDataType('F32',  dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
-DT_I32  = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
+DT_I32  = UnquantizedDataType('I32',  dtype = np.dtype(np.int16),   valid_conversions = [])
-DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
+DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16),  valid_conversions = ['F32', 'F16', 'Q8_0'])
@dataclass(frozen=True)
@ -151,14 +152,16 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
@dataclass
 class Params:
-    n_vocab:    int
+    n_vocab:        int
-    n_embd:     int
+    n_embd:         int
-    n_layer:    int
+    n_layer:        int
-    n_ctx:      int
+    n_ctx:          int
-    n_ff:       int
+    n_ff:           int
-    n_head:     int
+    n_head:         int
-    n_head_kv:  int
+    n_head_kv:      int
-    f_norm_eps: float
+    n_experts:      int | None = None
    n_experts_used: int | None = None
    f_norm_eps:     float | None = None
    rope_scaling_type: gguf.RopeScalingType | None = None
    f_rope_freq_base: float | None = None
@ -233,6 +236,13 @@ class Params:
            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
        n_experts      = None
        n_experts_used = None
        if "num_local_experts" in config:
            n_experts = config["num_local_experts"]
            n_experts_used = config["num_experts_per_tok"]
        return Params(
            n_vocab           = config["vocab_size"],
            n_embd            = config["hidden_size"],
@ -241,6 +251,8 @@ class Params:
            n_ff              = config["intermediate_size"],
            n_head            = (n_head := config["num_attention_heads"]),
            n_head_kv         = config.get("num_key_value_heads", n_head),
            n_experts         = n_experts,
            n_experts_used    = n_experts_used,
            f_norm_eps        = config["rms_norm_eps"],
            f_rope_freq_base  = config.get("rope_theta"),
            rope_scaling_type = rope_scaling_type,
@ -255,8 +267,15 @@ class Params:
    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
        config = json.load(open(config_path))
        n_experts      = None
        n_experts_used = None
        f_rope_freq_base = None
        # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if config.get("rope_theta") == 1000000:
+        if config.get("moe"):
            # Mixtral
            n_ctx = 32768
        elif config.get("rope_theta") == 1000000:
            # CodeLlama
            n_ctx = 16384
        elif config["norm_eps"] == 1e-05:
@ -266,16 +285,27 @@ class Params:
            # LLaMA v1
            n_ctx = 2048
        if "layers.0.feed_forward.w1.weight" in model:
            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
        if config.get("moe"):
            n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
            n_experts      = config["moe"]["num_experts"]
            n_experts_used = config["moe"]["num_experts_per_tok"]
            f_rope_freq_base = 1e6
        return Params(
-            n_vocab          = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
+            n_vocab          = model["tok_embeddings.weight"].shape[0],
            n_embd           = config["dim"],
            n_layer          = config["n_layers"],
            n_ctx            = n_ctx,
-            n_ff             = model["layers.0.feed_forward.w1.weight"].shape[0],
+            n_ff             = n_ff,
            n_head           = (n_head := config["n_heads"]),
            n_head_kv        = config.get("n_kv_heads", n_head),
            n_experts        = n_experts,
            n_experts_used   = n_experts_used,
            f_norm_eps       = config["norm_eps"],
-            f_rope_freq_base = config.get("rope_theta"),
+            f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
        )
    @staticmethod
@ -304,6 +334,10 @@ class Params:
 class BpeVocab:
    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
        if isinstance(self.bpe_tokenizer.get('model'), dict):
            self.vocab = self.bpe_tokenizer["model"]["vocab"]
        else:
            self.vocab = self.bpe_tokenizer
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
@ -321,7 +355,7 @@ class BpeVocab:
                    # Added tokens here can be duplicates of the main vocabulary.
                    if item['content'] not in self.bpe_tokenizer)
-        vocab_size: int = len(self.bpe_tokenizer)
+        vocab_size: int = len(self.vocab)
        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids      = sorted(added_tokens.values())
        if expected_ids != actual_ids:
@ -329,6 +363,7 @@ class BpeVocab:
            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_dict    = added_tokens
        self.added_tokens_list    = [text for (text, idx) in items]
        self.vocab_size_base: int = vocab_size
        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
@ -336,10 +371,9 @@ class BpeVocab:
        self.fname_added_tokens   = fname_added_tokens
    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        tokenizer = self.bpe_tokenizer
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
-        for i, _ in enumerate(tokenizer):
+        for i, _ in enumerate(self.vocab):
            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@ -374,6 +408,7 @@ class SentencePieceVocab:
            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
        # Token pieces that were added to the base vocabulary.
        self.added_tokens_dict = added_tokens
        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
        self.vocab_size_base    = vocab_size
        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
@ -417,7 +452,109 @@ class SentencePieceVocab:
        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
+class HfVocab:
    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
        """Load a Hugging Face tokenizer from local files and index its tokens.

        Args:
            fname_tokenizer: Location handed to ``AutoTokenizer.from_pretrained``
                (also used as its ``cache_dir``); only local files are consulted.
            fname_added_tokens: Stored on the instance unchanged; it is not read here.

        Raises:
            ImportError: If the ``transformers`` package cannot be imported.
        """
        # Imported lazily so the rest of the script works without `transformers`.
        try:
            from transformers import AutoTokenizer
        except ImportError as e:
            raise ImportError(
                "To use HfVocab, please install the `transformers` package. "
                "You can install it with `pip install transformers`."
            ) from e
        print("fname_tokenizer:", fname_tokenizer)
        # Allow the tokenizer to default to slow or fast versions.
        # Explicitly set tokenizer to use local paths.
        self.tokenizer = AutoTokenizer.from_pretrained(
            fname_tokenizer,
            cache_dir=fname_tokenizer,
            local_files_only=True,
        )
        # Containers describing the tokens added on top of the base vocabulary.
        self.added_tokens_list: list[str] = []
        self.added_tokens_dict: dict[str, int] = dict()
        self.added_tokens_ids: set[int] = set()
        # Walk the added tokens in ascending id order so the list keeps that order.
        for tok, tokidx in sorted(
            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
        ):
            # Only consider added tokens that are not in the base vocabulary
            if tokidx >= self.tokenizer.vocab_size:
                self.added_tokens_list.append(tok)
                self.added_tokens_dict[tok] = tokidx
                self.added_tokens_ids.add(tokidx)
        # Map each special token's text to its id. The full vocabulary is fetched
        # once up front instead of once per special token.
        full_vocab = self.tokenizer.get_vocab()
        self.specials = {
            tok: full_vocab[tok]
            for tok in self.tokenizer.all_special_tokens
        }
        self.special_ids = set(self.tokenizer.all_special_ids)
        # Base size comes from the tokenizer; the total also counts the added tokens kept above.
        self.vocab_size_base = self.tokenizer.vocab_size
        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer    = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens
    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        """Yield ``(text, score, type)`` for each base-vocabulary token id, in ascending id order.

        Ids recorded in ``self.added_tokens_ids`` are left out; ``added_tokens``
        reports those separately.
        """
        # Invert the tokenizer's token -> id mapping so pieces can be looked up by id.
        piece_by_id = {}
        for piece, piece_id in self.tokenizer.get_vocab().items():
            piece_by_id[piece_id] = piece
        for tid in range(self.vocab_size_base):
            if tid not in self.added_tokens_ids:
                # Token text is emitted as UTF-8 bytes.
                raw = piece_by_id[tid].encode("utf-8")
                score = self.get_token_score(tid)
                # The special ids collected at construction time are reused for classification.
                kind = self.get_token_type(tid, raw, self.special_ids)
                yield raw, score, kind
    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
        """Classify a token as BYTE, CONTROL or NORMAL.

        Args:
            token_id: Id checked for membership in ``special_ids``.
            token_text: Token bytes, matched against the ``<0xNN>`` spelling.
            special_ids: Ids that count as special tokens.
        """
        # Text spelled exactly "<0xNN>" (two hex digits) takes precedence over everything else.
        looks_like_byte = re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text) is not None
        if looks_like_byte:
            return gguf.TokenType.BYTE
        # Otherwise the id alone decides between CONTROL and NORMAL.
        if token_id in special_ids:
            return gguf.TokenType.CONTROL
        return gguf.TokenType.NORMAL
    def get_token_score(self, token_id: int) -> float:
        """Return the score reported for ``token_id``.

        No per-token scoring is implemented yet: ``token_id`` is ignored and every
        token receives the same placeholder value.
        """
        placeholder_score = -1000.0  # Default score
        return placeholder_score
    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        """Yield ``(text, score, type)`` for every entry of ``self.added_tokens_list``, in list order.

        Added tokens that are also special tokens are typed and scored through
        ``get_token_type`` / ``get_token_score``; all others are reported as
        USER_DEFINED with a score of -1000.0.
        """
        for piece in self.added_tokens_list:
            if piece not in self.specials:
                kind = gguf.TokenType.USER_DEFINED
                weight = -1000.0
            else:
                special_id = self.specials[piece]
                # Empty text is passed here, so only the id decides the type.
                kind = self.get_token_type(special_id, b'', self.special_ids)
                weight = self.get_token_score(special_id)
            yield piece.encode("utf-8"), weight, kind
    def has_newline_token(self):
        """Return True when the tokenizer vocab contains ``"<0x0A>"`` or a literal ``"\\n"`` entry."""
        known_pieces = self.tokenizer.vocab
        return any(candidate in known_pieces for candidate in ("<0x0A>", "\n"))
    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        """Yield everything from ``hf_tokens()`` first, then everything from ``added_tokens()``."""
        for produce in (self.hf_tokens, self.added_tokens):
            yield from produce()
    def __repr__(self) -> str:
        return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
 #
 # data loading
@ -585,14 +722,14 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
    if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
        # Transformers models put different tensors in different files, but
-        # don't split indivdual tensors between files.
+        # don't split individual tensors between files.
        model: LazyModel = {}
        for mp in models_plus:
            model.update(mp.model)
    else:
        model = merge_sharded([mp.model for mp in models_plus])
-    return ModelPlus(model, paths, format, vocab)
+    return ModelPlus(model, paths, format, vocab)  # pytype: disable=wrong-arg-types
 def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@ -678,7 +815,7 @@ class LazyUnpickler(pickle.Unpickler):
        return func(*args)
    CLASSES: dict[tuple[str, str], Any] = {
-        # getattr used here as a workaround for mypy not being smart enough to detrmine
+        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
@ -773,7 +910,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
        executor_class = ProcessPoolExecutor
    else:
        executor_class = ThreadPoolExecutor
-    with executor_class(max_workers = max_workers) as executor:
+    with executor_class(max_workers=max_workers) as executor:
        futures: list[concurrent.futures.Future[Out]] = []
        done = False
        for _ in range(concurrency):
@ -794,21 +931,36 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
            yield result
-def check_vocab_size(params: Params, vocab: Vocab) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
-    if params.n_vocab != vocab.vocab_size:
+    # Handle special case where the model's vocab size is not set
-        assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
+    if params.n_vocab == -1:
-        if params.n_vocab == vocab.vocab_size_base:
+        raise ValueError(
-            print("Ignoring added_tokens.json since model matches vocab size without it.")
+            f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
-            vocab.added_tokens_list = []
+        )
-            vocab.vocab_size = vocab.vocab_size_base
+
-            return
+    # Check for a vocab size mismatch
-        msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
+    if params.n_vocab == vocab.vocab_size:
-        if vocab.fname_added_tokens is not None:
+        print("Ignoring added_tokens.json since model matches vocab size without it.")
-            msg += f" combined with {vocab.fname_added_tokens}"
+        return
-        msg += f" has {vocab.vocab_size})."
+
-        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
+    if pad_vocab and params.n_vocab > vocab.vocab_size:
-            msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        pad_count = params.n_vocab - vocab.vocab_size
-        raise Exception(msg)
+        print(
            f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
        )
        for i in range(1, pad_count + 1):
            vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
            vocab.added_tokens_list.append(f"<dummy{i:05}>")
        vocab.vocab_size = params.n_vocab
        return
    msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer} has {vocab.vocab_size})."
    if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
        msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
    if vocab.vocab_size < params.n_vocab:
        msg += " Add the --pad-vocab option and try again."
    raise Exception(msg)
 class OutputFile:
@ -832,7 +984,17 @@ class OutputFile:
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
        self.gguf.add_head_count          (params.n_head)
        self.gguf.add_head_count_kv       (params.n_head_kv)
-        self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)
+
        if params.n_experts:
            self.gguf.add_expert_count(params.n_experts)
        if params.n_experts_used:
            self.gguf.add_expert_used_count(params.n_experts_used)
        if params.f_norm_eps:
            self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
        else:
            raise ValueError('f_norm_eps is None')
        if params.f_rope_freq_base is not None:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)
@ -851,22 +1013,46 @@ class OutputFile:
        if params.ftype is not None:
            self.gguf.add_file_type(params.ftype)
-    def add_meta_vocab(self, vocab: Vocab) -> None:
+    def handle_tokenizer_model(self, vocab: Vocab) -> str:
        # Map the vocab types to the supported tokenizer models
        tokenizer_model = {
            SentencePieceVocab: "llama",
            HfVocab: "llama",
            BpeVocab: "gpt2",
        }.get(type(vocab))
        # Block if vocab type is not predefined
        if tokenizer_model is None:
            raise ValueError("Unknown vocab type: Not supported")
        return tokenizer_model
    def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
        tokens = []
        scores = []
        toktypes = []
        # NOTE: `all_tokens` returns the base vocabulary and added tokens
        for text, score, toktype in vocab.all_tokens():
            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)
-        if isinstance(vocab, SentencePieceVocab):
+        assert len(tokens) == vocab.vocab_size
-            self.gguf.add_tokenizer_model("llama")
+
-        elif isinstance(vocab, BpeVocab):
+        return tokens, scores, toktypes
-            self.gguf.add_tokenizer_model("gpt2")
+
-        else:
+    def add_meta_vocab(self, vocab: Vocab) -> None:
-            raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
+        # Handle the tokenizer model
        tokenizer_model = self.handle_tokenizer_model(vocab)
        # Ensure that tokenizer_model is added to the GGUF model
        self.gguf.add_tokenizer_model(tokenizer_model)
        # Extract model vocabulary for model conversion
        tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
        # Add extracted token information for model conversion
        self.gguf.add_token_list(tokens)
        self.gguf.add_token_scores(scores)
        self.gguf.add_token_types(toktypes)
@ -879,7 +1065,7 @@ class OutputFile:
        raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
        data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
        data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
-        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype)
+        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
    def write_meta(self) -> None:
        self.gguf.write_header_to_file()
@ -892,8 +1078,11 @@ class OutputFile:
        self.gguf.close()
    @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+    def write_vocab_only(
-        check_vocab_size(params, vocab)
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
        of = OutputFile(fname_out, endianess=endianess)
@ -920,8 +1109,12 @@ class OutputFile:
        return dt.quantize(arr)
    @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+    def write_all(
-        check_vocab_size(params, vocab)
+        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
        of = OutputFile(fname_out, endianess=endianess)
@ -940,7 +1133,10 @@ class OutputFile:
        # tensor data
        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
        if ftype == GGMLFileType.MostlyQ8_0:
-            ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, use_processpool_executor = True)
+            ndarrays = bounded_parallel_map(
                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
                use_processpool_executor=True,
            )
        else:
            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
@ -949,14 +1145,16 @@ class OutputFile:
            elapsed = time.time() - start
            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
            padi = len(str(len(model)))
-            print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
+            print(
                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
            )
            of.gguf.write_tensor_data(ndarray)
        of.close()
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
-    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type
+    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
        return GGMLFileType.AllF32
@ -975,7 +1173,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
            for (name, tensor) in model.items()}
-def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
+def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
    tmap = gguf.TensorNameMap(ARCH, params.n_layer)
    should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
@ -1001,7 +1199,11 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
    for name, lazy_tensor in model.items():
        tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
        if name_new is None:
-            raise Exception(f"Unexpected tensor name: {name}")
+            if skip_unknown:
                print(f"Unexpected tensor name: {name} - skipping")
                continue
            else:
                raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
        if tensor_type in should_skip:
            print(f"skipping tensor {name_new}")
@ -1079,35 +1281,71 @@ def load_some_model(path: Path) -> ModelPlus:
    return model_plus
-def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
+class VocabFactory:
-    # Be extra-friendly and accept either a file or a directory.  Also, if it's
+    _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
-    # a directory, it might be the model directory, and tokenizer.model might
+
-    # be in the parent of that.
+    def __init__(self, path: Path):
-    if path.is_dir():
+        self.path = path
-        vocab_file = "tokenizer.model"
+        self.file_paths = self._detect_files()
-        if vocabtype == 'bpe':
+        print(f"Found vocab files: {self.file_paths}")
-            vocab_file = "vocab.json"
+
-        path2 = path / vocab_file
+    def _detect_files(self) -> dict[str, Path | None]:
-        # Use `.parent` instead of /.. to handle the symlink case better.
+        def locate(file: str) -> Path | None:
-        path3 = path.parent / vocab_file
+            if (path := self.path / file).exists():
-        if path2.exists():
+                return path
-            path = path2
+            if (path := self.path.parent / file).exists():
-        elif path3.exists():
+                return path
-            path = path3
+            return None
        return {vt: locate(f) for vt, f in self._FILES.items()}
    def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
        for vtype in vocab_types:
            try:
                path = self.file_paths[vtype]
            except KeyError:
                raise ValueError(f"Unsupported vocabulary type {vtype}") from None
            if path is not None:
                return vtype, path
        raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
        """Build the ``gguf.SpecialVocab`` that accompanies ``vocab``.

        Args:
            vocab: Loaded vocabulary; its ``vocab_size`` is forwarded when present.
            vocabtype: Vocab type name; merges are requested only for ``"bpe"``.
            model_parent_path: Passed to ``gguf.SpecialVocab`` as its first argument.
        """
        # Forward None when the vocab object exposes no size attribute.
        vocab_size = getattr(vocab, "vocab_size", None)
        wants_merges = (vocabtype == "bpe")
        return gguf.SpecialVocab(
            model_parent_path,
            load_merges=wants_merges,
            special_token_types=None,  # Predetermined or passed as a parameter
            n_vocab=vocab_size,
        )
    def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
        vocab_type, path = self._select_file(vocab_types)
        print(f"Loading vocab file {path!r}, type {vocab_type!r}")
        added_tokens_path = path.parent / "added_tokens.json"
        vocab: Vocab
        if vocab_type == "bpe":
            vocab = BpeVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
            )
        elif vocab_type == "spm":
            vocab = SentencePieceVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
            )
        elif vocab_type == "hfft":
            vocab = HfVocab(
                path.parent, added_tokens_path if added_tokens_path.exists() else None
            )
        else:
-            raise FileNotFoundError(
+            raise ValueError(vocab_type)
-                f"Could not find {vocab_file} in {path} or its parent; "
+        # FIXME: Respect --vocab-dir?
-                "if it's in another directory, pass the directory as --vocab-dir")
+        special_vocab = self._create_special_vocab(
-
+            vocab,
-    print(f"Loading vocab file '{path}', type '{vocabtype}'")
+            vocab_type,
-
+            model_parent_path,
-    added_tokens_path = path.parent / "added_tokens.json"
+        )
-    if vocabtype == "bpe":
+        return vocab, special_vocab
        return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
    elif vocabtype == "spm":
        return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
    else:
        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
 def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
@ -1138,20 +1376,23 @@ def main(args_in: list[str] | None = None) -> None:
    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
        # We currently only support Q8_0 output on little endian systems.
        output_choices.append("q8_0")
-    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
+    parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
-    parser.add_argument("--dump",        action="store_true",    help="don't convert, just show what's in the model")
+    parser.add_argument("--dump",         action="store_true",    help="don't convert, just show what's in the model")
-    parser.add_argument("--dump-single", action="store_true",    help="don't convert, just show what's in a single model file")
+    parser.add_argument("--dump-single",  action="store_true",    help="don't convert, just show what's in a single model file")
-    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
+    parser.add_argument("--vocab-only",   action="store_true",    help="extract only the vocab")
-    parser.add_argument("--outtype",     choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
+    parser.add_argument("--outtype",      choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
-    parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
+    parser.add_argument("--vocab-dir",    type=Path,              help="directory containing tokenizer.model, if separate from model file")
-    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
+    parser.add_argument("--vocab-type",                           help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
-    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin, *.safetensors)")
+    parser.add_argument("--outfile",      type=Path,              help="path to write to; default: based on input")
-    parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
+    parser.add_argument("model",          type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
-    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
+    parser.add_argument("--ctx",          type=int,               help="model training context (default: based on input)")
-    parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
+    parser.add_argument("--concurrency",  type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
-    parser.add_argument("--bigendian",   action="store_true",    help="model is executed on big endian machine")
+    parser.add_argument("--big-endian",   action="store_true",    help="model is executed on big endian machine")
    parser.add_argument("--pad-vocab",    action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")
    parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")
    args = parser.parse_args(args_in)
    if args.dump_single:
        model_plus = lazy_load_file(args.model)
        do_dump_model(model_plus)
@ -1166,7 +1407,7 @@ def main(args_in: list[str] | None = None) -> None:
        do_dump_model(model_plus)
        return
    endianess = gguf.GGUFEndian.LITTLE
-    if args.bigendian:
+    if args.big_endian:
        endianess = gguf.GGUFEndian.BIG
    params = Params.load(model_plus)
@ -1187,32 +1428,28 @@ def main(args_in: list[str] | None = None) -> None:
    print(f"params = {params}")
-    vocab: Vocab
+    model_parent_path = model_plus.paths[0].parent
    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
    vocab_factory = VocabFactory(vocab_path)
    vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
    if args.vocab_only:
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        # FIXME: Try to respect vocab_dir somehow?
        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
                                          load_merges = args.vocabtype == 'bpe',
                                          n_vocab = vocab.vocab_size)
        outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
                                    endianess=endianess, pad_vocab=args.pad_vocab)
        print(f"Wrote {outfile}")
        return
    if model_plus.vocab is not None and args.vocab_dir is None:
        vocab = model_plus.vocab
-    else:
+
-        vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
+    print(f"Vocab info: {vocab}")
-        vocab = load_vocab(vocab_dir, args.vocabtype)
+    print(f"Special vocab info: {special_vocab}")
    # FIXME: Try to respect vocab_dir somehow?
    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
                                      load_merges = args.vocabtype == 'bpe',
                                      n_vocab = vocab.vocab_size)
    model   = model_plus.model
-    model   = convert_model_names(model, params)
+    model   = convert_model_names(model, params, args.skip_unknown)
    ftype   = pick_output_type(model, args.outtype)
    model   = convert_to_output_type(model, ftype)
    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
@ -1220,7 +1457,8 @@ def main(args_in: list[str] | None = None) -> None:
    params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")
-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
    print(f"Wrote {outfile}")
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -23,6 +23,9 @@ else()
    add_subdirectory(infill)
    add_subdirectory(llama-bench)
    add_subdirectory(llava)
    if (LLAMA_SYCL)
        add_subdirectory(sycl)
    endif()
    add_subdirectory(main)
    add_subdirectory(tokenize)
    add_subdirectory(parallel)
@ -31,11 +34,13 @@ else()
    add_subdirectory(quantize-stats)
    add_subdirectory(save-load-state)
    add_subdirectory(simple)
    add_subdirectory(passkey)
    add_subdirectory(speculative)
    add_subdirectory(lookahead)
    add_subdirectory(lookup)
    add_subdirectory(gguf)
    add_subdirectory(train-text-from-scratch)
-    if (LLAMA_METAL)
+    add_subdirectory(imatrix)
        add_subdirectory(metal)
    endif()
    if (LLAMA_BUILD_SERVER)
        add_subdirectory(server)
    endif()
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@ -575,10 +575,7 @@ static struct ggml_tensor * forward(
            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_scaled =
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
@ -844,10 +841,7 @@ static struct ggml_tensor * forward_batch(
            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, n_batch]
-            struct ggml_tensor * KQ_scaled =
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
            assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);
            // KQ_masked = mask_past(KQ_scaled)
@ -1131,10 +1125,7 @@ static struct ggml_tensor * forward_lora(
            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_scaled =
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
                ggml_scale(ctx0,
                        KQ,
                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
@ -1258,9 +1249,9 @@ static struct ggml_tensor * forward_lora(
 }
 static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
-    assert(logits->n_dims == 2);
+    assert(ggml_is_matrix(logits));
-    assert(probs->n_dims == 2);
+    assert(ggml_is_matrix(probs));
-    assert(best_samples->n_dims == 1);
+    assert(ggml_is_vector(best_samples));
    assert(logits->ne[1] == best_samples->ne[0]);
    assert(logits->ne[0] == probs->ne[0]);
    assert(logits->ne[1] == probs->ne[1]);
@ -1292,9 +1283,9 @@ static void sample_softmax_batch(
    struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
    struct ggml_tensor * best_samples
 ) {
-    GGML_ASSERT(best_samples->n_dims == 2);
+    GGML_ASSERT(ggml_is_matrix(best_samples));
-    GGML_ASSERT(logits->n_dims == 3);
+    GGML_ASSERT(ggml_is_3d(logits));
-    GGML_ASSERT(probs->n_dims == 3);
+    GGML_ASSERT(ggml_is_3d(probs));
    int n_tokens = best_samples->ne[0];
    int n_batch  = best_samples->ne[1];
    int n_vocab  = logits->ne[0];
@ -1334,7 +1325,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }
 static void print_matrix(struct ggml_tensor * probs) {
-    assert(probs->n_dims == 2);
+    assert(ggml_is_matrix(probs));
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
@ -1386,8 +1377,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu
 static void get_example_targets_batch(
    struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
 ) {
-    GGML_ASSERT(tokens_input->n_dims == 2);
+    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(     targets->n_dims == 3);
+    GGML_ASSERT(ggml_is_3d(targets));
    int n_tokens = tokens_input->ne[0];
    int n_batch  = tokens_input->ne[1];
    GGML_ASSERT(n_tokens == targets->ne[1]);
@ -1542,27 +1533,28 @@ int main(int argc, char ** argv) {
        int n_past = 0;
-        ggml_cgraph gf = {};
+        struct ggml_cgraph * gf = NULL;
        gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
        get_example_targets_batch(ctx0, 64*ex+0,  tokens_input, targets);
-        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
+        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch);
        // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
        struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
-        ggml_build_forward_expand(&gf, e);
+        ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
        float error_before_opt = ggml_get_f32_1d(e, 0);
-        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
+        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS);
        opt_params_lbfgs.print_forward_graph = false;
        opt_params_lbfgs.print_backward_graph = false;
        opt_params_lbfgs.lbfgs.n_iter = 16;
        ggml_opt(ctx0, opt_params_lbfgs, e);
        //
-        ggml_build_forward_expand(&gf, e);
+        ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
        float error_after_opt = ggml_get_f32_1d(e, 0);
@ -1609,13 +1601,14 @@ int main(int argc, char ** argv) {
            };
            struct ggml_context * ctx0 = ggml_init(params);
-            ggml_cgraph gf = {};
+            struct ggml_cgraph * gf = NULL;
            gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
            int n_past = 0;
-            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
+            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
-            ggml_build_forward_expand(&gf, logits);
+            ggml_build_forward_expand(gf, logits);
-            ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+            ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
            struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
            struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
--- a/examples/base-translate.sh
+++ b/examples/base-translate.sh
@ -0,0 +1,61 @@
 #!/bin/bash
 #
 # Few-shot translation example.
 # Requires a base model (i.e. no fine-tuned or instruct models).
 #
 # Usage:
 #
 #   cd llama.cpp
 #   make -j
 #
 #   ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
 #
 # The script writes a few-shot English -> French prompt followed by <text>
 # into a temporary file in the current directory, then runs ./main on it.
 # The temporary file is removed when the script exits.
 #
 if [ $# -lt 2 ]; then
  echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
  exit 1
 fi
 # Everything after the second argument is forwarded to ./main.
 # It is joined into one string and deliberately word-split again on use,
 # so several flags passed inside a single quoted argument keep working.
 eargs=""
 if [ $# -gt 2 ]; then
  eargs="${@:3}"
 fi
 ftmp="__llama.cpp_example_tmp__.txt"
 # Single quotes defer expansion until the trap fires; the inner double
 # quotes keep the file name as a single word.
 trap 'rm -f "$ftmp"' EXIT
 echo "Translate from English to French:
 ===
 sea otter, peppermint, plush girafe:
 sea otter => loutre de mer
 peppermint => menthe poivrée
 plush girafe => girafe peluche
 ===
 violin
 violin => violon
 ===
 phone, computer, mouse, keyboard:
 phone => téléphone
 computer => ordinateur
 mouse => souris
 keyboard => clavier
 ===
 " > "$ftmp"
 echo "$2
 " >> "$ftmp"
 # Quoted so that a model path containing spaces or glob characters is
 # passed to ./main as exactly one argument.
 model="$1"
 # generate the most likely continuation until the string "===" is found
 ./main -m "$model" -f "$ftmp" -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@ -32,16 +32,15 @@ int main(int argc, char ** argv) {
    gpt_params params;
    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
-        printf("  example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+        printf("  example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
        return 1 ;
    }
    int n_kv_max     = 2048;
    int is_pp_shared = 0;
    int n_gpu_layers = 0;
    int mmq          = 0;
    std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
    std::vector<int> n_tg = { 128, 256, };
@ -65,30 +64,30 @@ int main(int argc, char ** argv) {
    }
    if (argc >= 6) {
-        mmq = std::atoi(argv[5]);
+        n_pp = parse_list(argv[5]);
    }
    if (argc >= 7) {
-        n_pp = parse_list(argv[6]);
+        n_tg = parse_list(argv[6]);
    }
    if (argc >= 8) {
-        n_tg = parse_list(argv[7]);
+        n_pl = parse_list(argv[7]);
    }
    if (argc >= 9) {
        n_pl = parse_list(argv[8]);
    }
    // init LLM
-    llama_backend_init(params.numa);
+    llama_backend_init();
    llama_numa_init(params.numa);
    // initialize the model
    llama_model_params model_params = llama_model_default_params();
    const std::vector<float> t_split(llama_max_devices(), 0.0f);
    model_params.n_gpu_layers = n_gpu_layers;
    model_params.tensor_split = t_split.data();
    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -102,11 +101,13 @@ int main(int argc, char ** argv) {
    ctx_params.seed      = 1234;
    ctx_params.n_ctx     = n_kv_max;
    ctx_params.n_batch   = 512;
    ctx_params.mul_mat_q = mmq;
    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    // ensure enough sequences are available
    ctx_params.n_parallel = *std::max_element(n_pl.begin(), n_pl.end());
    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
    if (ctx == NULL) {
@ -155,7 +156,7 @@ int main(int argc, char ** argv) {
    }
    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
    LOG_TEE("\n");
    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
@ -176,10 +177,10 @@ int main(int argc, char ** argv) {
                llama_batch_clear(batch);
-                const int n_tokens = is_pp_shared ? pp : pl*pp;
+                for (int i = 0; i < pp; ++i) {
-
+                    for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
-                for (int i = 0; i < n_tokens; ++i) {
+                        llama_batch_add(batch, 0, i, { j }, false);
-                    llama_batch_add(batch, 0, i, { 0 }, false);
+                    }
                }
                batch.logits[batch.n_tokens - 1] = true;
@ -194,7 +195,7 @@ int main(int argc, char ** argv) {
                if (is_pp_shared) {
                    for (int32_t i = 1; i < pl; ++i) {
-                        llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
+                        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
                    }
                }
--- a/examples/batched.swift/README.md
+++ b/examples/batched.swift/README.md
@ -1,4 +1,4 @@
 This is a Swift clone of `examples/batched`.
 $ `make`
-$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
+$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@ -17,7 +17,7 @@ let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(argu
 let n_len: Int = 32
 // init LLM
-llama_backend_init(false)
+llama_backend_init()
 defer {
    llama_backend_free()
 }
@ -153,7 +153,7 @@ while n_cur <= n_len {
        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
        // is it an end of stream? -> mark the stream as finished
-        if new_token_id == llama_token_eos(context) || n_cur == n_len {
+        if new_token_id == llama_token_eos(model) || n_cur == n_len {
            i_batch[i] = -1
            // print("")
            if n_parallel > 1 {
@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
 llama_print_timings(context)
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-    let n_tokens = text.count + (add_bos ? 1 : 0)
+    let utf8Count = text.utf8.count
    let n_tokens = utf8Count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
@ -230,18 +231,15 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
    var result = [CChar](repeating: 0, count: 8)
    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
    if nTokens < 0 {
-        if result.count >= -Int(nTokens) {
+        let actualTokensCount = -Int(nTokens)
-            result.removeLast(-Int(nTokens))
+        result = .init(repeating: 0, count: actualTokensCount)
        } else {
            result.removeAll()
        }
        let check = llama_token_to_piece(
            model,
            token,
            &result,
            Int32(result.count)
        )
-        assert(check == nTokens)
+        assert(check == actualTokensCount)
    } else {
        result.removeLast(result.count - Int(nTokens))
    }
@ -259,5 +257,4 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
        buffer = []
        return bufferString
    }
    return nil
 }
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -50,7 +50,8 @@ int main(int argc, char ** argv) {
    // init LLM
-    llama_backend_init(params.numa);
+    llama_backend_init();
    llama_numa_init(params.numa);
    // initialize the model
@ -69,6 +70,7 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(model, params.prompt, true);
    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
    // initialize the context
@ -78,6 +80,7 @@ int main(int argc, char ** argv) {
    ctx_params.seed  = 1234;
    ctx_params.n_ctx = n_kv_req;
    ctx_params.n_batch = std::max(n_len, n_parallel);
    ctx_params.n_parallel      = n_parallel;
    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@ -90,7 +93,7 @@ int main(int argc, char ** argv) {
    const int n_ctx    = llama_n_ctx(ctx);
-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
@ -130,7 +133,7 @@ int main(int argc, char ** argv) {
    // assign the system KV cache to all parallel sequences
    // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
    for (int32_t i = 1; i < n_parallel; ++i) {
-        llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens);
+        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
    }
    if (n_parallel > 1) {
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@ -119,7 +119,8 @@ int main(int argc, char ** argv)
    // Init LLM :
    //---------------------------------
-    llama_backend_init(params.numa);
+    llama_backend_init();
    llama_numa_init(params.numa);
    llama_model * model;
    llama_context * ctx;
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@ -129,13 +129,13 @@ int main(int argc, char ** argv)  {
    const ggml_type qtype = GGML_TYPE_Q4_1;
    size_t ctx_size = 0;
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
-    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
    ctx_size += 1024*1024*16;
    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
@ -194,7 +194,7 @@ int main(int argc, char ** argv)  {
    // Set up a the benchmark matrices
    // printf("Creating new tensor q11 & Running quantize\n");
    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
    // Set up a the compute graph
    // printf("Creating new tensor q31\n");
@ -207,7 +207,7 @@ int main(int argc, char ** argv)  {
    // Set up a second graph computation to make sure we override the CPU cache lines
    // printf("Creating new tensor q12 & Running quantize\n");
    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
    // printf("Creating new tensor q32\n");
    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -325,14 +325,14 @@ struct train_params {
 };
 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
+    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
+    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
+    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
-    printf("%s: n_mult:  %d\n", __func__, params->n_mult);
+    printf("%s: n_mult:  %u\n", __func__, params->n_mult);
-    printf("%s: n_head:  %d\n", __func__, params->n_head);
+    printf("%s: n_head:  %u\n", __func__, params->n_head);
-    printf("%s: n_ff:    %d\n", __func__, params->n_ff);
+    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
-    printf("%s: n_layer: %d\n", __func__, params->n_layer);
+    printf("%s: n_layer: %u\n", __func__, params->n_layer);
-    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
+    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
 }
 static void init_model(struct my_llama_model * model) {
@ -350,25 +350,25 @@ static void init_model(struct my_llama_model * model) {
    model->train_tokens = 0;
    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
+    printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-    printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
+    printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
    // printing the per-layer allocations here so we dont print in the for loop.
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
    ggml_set_name(model->norm,           "norm.weight");
@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }
 static void print_matrix(struct ggml_tensor * probs) {
-    assert(probs->n_dims == 2);
+    assert(ggml_is_matrix(probs));
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
 static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
    int ct;
-    switch (gg_weights->n_dims){
+    switch (ggml_n_dims(gg_weights)) {
        case 1:
            ct = 0;
            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -7,6 +7,64 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 // Split `s` into its lines, in order of appearance.
 // Lines are read with std::getline, so the '\n' delimiters are not part of
 // the returned strings, a trailing newline does not add an empty entry, and
 // an empty input yields an empty vector.
 static std::vector<std::string> split_lines(const std::string & s) {
    std::vector<std::string> result;
    std::stringstream stream(s);
    for (std::string piece; std::getline(stream, piece); ) {
        result.push_back(piece);
    }
    return result;
 }
 // Append every token of `tokens` to `batch` under sequence id `seq_id`.
 // Each token's position is its index within `tokens`; only the last token
 // is added with the final flag of llama_batch_add set to true.
 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
    const size_t n_tokens = tokens.size();
    for (size_t pos = 0; pos < n_tokens; pos++) {
        const bool is_last = (pos + 1 == n_tokens);
        llama_batch_add(batch, tokens[pos], pos, { seq_id }, is_last);
    }
 }
 // Write the L2-normalized copy of `vec` into `out`.
 //
 //   vec - input vector of `n` floats
 //   out - destination buffer of at least `n` floats
 //   n   - number of elements to process
 //
 // A vector whose Euclidean norm is zero (e.g. all zeros) has no direction;
 // it is written out as all zeros instead of dividing by zero, which would
 // fill `out` with NaN values.
 static void normalize(const float * vec, float * out, int n) {
    float norm = 0;
    for (int i = 0; i < n; i++) {
        norm += vec[i] * vec[i];
    }
    norm = sqrt(norm);
    // guard the division: a zero norm would otherwise produce 0/0 = NaN
    const bool has_direction = norm > 0.0f;
    for (int i = 0; i < n; i++) {
        out[i] = has_direction ? vec[i] / norm : 0.0f;
    }
 }
 // Decode `batch` and store one normalized embedding per flagged token.
 //
 // For every token whose `batch.logits` entry is set, the embedding is looked
 // up by the token's first sequence id and, if that returns NULL, by the
 // token's index in the batch. The result is normalized into
 // `output + seq_id * n_embd`. `n_seq` is only used in the log line.
 //
 // NOTE(review): a decode failure is only logged; the loop below still reads
 // embeddings afterwards - confirm this best-effort behaviour is intended.
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
    // clear previous kv_cache values (irrelevant for embeddings)
    llama_kv_cache_clear(ctx);
    // run model
    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
    const bool decode_failed = llama_decode(ctx, batch) < 0;
    if (decode_failed) {
        fprintf(stderr, "%s : failed to decode\n", __func__);
    }
    // normalize on copy
    for (int tok = 0; tok < batch.n_tokens; tok++) {
        if (batch.logits[tok]) {
            const auto seq = batch.seq_id[tok][0];
            // try to get sequence embeddings - supported only when pooling_type is not NONE
            const float * embd = llama_get_embeddings_seq(ctx, seq);
            if (embd == NULL) {
                // fall back to the embedding of this token's batch index
                embd = llama_get_embeddings_ith(ctx, tok);
            }
            if (embd == NULL) {
                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, tok);
                continue;
            }
            normalize(embd, output + seq * n_embd, n_embd);
        }
    }
 }
 int main(int argc, char ** argv) {
    gpt_params params;
@ -29,7 +87,8 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }
-    llama_backend_init(params.numa);
+    llama_backend_init();
    llama_numa_init(params.numa);
    llama_model * model;
    llama_context * ctx;
@ -55,49 +114,85 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }
-    int n_past = 0;
+    // split the prompt into lines
    std::vector<std::string> prompts = split_lines(params.prompt);
-    // tokenize the prompt
+    // max batch size
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    const uint64_t n_batch = params.n_batch;
    GGML_ASSERT(params.n_batch == params.n_ctx);
    // tokenize the prompts and trim
    std::vector<std::vector<int32_t>> inputs;
    for (const auto & prompt : prompts) {
        auto inp = ::llama_tokenize(ctx, prompt, true);
        if (inp.size() > n_batch) {
            inp.resize(n_batch);
        }
        inputs.push_back(inp);
    }
    // tokenization stats
    if (params.verbose_prompt) {
-        fprintf(stderr, "\n");
+        for (int i = 0; i < (int) inputs.size(); i++) {
-        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
-        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
-        for (int i = 0; i < (int) embd_inp.size(); i++) {
+            for (int j = 0; j < (int) inputs[i].size(); j++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
            }
            fprintf(stderr, "\n\n");
        }
        fprintf(stderr, "\n");
    }
-    if (embd_inp.size() > (size_t)n_ctx) {
+    // initialize batch
-        fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
+    const int n_prompts = prompts.size();
-                __func__, embd_inp.size(), n_ctx);
+    struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
        return 1;
    }
    while (!embd_inp.empty()) {
        int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
        if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }
        n_past += n_tokens;
        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
    }
    // allocate output
    const int n_embd = llama_n_embd(model);
-    const auto * embeddings = llama_get_embeddings(ctx);
+    std::vector<float> embeddings(n_prompts * n_embd, 0);
    float * emb = embeddings.data();
-    for (int i = 0; i < n_embd; i++) {
+    // break into batches
-        printf("%f ", embeddings[i]);
+    int p = 0; // number of prompts processed already
    int s = 0; // number of prompts in current batch
    for (int k = 0; k < n_prompts; k++) {
        // clamp to n_batch tokens
        auto & inp = inputs[k];
        const uint64_t n_toks = inp.size();
        // encode if at capacity
        if (batch.n_tokens + n_toks > n_batch) {
            float * out = emb + p * n_embd;
            batch_decode(ctx, batch, out, s, n_embd);
            llama_batch_clear(batch);
            p += s;
            s = 0;
        }
        // add to batch
        batch_add_seq(batch, inp, s);
        s += 1;
    }
    printf("\n");
    // final batch
    float * out = emb + p * n_embd;
    batch_decode(ctx, batch, out, s, n_embd);
    // print first 3 embeddings
    for (int j = 0; j < std::min(3, n_prompts); j++) {
        fprintf(stderr, "embedding %d: ", j);
        for (int i = 0; i < n_embd; i++) {
            fprintf(stderr, "%f ", emb[j * n_embd + i]);
        }
        fprintf(stderr, "\n\n");
    }
    fprintf(stderr, "\n");
    // clean up
    llama_print_timings(ctx);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@ -7,8 +7,6 @@
 #include <string>
 #include <thread>
 static const size_t tensor_alignment = 32;
 struct lora_info {
    std::string filename;
    float scale;
@ -245,9 +243,8 @@ static struct lora_data * load_lora(struct lora_info * info) {
    params_ggml.no_alloc   = true;
    result->ctx = ggml_init(params_ggml);
    uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
    uint32_t magic   = file.read_u32();
-    if (magic != LLAMA_FILE_MAGIC_LORA) {
+    if (magic != LLAMA_FILE_MAGIC_GGLA) {
        die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
    }
    uint32_t version = file.read_u32();
@ -309,7 +306,7 @@ static struct ggml_cgraph * build_graph_lora(
 ) {
    struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
    if (scaling != 1.0f) {
-        ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling));
+        ab = ggml_scale(ctx, ab, scaling);
    }
    struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
@ -338,24 +335,14 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
    params.mem_buffer = NULL;
    params.no_alloc   = true;
    struct ggml_context * ctx = NULL;
-    struct ggml_allocr * alloc = NULL;
+    struct ggml_gallocr * alloc = NULL;
-    struct ggml_cgraph * gf = NULL;
+    struct ggml_cgraph  * gf = NULL;
    ctx   = ggml_init(params);
-    alloc = ggml_allocr_new_measure(tensor_alignment);
+    alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
    size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf);
    ggml_allocr_free(alloc);
    ggml_free(ctx);
-    static std::vector<uint8_t> data_compute;
+    ggml_gallocr_alloc_graph(alloc, gf);
    data_compute.resize(alloc_size + tensor_alignment);
    ctx   = ggml_init(params);
    alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment);
    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
    ggml_allocr_alloc_graph(alloc, gf);
    ggml_allocr_free(alloc);
    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
    static std::vector<uint8_t> data_work;
@ -364,6 +351,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
    ggml_graph_compute(gf, &cplan);
    ggml_gallocr_free(alloc);
    ggml_free(ctx);
    return true;
 }
--- a/examples/finetune/README.md
+++ b/examples/finetune/README.md
@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
 ```
-The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
+The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
 Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
 If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
@ -80,9 +80,9 @@ The LORA rank can be configured for each model tensor type separately with these
  --rank-wk N                LORA rank for wk tensor (default 4)
  --rank-wv N                LORA rank for wv tensor (default 4)
  --rank-wo N                LORA rank for wo tensor (default 4)
-  --rank-w1 N                LORA rank for w1 tensor (default 4)
+  --rank-ffn_gate N          LORA rank for ffn_gate tensor (default 4)
-  --rank-w2 N                LORA rank for w2 tensor (default 4)
+  --rank-ffn_down N          LORA rank for ffn_down tensor (default 4)
-  --rank-w3 N                LORA rank for w3 tensor (default 4)
+  --rank-ffn_up N            LORA rank for ffn_up tensor (default 4)
 ```
 The LORA rank of 'norm' tensors should always be 1.
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@ -1,17 +1,12 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "llama.h"
 #include "common.h"
 #include "train.h"
 #include <unordered_map>
 #include <vector>
 #include <cassert>
 #include <climits>
 #include <cstring>
 #include <cstdarg>
 #include <ctime>
 #include <random>
 #include <stdexcept>
 #include <algorithm>
 #include <string>
@ -19,8 +14,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 static const size_t tensor_alignment = 32;
 struct my_llama_hparams {
    uint32_t n_vocab    = 32000;
    uint32_t n_ctx      = 512;
@ -67,9 +60,9 @@ struct my_llama_layer {
    struct ggml_tensor * ffn_norm;
    // ff
-    struct ggml_tensor * w1;
+    struct ggml_tensor * ffn_gate; // w1
-    struct ggml_tensor * w2;
+    struct ggml_tensor * ffn_down; // w2
-    struct ggml_tensor * w3;
+    struct ggml_tensor * ffn_up;   // w3
 };
 struct my_llama_model {
@ -92,9 +85,9 @@ struct my_llama_lora_hparams {
    uint32_t n_rank_wv = 4;
    uint32_t n_rank_wo = 4;
    uint32_t n_rank_ffn_norm = 1;
-    uint32_t n_rank_w1 = 4;
+    uint32_t n_rank_ffn_gate = 4;
-    uint32_t n_rank_w2 = 4;
+    uint32_t n_rank_ffn_down = 4;
-    uint32_t n_rank_w3 = 4;
+    uint32_t n_rank_ffn_up = 4;
    uint32_t n_rank_tok_embeddings = 4;
    uint32_t n_rank_norm = 1;
    uint32_t n_rank_output = 4;
@ -124,17 +117,17 @@ struct my_llama_lora_layer {
    struct ggml_tensor * ffn_norm_b;
    // ff
-    struct ggml_tensor * w1_a;
+    struct ggml_tensor * ffn_gate_a;
-    struct ggml_tensor * w1_b;
+    struct ggml_tensor * ffn_gate_b;
-    struct ggml_tensor * w2_a;
+    struct ggml_tensor * ffn_down_a;
-    struct ggml_tensor * w2_b;
+    struct ggml_tensor * ffn_down_b;
-    struct ggml_tensor * w3_a;
+    struct ggml_tensor * ffn_up_a;
-    struct ggml_tensor * w3_b;
+    struct ggml_tensor * ffn_up_b;
 };
 struct my_llama_lora {
    struct ggml_context * ctx = NULL;
-    std::vector<uint8_t> data;
+    ggml_backend_buffer_t data;
    my_llama_lora_hparams hparams;
@ -196,13 +189,13 @@ static const char * LLM_TENSOR_FFN_DOWN      = "blk.%d.ffn_down";
 static const char * LLM_TENSOR_FFN_UP        = "blk.%d.ffn_up";
 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab:   %u\n", __func__, params->n_vocab);
+    printf("%s: n_vocab               : %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:     %u\n", __func__, params->n_ctx);
+    printf("%s: n_ctx                 : %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd:    %u\n", __func__, params->n_embd);
+    printf("%s: n_embd                : %u\n", __func__, params->n_embd);
-    printf("%s: n_ff:      %u\n", __func__, params->n_ff);
+    printf("%s: n_ff                  : %u\n", __func__, params->n_ff);
-    printf("%s: n_head:    %u\n", __func__, params->n_head);
+    printf("%s: n_head                : %u\n", __func__, params->n_head);
-    printf("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+    printf("%s: n_head_kv             : %u\n", __func__, params->n_head_kv);
-    printf("%s: n_layer:   %u\n", __func__, params->n_layer);
+    printf("%s: n_layer               : %u\n", __func__, params->n_layer);
    printf("%s: norm_rms_eps          : %f\n", __func__, params->f_norm_rms_eps);
    printf("%s: rope_freq_base        : %f\n", __func__, params->rope_freq_base);
    printf("%s: rope_freq_scale       : %f\n", __func__, params->rope_freq_scale);
@ -215,9 +208,9 @@ static void print_lora_params(struct my_llama_lora_hparams * params) {
    printf("%s: n_rank_wv             : %u\n", __func__, params->n_rank_wv);
    printf("%s: n_rank_wo             : %u\n", __func__, params->n_rank_wo);
    printf("%s: n_rank_ffn_norm       : %u\n", __func__, params->n_rank_ffn_norm);
-    printf("%s: n_rank_w1             : %u\n", __func__, params->n_rank_w1);
+    printf("%s: n_rank_ffn_gate       : %u\n", __func__, params->n_rank_ffn_gate);
-    printf("%s: n_rank_w2             : %u\n", __func__, params->n_rank_w2);
+    printf("%s: n_rank_ffn_down       : %u\n", __func__, params->n_rank_ffn_down);
-    printf("%s: n_rank_w3             : %u\n", __func__, params->n_rank_w3);
+    printf("%s: n_rank_ffn_up         : %u\n", __func__, params->n_rank_ffn_up);
    printf("%s: n_rank_tok_embeddings : %u\n", __func__, params->n_rank_tok_embeddings);
    printf("%s: n_rank_norm           : %u\n", __func__, params->n_rank_norm);
    printf("%s: n_rank_output         : %u\n", __func__, params->n_rank_output);
@ -269,7 +262,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
    float rope_freq_scale = 1.0f;
    GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
    GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-    GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    GGUF_GET_KEY(ctx, rope_freq_scale,         gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
    if (rope_freq_scale != 1.0f) {
        hparams->rope_freq_scale = 1.0f / rope_freq_scale;
    }
@ -326,9 +319,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
        layer.wv             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_V, i));
        layer.wo             = llama_get_model_tensor(input, tni(LLM_TENSOR_ATTN_OUT, i));
        layer.ffn_norm       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_NORM, i));
-        layer.w1             = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
+        layer.ffn_gate       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_GATE, i));
-        layer.w2             = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
+        layer.ffn_down       = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_DOWN, i));
-        layer.w3             = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
+        layer.ffn_up         = llama_get_model_tensor(input, tni(LLM_TENSOR_FFN_UP, i));
        assert_shape_1d(layer.attention_norm, hparams.n_embd);
        assert_shape_2d(layer.wq,             hparams.n_embd, hparams.n_embd);
@ -336,9 +329,9 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
        assert_shape_2d(layer.wv,             hparams.n_embd, hparams.n_embd_gqa());
        assert_shape_2d(layer.wo,             hparams.n_embd, hparams.n_embd);
        assert_shape_1d(layer.ffn_norm,       hparams.n_embd);
-        assert_shape_2d(layer.w1,             hparams.n_embd, hparams.n_ff);
+        assert_shape_2d(layer.ffn_gate,       hparams.n_embd, hparams.n_ff);
-        assert_shape_2d(layer.w2,             hparams.n_ff,   hparams.n_embd);
+        assert_shape_2d(layer.ffn_down,       hparams.n_ff,   hparams.n_embd);
-        assert_shape_2d(layer.w3,             hparams.n_embd, hparams.n_ff);
+        assert_shape_2d(layer.ffn_up,         hparams.n_embd, hparams.n_ff);
    }
 }
@ -369,69 +362,12 @@ static void set_param_lora(struct my_llama_lora * lora) {
        ggml_set_param(ctx, layer.wo_b);
        ggml_set_param(ctx, layer.ffn_norm_a);
        ggml_set_param(ctx, layer.ffn_norm_b);
-        ggml_set_param(ctx, layer.w1_a);
+        ggml_set_param(ctx, layer.ffn_gate_a);
-        ggml_set_param(ctx, layer.w1_b);
+        ggml_set_param(ctx, layer.ffn_gate_b);
-        ggml_set_param(ctx, layer.w2_a);
+        ggml_set_param(ctx, layer.ffn_down_a);
-        ggml_set_param(ctx, layer.w2_b);
+        ggml_set_param(ctx, layer.ffn_down_b);
-        ggml_set_param(ctx, layer.w3_a);
+        ggml_set_param(ctx, layer.ffn_up_a);
-        ggml_set_param(ctx, layer.w3_b);
+        ggml_set_param(ctx, layer.ffn_up_b);
    }
 }
 static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora) {
    ggml_allocr_alloc(alloc, lora->tok_embeddings_a);
    ggml_allocr_alloc(alloc, lora->tok_embeddings_b);
    ggml_allocr_alloc(alloc, lora->norm_a);
    ggml_allocr_alloc(alloc, lora->norm_b);
    ggml_allocr_alloc(alloc, lora->output_a);
    ggml_allocr_alloc(alloc, lora->output_b);
    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
        auto & layer = lora->layers[i];
        ggml_allocr_alloc(alloc, layer.attention_norm_a);
        ggml_allocr_alloc(alloc, layer.attention_norm_b);
        ggml_allocr_alloc(alloc, layer.wq_a);
        ggml_allocr_alloc(alloc, layer.wq_b);
        ggml_allocr_alloc(alloc, layer.wk_a);
        ggml_allocr_alloc(alloc, layer.wk_b);
        ggml_allocr_alloc(alloc, layer.wv_a);
        ggml_allocr_alloc(alloc, layer.wv_b);
        ggml_allocr_alloc(alloc, layer.wo_a);
        ggml_allocr_alloc(alloc, layer.wo_b);
        ggml_allocr_alloc(alloc, layer.ffn_norm_a);
        ggml_allocr_alloc(alloc, layer.ffn_norm_b);
        ggml_allocr_alloc(alloc, layer.w1_a);
        ggml_allocr_alloc(alloc, layer.w1_b);
        ggml_allocr_alloc(alloc, layer.w2_a);
        ggml_allocr_alloc(alloc, layer.w2_b);
        ggml_allocr_alloc(alloc, layer.w3_a);
        ggml_allocr_alloc(alloc, layer.w3_b);
    }
    ggml_allocr_alloc(alloc, lora->tok_embeddings_a->grad);
    ggml_allocr_alloc(alloc, lora->tok_embeddings_b->grad);
    ggml_allocr_alloc(alloc, lora->norm_a->grad);
    ggml_allocr_alloc(alloc, lora->norm_b->grad);
    ggml_allocr_alloc(alloc, lora->output_a->grad);
    ggml_allocr_alloc(alloc, lora->output_b->grad);
    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
        auto & layer = lora->layers[i];
        ggml_allocr_alloc(alloc, layer.attention_norm_a->grad);
        ggml_allocr_alloc(alloc, layer.attention_norm_b->grad);
        ggml_allocr_alloc(alloc, layer.wq_a->grad);
        ggml_allocr_alloc(alloc, layer.wq_b->grad);
        ggml_allocr_alloc(alloc, layer.wk_a->grad);
        ggml_allocr_alloc(alloc, layer.wk_b->grad);
        ggml_allocr_alloc(alloc, layer.wv_a->grad);
        ggml_allocr_alloc(alloc, layer.wv_b->grad);
        ggml_allocr_alloc(alloc, layer.wo_a->grad);
        ggml_allocr_alloc(alloc, layer.wo_b->grad);
        ggml_allocr_alloc(alloc, layer.ffn_norm_a->grad);
        ggml_allocr_alloc(alloc, layer.ffn_norm_b->grad);
        ggml_allocr_alloc(alloc, layer.w1_a->grad);
        ggml_allocr_alloc(alloc, layer.w1_b->grad);
        ggml_allocr_alloc(alloc, layer.w2_a->grad);
        ggml_allocr_alloc(alloc, layer.w2_b->grad);
        ggml_allocr_alloc(alloc, layer.w3_a->grad);
        ggml_allocr_alloc(alloc, layer.w3_b->grad);
    }
 }
@ -499,12 +435,12 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
        layer.ffn_norm_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, n_embd);
        layer.ffn_norm_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_norm, 1);
-        layer.w1_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_embd);
+        layer.ffn_gate_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_embd);
-        layer.w1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w1, n_ff);
+        layer.ffn_gate_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_gate, n_ff);
-        layer.w2_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_ff);
+        layer.ffn_down_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_ff);
-        layer.w2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w2, n_embd);
+        layer.ffn_down_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_down, n_embd);
-        layer.w3_a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_embd);
+        layer.ffn_up_a   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up,   n_embd);
-        layer.w3_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_w3, n_ff);
+        layer.ffn_up_b   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, lparams.n_rank_ffn_up,   n_ff);
        ggml_set_name(layer.attention_norm_a, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_a", i));
        ggml_set_name(layer.attention_norm_b, tni(LLM_TENSOR_ATTN_NORM, ".weight.lora_b", i));
@ -518,28 +454,18 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
        ggml_set_name(layer.wo_b,             tni(LLM_TENSOR_ATTN_OUT,  ".weight.lora_b", i));
        ggml_set_name(layer.ffn_norm_a,       tni(LLM_TENSOR_FFN_NORM,  ".weight.lora_a", i));
        ggml_set_name(layer.ffn_norm_b,       tni(LLM_TENSOR_FFN_NORM,  ".weight.lora_b", i));
-        ggml_set_name(layer.w1_a,             tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_a", i));
+        ggml_set_name(layer.ffn_gate_a,       tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_a", i));
-        ggml_set_name(layer.w1_b,             tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_b", i));
+        ggml_set_name(layer.ffn_gate_b,       tni(LLM_TENSOR_FFN_GATE,  ".weight.lora_b", i));
-        ggml_set_name(layer.w2_a,             tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_a", i));
+        ggml_set_name(layer.ffn_down_a,       tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_a", i));
-        ggml_set_name(layer.w2_b,             tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_b", i));
+        ggml_set_name(layer.ffn_down_b,       tni(LLM_TENSOR_FFN_DOWN,  ".weight.lora_b", i));
-        ggml_set_name(layer.w3_a,             tni(LLM_TENSOR_FFN_UP,    ".weight.lora_a", i));
+        ggml_set_name(layer.ffn_up_a,         tni(LLM_TENSOR_FFN_UP,    ".weight.lora_a", i));
-        ggml_set_name(layer.w3_b,             tni(LLM_TENSOR_FFN_UP,    ".weight.lora_b", i));
+        ggml_set_name(layer.ffn_up_b,         tni(LLM_TENSOR_FFN_UP,    ".weight.lora_b", i));
    }
    set_param_lora(lora);
-    // measure data size
+    // allocate data for lora tensors
-    size_t size = 0;
+    lora->data = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
    }
    // allocate data
    struct ggml_allocr * alloc = NULL;
    lora->data.resize(size + tensor_alignment);
    alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
    alloc_lora(alloc, lora);
    ggml_allocr_free(alloc);
 }
 static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, float std, float min, float max) {
@ -571,12 +497,12 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
        randomize_tensor_normal(layer.ffn_norm_a, rnd);
        ggml_set_zero(layer.ffn_norm_b);
-        randomize_tensor_normal(layer.w1_a, rnd);
+        randomize_tensor_normal(layer.ffn_gate_a, rnd);
-        ggml_set_zero(layer.w1_b);
+        ggml_set_zero(layer.ffn_gate_b);
-        randomize_tensor_normal(layer.w2_a, rnd);
+        randomize_tensor_normal(layer.ffn_down_a, rnd);
-        ggml_set_zero(layer.w2_b);
+        ggml_set_zero(layer.ffn_down_b);
-        randomize_tensor_normal(layer.w3_a, rnd);
+        randomize_tensor_normal(layer.ffn_up_a, rnd);
-        ggml_set_zero(layer.w3_b);
+        ggml_set_zero(layer.ffn_up_b);
    }
    free_random_normal_distribution(rnd);
@ -585,7 +511,7 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
 static struct ggml_tensor * llama_build_lora_finetune_graphs(
        struct my_llama_model * model,
        struct my_llama_lora  * lora,
-        struct ggml_allocr    * alloc,
+        ggml_gallocr_t          alloc,
        struct ggml_context   * ctx,
        struct ggml_cgraph    * gf,
        struct ggml_cgraph    * gb,
@ -596,7 +522,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        const  int              n_tokens,
        const  int              n_batch,
        const  bool             enable_flash_attn,
-        const  bool             enable_checkpointing) {
+        const  bool             enable_checkpointing,
        const  bool             measure_only) {
    ggml_set_scratch(ctx, { 0, 0, nullptr, });
    const int n_past = 0;
@ -612,6 +539,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    const int n_rot       = hparams.n_embd_head();
    const int n_embd_head = hparams.n_embd_head();
    const int n_embd_gqa  = hparams.n_embd_gqa();
    const float rms_norm_eps    = hparams.f_norm_rms_eps;
    const float rope_freq_base  = hparams.rope_freq_base;
    const float rope_freq_scale = hparams.rope_freq_scale;
@ -627,13 +555,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    // KQ_pos - contains the positions
    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(alloc, KQ_pos);
+    ggml_set_input(KQ_pos);
    if (!ggml_allocr_is_measure(alloc)) {
        int * data = (int *) KQ_pos->data;
        for (int i = 0; i < N; ++i) {
            data[i] = n_past + i;
        }
    }
    // rope has so much parameters that we make a custom function for it
    auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
@ -680,10 +602,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        checkpoints.push_back(t01);
    }
-    struct ggml_tensor * kv_scale = NULL;
+    const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
    if (!enable_flash_attn) {
        kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head));
    }
    for (int il = 0; il < n_layer; ++il) {
        struct my_llama_layer & layer = model->layers[il];
@ -691,13 +610,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        struct ggml_tensor * attention_norm = add_to_f32(ctx, layer.attention_norm, ggml_mul_mat(ctx, llayer.attention_norm_a, llayer.attention_norm_b));
        struct ggml_tensor * ffn_norm = add_to_f32(ctx, layer.ffn_norm, ggml_mul_mat(ctx, llayer.ffn_norm_a, llayer.ffn_norm_b));
-        struct ggml_tensor * wq = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
+        struct ggml_tensor * wq       = add_to_f32(ctx, layer.wq, ggml_mul_mat(ctx, llayer.wq_a, llayer.wq_b));
-        struct ggml_tensor * wk = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
+        struct ggml_tensor * wk       = add_to_f32(ctx, layer.wk, ggml_mul_mat(ctx, llayer.wk_a, llayer.wk_b));
-        struct ggml_tensor * wv = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
+        struct ggml_tensor * wv       = add_to_f32(ctx, layer.wv, ggml_mul_mat(ctx, llayer.wv_a, llayer.wv_b));
-        struct ggml_tensor * wo = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
+        struct ggml_tensor * wo       = add_to_f32(ctx, layer.wo, ggml_mul_mat(ctx, llayer.wo_a, llayer.wo_b));
-        struct ggml_tensor * w1 = add_to_f32(ctx, layer.w1, ggml_mul_mat(ctx, llayer.w1_a, llayer.w1_b));
+        struct ggml_tensor * ffn_gate = add_to_f32(ctx, layer.ffn_gate, ggml_mul_mat(ctx, llayer.ffn_gate_a, llayer.ffn_gate_b));
-        struct ggml_tensor * w2 = add_to_f32(ctx, layer.w2, ggml_mul_mat(ctx, llayer.w2_a, llayer.w2_b));
+        struct ggml_tensor * ffn_down = add_to_f32(ctx, layer.ffn_down, ggml_mul_mat(ctx, llayer.ffn_down_a, llayer.ffn_down_b));
-        struct ggml_tensor * w3 = add_to_f32(ctx, layer.w3, ggml_mul_mat(ctx, llayer.w3_a, llayer.w3_b));
+        struct ggml_tensor * ffn_up   = add_to_f32(ctx, layer.ffn_up, ggml_mul_mat(ctx, llayer.ffn_up_a, llayer.ffn_up_b));
        struct ggml_tensor * t02 = ggml_rms_norm     (ctx, cur, rms_norm_eps);                       set_name(t02, "t02");     assert_shape_2d(t02, n_embd, N*n_batch);
        struct ggml_tensor * t03 = ggml_repeat       (ctx, attention_norm, t02);                     set_name(t03, "t03");     assert_shape_2d(t03, n_embd, N*n_batch);
@ -740,11 +659,11 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        struct ggml_tensor * t22 = ggml_rms_norm     (ctx, t21, rms_norm_eps);                       set_name(t22, "t22");     assert_shape_2d(t22, n_embd, N*n_batch);
        struct ggml_tensor * t23 = ggml_repeat       (ctx, ffn_norm, t22);                           set_name(t23, "t23");     assert_shape_2d(t23, n_embd, N*n_batch);
        struct ggml_tensor * t24 = ggml_mul          (ctx, t23, t22);                                set_name(t24, "t24");     assert_shape_2d(t24, n_embd, N*n_batch);
-        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, w3, t24);                                 set_name(t25, "t25");     assert_shape_2d(t25, n_ff, N*n_batch);
+        struct ggml_tensor * t25 = ggml_mul_mat      (ctx, ffn_up, t24);                             set_name(t25, "t25");     assert_shape_2d(t25, n_ff, N*n_batch);
-        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, w1, t24);                                 set_name(t26, "t26");     assert_shape_2d(t26, n_ff, N*n_batch);
+        struct ggml_tensor * t26 = ggml_mul_mat      (ctx, ffn_gate, t24);                           set_name(t26, "t26");     assert_shape_2d(t26, n_ff, N*n_batch);
        struct ggml_tensor * t27 = ggml_silu         (ctx, t26);                                     set_name(t27, "t27");     assert_shape_2d(t27, n_ff, N*n_batch);
        struct ggml_tensor * t28 = ggml_mul          (ctx, t27, t25);                                set_name(t28, "t28");     assert_shape_2d(t28, n_ff, N*n_batch);
-        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, w2, t28);                                 set_name(t29, "t29");     assert_shape_2d(t29, n_embd, N*n_batch);
+        struct ggml_tensor * t29 = ggml_mul_mat      (ctx, ffn_down, t28);                           set_name(t29, "t29");     assert_shape_2d(t29, n_embd, N*n_batch);
        struct ggml_tensor * t30 = ggml_add          (ctx, t29, t21);                                set_name(t30, "t30");     assert_shape_2d(t30, n_embd, N*n_batch);
        cur = t30;
        if (enable_checkpointing) {
@ -781,43 +700,55 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    // make sure some tensors are not reallocated by inserting new temporary nodes depending on them
    int n_leafs_before = gb->n_leafs;
    int n_nodes_before = gb->n_nodes;
-    struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f);
+
    // output tensors
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
    // input gradient
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
    GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
-    ggml_allocr_alloc(alloc, t36->grad);
+    ggml_set_input(t36->grad);
    // KQ_pos
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
    // make sure base model tensors data cannot be used in viewable operations
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f));
    for (int il = 0; il < n_layer; ++il) {
        struct my_llama_layer & layer = model->layers[il];
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_gate, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_down, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_up, 1.0f));
    }
    // allocating checkpoints in one block to reduce memory fragmentation
    // note: they will be freed in reverse order
    for (unsigned int i = 0; i < checkpoints.size(); ++i) {
        if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
-            ggml_allocr_alloc(alloc, checkpoints[i]);
+            ggml_set_input(checkpoints[i]);
        }
    }
-    ggml_allocr_alloc_graph(alloc, gb);
+    if (measure_only) {
        ggml_gallocr_reserve(alloc, gb);
    } else {
        ggml_gallocr_alloc_graph(alloc, gb);
        // set KQ_pos
        {
            int * data = (int *) KQ_pos->data;
            for (int i = 0; i < N; ++i) {
                data[i] = n_past + i;
            }
        }
    }
    // remove the additional nodes and leafs
    for (int i = n_leafs_before; i < gb->n_leafs; ++i) {
@ -867,9 +798,9 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wv,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_V);
    GGUF_GET_KEY(fctx, lora->hparams.n_rank_wo,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT);
    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_norm,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_NORM);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_w1,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
+    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_gate,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_GATE);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_w2,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
+    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_down,       gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN);
-    GGUF_GET_KEY(fctx, lora->hparams.n_rank_w3,             gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
+    GGUF_GET_KEY(fctx, lora->hparams.n_rank_ffn_up,         gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_TRAINING_LORA_RANK_FFN_UP);
    init_lora(model, lora);
@ -894,12 +825,12 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
        copy_tensor_by_name(layer.wo_b,             f_ggml_ctx, ggml_get_name(layer.wo_b));
        copy_tensor_by_name(layer.ffn_norm_a,       f_ggml_ctx, ggml_get_name(layer.ffn_norm_a));
        copy_tensor_by_name(layer.ffn_norm_b,       f_ggml_ctx, ggml_get_name(layer.ffn_norm_b));
-        copy_tensor_by_name(layer.w1_a,             f_ggml_ctx, ggml_get_name(layer.w1_a));
+        copy_tensor_by_name(layer.ffn_gate_a,       f_ggml_ctx, ggml_get_name(layer.ffn_gate_a));
-        copy_tensor_by_name(layer.w1_b,             f_ggml_ctx, ggml_get_name(layer.w1_b));
+        copy_tensor_by_name(layer.ffn_gate_b,       f_ggml_ctx, ggml_get_name(layer.ffn_gate_b));
-        copy_tensor_by_name(layer.w2_a,             f_ggml_ctx, ggml_get_name(layer.w2_a));
+        copy_tensor_by_name(layer.ffn_down_a,       f_ggml_ctx, ggml_get_name(layer.ffn_down_a));
-        copy_tensor_by_name(layer.w2_b,             f_ggml_ctx, ggml_get_name(layer.w2_b));
+        copy_tensor_by_name(layer.ffn_down_b,       f_ggml_ctx, ggml_get_name(layer.ffn_down_b));
-        copy_tensor_by_name(layer.w3_a,             f_ggml_ctx, ggml_get_name(layer.w3_a));
+        copy_tensor_by_name(layer.ffn_up_a,         f_ggml_ctx, ggml_get_name(layer.ffn_up_a));
-        copy_tensor_by_name(layer.w3_b,             f_ggml_ctx, ggml_get_name(layer.w3_b));
+        copy_tensor_by_name(layer.ffn_up_b,         f_ggml_ctx, ggml_get_name(layer.ffn_up_b));
    }
 }
@ -937,9 +868,9 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_V,       lora->hparams.n_rank_wv);
    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_ATTN_OUT,     lora->hparams.n_rank_wo);
    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_NORM,     lora->hparams.n_rank_ffn_norm);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE,     lora->hparams.n_rank_w1);
+    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_GATE,     lora->hparams.n_rank_ffn_gate);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN,     lora->hparams.n_rank_w2);
+    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_DOWN,     lora->hparams.n_rank_ffn_down);
-    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP,       lora->hparams.n_rank_w3);
+    gguf_set_val_u32(fctx, LLM_KV_TRAINING_LORA_RANK_FFN_UP,       lora->hparams.n_rank_ffn_up);
    gguf_add_tensor(fctx, lora->tok_embeddings_a);
    gguf_add_tensor(fctx, lora->tok_embeddings_b);
@ -963,12 +894,12 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
        gguf_add_tensor(fctx, layer.wo_b);
        gguf_add_tensor(fctx, layer.ffn_norm_a);
        gguf_add_tensor(fctx, layer.ffn_norm_b);
-        gguf_add_tensor(fctx, layer.w1_a);
+        gguf_add_tensor(fctx, layer.ffn_gate_a);
-        gguf_add_tensor(fctx, layer.w1_b);
+        gguf_add_tensor(fctx, layer.ffn_gate_b);
-        gguf_add_tensor(fctx, layer.w2_a);
+        gguf_add_tensor(fctx, layer.ffn_down_a);
-        gguf_add_tensor(fctx, layer.w2_b);
+        gguf_add_tensor(fctx, layer.ffn_down_b);
-        gguf_add_tensor(fctx, layer.w3_a);
+        gguf_add_tensor(fctx, layer.ffn_up_a);
-        gguf_add_tensor(fctx, layer.w3_b);
+        gguf_add_tensor(fctx, layer.ffn_up_b);
    }
 }
@ -1110,7 +1041,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor,
        name = ggml_get_name(tensor);
    }
    uint32_t name_len = strlen(name);
-    uint32_t nd = tensor->n_dims;
+    uint32_t nd = ggml_n_dims(tensor);
    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
                       (uint32_t)tensor->ne[1],
                       (uint32_t)tensor->ne[2],
@ -1146,9 +1077,8 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
        return tn_buf.data();
    };
    uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC_LORA);   // magic
+    file.write_u32(LLAMA_FILE_MAGIC_GGLA);   // magic
    file.write_u32(1); // version
    // write_hparams
    file.write_u32(lora->hparams.lora_r);
@ -1174,12 +1104,12 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
        write_tensor(&file, layer.wo_b,             tni(LLM_TENSOR_ATTN_OUT,  i, ".weight.loraB"));
        write_tensor(&file, layer.ffn_norm_a,       tni(LLM_TENSOR_FFN_NORM,  i, ".weight.loraA"));
        write_tensor(&file, layer.ffn_norm_b,       tni(LLM_TENSOR_FFN_NORM,  i, ".weight.loraB"));
-        write_tensor(&file, layer.w1_a,             tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraA"));
+        write_tensor(&file, layer.ffn_gate_a,       tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraA"));
-        write_tensor(&file, layer.w1_b,             tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraB"));
+        write_tensor(&file, layer.ffn_gate_b,       tni(LLM_TENSOR_FFN_GATE,  i, ".weight.loraB"));
-        write_tensor(&file, layer.w2_a,             tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraA"));
+        write_tensor(&file, layer.ffn_down_a,       tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraA"));
-        write_tensor(&file, layer.w2_b,             tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraB"));
+        write_tensor(&file, layer.ffn_down_b,       tni(LLM_TENSOR_FFN_DOWN,  i, ".weight.loraB"));
-        write_tensor(&file, layer.w3_a,             tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraA"));
+        write_tensor(&file, layer.ffn_up_a,         tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraA"));
-        write_tensor(&file, layer.w3_b,             tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraB"));
+        write_tensor(&file, layer.ffn_up_b,         tni(LLM_TENSOR_FFN_UP,    i, ".weight.loraB"));
    }
 }
@ -1209,9 +1139,9 @@ struct train_params {
    uint32_t n_rank_wv;
    uint32_t n_rank_wo;
    uint32_t n_rank_ffn_norm;
-    uint32_t n_rank_w1;
+    uint32_t n_rank_ffn_gate;
-    uint32_t n_rank_w2;
+    uint32_t n_rank_ffn_down;
-    uint32_t n_rank_w3;
+    uint32_t n_rank_ffn_up;
    uint32_t n_rank_tok_embeddings;
    uint32_t n_rank_norm;
    uint32_t n_rank_output;
@ -1222,9 +1152,9 @@ struct train_params {
    bool custom_n_rank_wv;
    bool custom_n_rank_wo;
    bool custom_n_rank_ffn_norm;
-    bool custom_n_rank_w1;
+    bool custom_n_rank_ffn_gate;
-    bool custom_n_rank_w2;
+    bool custom_n_rank_ffn_down;
-    bool custom_n_rank_w3;
+    bool custom_n_rank_ffn_up;
    bool custom_n_rank_tok_embeddings;
    bool custom_n_rank_norm;
    bool custom_n_rank_output;
@ -1256,9 +1186,9 @@ static struct train_params get_default_train_params() {
    params.n_rank_wv             = 4;
    params.n_rank_wo             = 4;
    params.n_rank_ffn_norm       = 1;
-    params.n_rank_w1             = 4;
+    params.n_rank_ffn_gate       = 4;
-    params.n_rank_w2             = 4;
+    params.n_rank_ffn_down       = 4;
-    params.n_rank_w3             = 4;
+    params.n_rank_ffn_up         = 4;
    params.n_rank_tok_embeddings = 4;
    params.n_rank_norm           = 1;
    params.n_rank_output         = 4;
@ -1269,9 +1199,9 @@ static struct train_params get_default_train_params() {
    params.custom_n_rank_wv             = false;
    params.custom_n_rank_wo             = false;
    params.custom_n_rank_ffn_norm       = false;
-    params.custom_n_rank_w1             = false;
+    params.custom_n_rank_ffn_gate       = false;
-    params.custom_n_rank_w2             = false;
+    params.custom_n_rank_ffn_down       = false;
-    params.custom_n_rank_w3             = false;
+    params.custom_n_rank_ffn_up         = false;
    params.custom_n_rank_tok_embeddings = false;
    params.custom_n_rank_norm           = false;
    params.custom_n_rank_output         = false;
@ -1302,9 +1232,9 @@ static void train_print_usage(int argc, char ** argv, const struct train_params
    fprintf(stderr, "  --rank-wk N                LORA rank for wk tensor, overrides default rank.\n");
    fprintf(stderr, "  --rank-wv N                LORA rank for wv tensor, overrides default rank.\n");
    fprintf(stderr, "  --rank-wo N                LORA rank for wo tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-w1 N                LORA rank for w1 tensor, overrides default rank.\n");
+    fprintf(stderr, "  --rank-ffn_gate N          LORA rank for ffn_gate tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-w2 N                LORA rank for w2 tensor, overrides default rank.\n");
+    fprintf(stderr, "  --rank-ffn_down N          LORA rank for ffn_down tensor, overrides default rank.\n");
-    fprintf(stderr, "  --rank-w3 N                LORA rank for w3 tensor, overrides default rank.\n");
+    fprintf(stderr, "  --rank-ffn_up N            LORA rank for ffn_up tensor, overrides default rank.\n");
    print_common_train_usage(argc, argv, &params->common);
 }
@ -1439,27 +1369,27 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
            }
            params->n_rank_wo = std::stoi(argv[i]);
            params->custom_n_rank_wo = true;
-        } else if (arg == "--rank-w1") {
+        } else if (arg == "--rank-ffn_gate") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params->n_rank_w1 = std::stoi(argv[i]);
+            params->n_rank_ffn_gate = std::stoi(argv[i]);
-            params->custom_n_rank_w1 = true;
+            params->custom_n_rank_ffn_gate = true;
-        } else if (arg == "--rank-w2") {
+        } else if (arg == "--rank-ffn_down") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params->n_rank_w2 = std::stoi(argv[i]);
+            params->n_rank_ffn_down = std::stoi(argv[i]);
-            params->custom_n_rank_w2 = true;
+            params->custom_n_rank_ffn_down = true;
-        } else if (arg == "--rank-w3") {
+        } else if (arg == "--rank-ffn_up") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params->n_rank_w3 = std::stoi(argv[i]);
+            params->n_rank_ffn_up = std::stoi(argv[i]);
-            params->custom_n_rank_w3 = true;
+            params->custom_n_rank_ffn_up = true;
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            train_print_usage(argc, argv, &default_params);
@ -1522,12 +1452,12 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) {
        nx += ggml_nelements(layer.wo_b);
        nx += ggml_nelements(layer.ffn_norm_a);
        nx += ggml_nelements(layer.ffn_norm_b);
-        nx += ggml_nelements(layer.w1_a);
+        nx += ggml_nelements(layer.ffn_gate_a);
-        nx += ggml_nelements(layer.w1_b);
+        nx += ggml_nelements(layer.ffn_gate_b);
-        nx += ggml_nelements(layer.w2_a);
+        nx += ggml_nelements(layer.ffn_down_a);
-        nx += ggml_nelements(layer.w2_b);
+        nx += ggml_nelements(layer.ffn_down_b);
-        nx += ggml_nelements(layer.w3_a);
+        nx += ggml_nelements(layer.ffn_up_a);
-        nx += ggml_nelements(layer.w3_b);
+        nx += ggml_nelements(layer.ffn_up_b);
    }
    return nx;
 }
@ -1581,9 +1511,9 @@ int main(int argc, char ** argv) {
    uint32_t n_rank_wv                 = params.custom_n_rank_wv             ? params.n_rank_wv             : params.lora_r;
    uint32_t n_rank_wo                 = params.custom_n_rank_wo             ? params.n_rank_wo             : params.lora_r;
    uint32_t n_rank_ffn_norm           = params.custom_n_rank_ffn_norm       ? params.n_rank_ffn_norm       : 1;
-    uint32_t n_rank_w1                 = params.custom_n_rank_w1             ? params.n_rank_w1             : params.lora_r;
+    uint32_t n_rank_ffn_gate           = params.custom_n_rank_ffn_gate       ? params.n_rank_ffn_gate       : params.lora_r;
-    uint32_t n_rank_w2                 = params.custom_n_rank_w2             ? params.n_rank_w2             : params.lora_r;
+    uint32_t n_rank_ffn_down           = params.custom_n_rank_ffn_down       ? params.n_rank_ffn_down       : params.lora_r;
-    uint32_t n_rank_w3                 = params.custom_n_rank_w3             ? params.n_rank_w3             : params.lora_r;
+    uint32_t n_rank_ffn_up             = params.custom_n_rank_ffn_up         ? params.n_rank_ffn_up         : params.lora_r;
    uint32_t n_rank_tok_embeddings     = params.custom_n_rank_tok_embeddings ? params.n_rank_tok_embeddings : params.lora_r;
    uint32_t n_rank_norm               = params.custom_n_rank_norm           ? params.n_rank_norm           : 1;
    uint32_t n_rank_output             = params.custom_n_rank_output         ? params.n_rank_output         : params.lora_r;
@ -1593,15 +1523,15 @@ int main(int argc, char ** argv) {
    lora.hparams.n_rank_wv             = n_rank_wv;
    lora.hparams.n_rank_wo             = n_rank_wo;
    lora.hparams.n_rank_ffn_norm       = n_rank_ffn_norm;
-    lora.hparams.n_rank_w1             = n_rank_w1;
+    lora.hparams.n_rank_ffn_gate       = n_rank_ffn_gate;
-    lora.hparams.n_rank_w2             = n_rank_w2;
+    lora.hparams.n_rank_ffn_down       = n_rank_ffn_down;
-    lora.hparams.n_rank_w3             = n_rank_w3;
+    lora.hparams.n_rank_ffn_up         = n_rank_ffn_up;
    lora.hparams.n_rank_tok_embeddings = n_rank_tok_embeddings;
    lora.hparams.n_rank_norm           = n_rank_norm;
    lora.hparams.n_rank_output         = n_rank_output;
    // set opt params from command line
-    opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
+    opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
    opt->params.print_forward_graph     = false;
    opt->params.print_backward_graph    = false;
    opt->params.graph_size              = LLAMA_TRAIN_MAX_NODES;
@ -1620,8 +1550,6 @@ int main(int argc, char ** argv) {
    opt->params.adam.gclip              = params.common.adam_gclip;
    opt->params.adam.eps_f              = params.common.adam_eps_f;
    ggml_allocr * alloc = NULL;
    printf("%s: init model\n", __func__);
    bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);
@ -1638,9 +1566,9 @@ int main(int argc, char ** argv) {
        || (lora.hparams.n_rank_wv             != n_rank_wv)
        || (lora.hparams.n_rank_wo             != n_rank_wo)
        || (lora.hparams.n_rank_ffn_norm       != n_rank_ffn_norm)
-        || (lora.hparams.n_rank_w1             != n_rank_w1)
+        || (lora.hparams.n_rank_ffn_gate       != n_rank_ffn_gate)
-        || (lora.hparams.n_rank_w2             != n_rank_w2)
+        || (lora.hparams.n_rank_ffn_down       != n_rank_ffn_down)
-        || (lora.hparams.n_rank_w3             != n_rank_w3)
+        || (lora.hparams.n_rank_ffn_up         != n_rank_ffn_up)
        || (lora.hparams.n_rank_tok_embeddings != n_rank_tok_embeddings)
        || (lora.hparams.n_rank_norm           != n_rank_norm)
        || (lora.hparams.n_rank_output         != n_rank_output)
@ -1674,7 +1602,7 @@ int main(int argc, char ** argv) {
    printf("%s: seen train_samples     %llu\n", __func__, (long long unsigned) train->train_samples);
    printf("%s: seen train_tokens      %llu\n", __func__, (long long unsigned) train->train_tokens);
    printf("%s: completed train_epochs %llu\n", __func__, (long long unsigned) train->train_epochs);
-    printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + lora.data.size()), (float) (ggml_used_mem(lora.ctx) + lora.data.size()) / (1024.0f*1024.0f));
+    printf("%s: lora_size = %zu bytes (%.1f MB)\n", __func__, (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)), (float) (ggml_used_mem(lora.ctx) + ggml_backend_buffer_get_size(lora.data)) / (1024.0f*1024.0f));
    if (params.only_write_lora) {
        save_train_files_data save_data;
@ -1701,10 +1629,6 @@ int main(int argc, char ** argv) {
    int n_vocab  = model.hparams.n_vocab;
    int n_batch  = params.common.n_batch;
    std::vector<uint8_t> mem_input_data;
    std::vector<uint8_t> mem_compute_data;
    // context for input tensors without their data
    struct ggml_init_params ctx_input_params = {
        ggml_tensor_overhead() * 2, // mem_size
@ -1717,18 +1641,11 @@ int main(int argc, char ** argv) {
    struct ggml_tensor * tokens_input  = ggml_new_tensor_2d(ctx_input, GGML_TYPE_I32, n_tokens, n_batch);
    struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab,  n_tokens, n_batch);
    // measure required memory for input tensors
    size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
                            GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
                            tensor_alignment;
    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
    // allocate input tensors
-    mem_input_data.resize(max_input_size);
+    // measure required memory for input tensors
-    alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
+    ggml_backend_buffer_t input_data = ggml_backend_alloc_ctx_tensors_from_buft(ctx_input, ggml_backend_cpu_buffer_type());
-    ggml_allocr_alloc(alloc, tokens_input);
+    size_t max_input_size = ggml_backend_buffer_get_size(input_data);
-    ggml_allocr_alloc(alloc, target_probs);
+    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
    ggml_allocr_free(alloc);
    // context for compute tensors without their data
    const size_t estimated_compute_size_wo_data = (
@ -1755,7 +1672,7 @@ int main(int argc, char ** argv) {
    // find best evaluation order
    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
        ctx_compute = ggml_init(ctx_compute_params);
-        alloc = ggml_allocr_new_measure(tensor_alignment);
+        ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
        gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
        gf->order = (enum ggml_cgraph_eval_order) order;
        gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1768,14 +1685,15 @@ int main(int argc, char ** argv) {
            &logits, tokens_input, target_probs,
            n_tokens, n_batch,
            params.common.use_flash,
-            params.common.use_checkpointing
+            params.common.use_checkpointing,
            true
        );
-        size_t max_compute_size = ggml_allocr_max_size(alloc) + tensor_alignment;
+        size_t max_compute_size = ggml_gallocr_get_buffer_size(alloc, 0); // FIXME: this will still allocate the buffer
        if (max_compute_size < best_compute_size) {
            best_compute_size = max_compute_size;
            best_order = gf->order;
        }
-        ggml_allocr_free(alloc);
+        ggml_gallocr_free(alloc);
        ggml_free(ctx_compute);
    }
    size_t max_compute_size = best_compute_size;
@ -1786,9 +1704,8 @@ int main(int argc, char ** argv) {
        "invalid");
    // allocate compute tensors
    mem_compute_data.resize(max_compute_size);
    ctx_compute = ggml_init(ctx_compute_params);
-    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
+    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
    gf->order = best_order;
    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1801,15 +1718,17 @@ int main(int argc, char ** argv) {
        &logits, tokens_input, target_probs,
        n_tokens, n_batch,
        params.common.use_flash,
-        params.common.use_checkpointing
+        params.common.use_checkpointing,
        false
    );
    ggml_allocr_free(alloc);
    // tokenize data
    std::vector<llama_token> train_tokens;
    std::vector<size_t> train_samples_begin;
    std::vector<size_t> train_samples_size;
-    printf("%s: tokenize training data\n", __func__);
+    printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
    printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
    printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
    tokenize_file(lctx,
            params.common.fn_train_data,
            params.common.sample_start,
@ -1916,6 +1835,8 @@ int main(int argc, char ** argv) {
    ggml_free(ctx_work);
    ggml_free(ctx_compute);
    ggml_free(ctx_input);
    ggml_gallocr_free(alloc);
    int64_t t1 = ggml_time_ms();
    printf("%s: total training time: ", __func__);
--- a/examples/gguf/CMakeLists.txt
+++ b/examples/gguf/CMakeLists.txt
@ -1,5 +1,5 @@
 set(TARGET gguf)
 add_executable(${TARGET} gguf.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@ -1,5 +1,4 @@
 #include "ggml.h"
 #include "llama.h"
 #include <cstdio>
 #include <cinttypes>
@ -195,7 +194,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
            // print first 10 elements
            const float * data = (const float *) cur->data;
--- a/examples/imatrix/CMakeLists.txt
+++ b/examples/imatrix/CMakeLists.txt
@ -0,0 +1,5 @@
 # Build the imatrix example as a standalone executable from imatrix.cpp.
 set(TARGET imatrix)
 add_executable(${TARGET} imatrix.cpp)
 # Install only the runtime (executable) artifact of this target.
 install(TARGETS ${TARGET} RUNTIME)
 # Link privately against the `common` and `llama` targets plus the platform thread library.
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 # Require at least C++11 when compiling this target.
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@ -0,0 +1,32 @@
 # llama.cpp/examples/imatrix
 Compute an importance matrix for a model and given text dataset. Can be used during quantization to enhance the quality of the quantized models.
 More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
 ## Usage
 ```
 ./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
        [-ofreq num_chunks] [-ow <0 or 1>] [other common params]
 ```
 Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
 The parameters in square brackets are optional and have the following meaning:
 * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
 * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
 * `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
 * `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
 For faster computation, make sure to use GPU offloading via the `-ngl` argument.
 ## Example
 ```bash
 LLAMA_CUBLAS=1 make -j
 # generate importance matrix (imatrix.dat)
 ./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
 # use the imatrix to perform a Q4_K_M quantization
 ./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
 ```
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@ -0,0 +1,622 @@
 #include "common.h"
 #include "llama.h"
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <sstream>
 #include <thread>
 #include <mutex>
 #include <vector>
 #include <fstream>
 #include <unordered_map>
 #include <algorithm>
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 // Per-tensor accumulator used by the importance-matrix collector.
 struct Stats {
    // running sums of squared activation values, one slot per input column
    std::vector<float> values;
    // number of collection calls that contributed to `values`
    int ncall{0};
 };
 // User-configurable settings for the importance-matrix collection.
 struct StatParams {
    // name of the file the collected data is written to
    std::string ofile{"imatrix.dat"};
    // the data is saved whenever the call counter is a multiple of this value
    int         n_output_frequency{10};
    // >0: report each save, >1: additionally report each collection call
    int         verbosity{1};
    // when >0, an extra snapshot file is written every `keep_every` calls
    int         keep_every{0};
    // whether statistics for the "output.weight" tensor are collected as well
    bool        collect_output_weight{false};
 };
 // Gathers per-tensor activation statistics (keyed by tensor name) through an
 // evaluation callback and reads/writes them from/to a binary file.
 class IMatrixCollector {
 public:
    IMatrixCollector() = default;
    // Replace the current settings with `params` (moved from).
    void set_parameters(StatParams&& params) {
        m_params = std::move(params);
    }
    // Evaluation callback: with ask == true it only answers whether the tensor is of interest,
    // with ask == false it accumulates the statistics for the tensor.
    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
    // Write the collected data to the configured output file.
    void save_imatrix() const;
    // Load data from `file_name` into this collector; existing data is discarded first unless `add` is set.
    bool load_imatrix(const char * file_name, bool add);
    // Load data from `file_name` into the supplied map.
    static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
 private:
    // collected statistics, keyed by tensor name
    std::unordered_map<std::string, Stats> m_stats;
    StatParams                             m_params;
    // serializes the collection part of collect_imatrix
    std::mutex                             m_mutex;
    // largest per-tensor call count seen so far
    int                                    m_last_call = 0;
    // host-side staging buffer for src1 when it does not live in host memory
    std::vector<float>                     m_src1_data;
    // the expert ids from ggml_mul_mat_id
    std::vector<int>                       m_ids;
    // Write the collected data to `file_name`.
    void save_imatrix(const char * file_name) const;
    // Write a snapshot whose file name carries the call count `ncall`.
    void keep_imatrix(int ncall) const;
 };
 // Evaluation callback that accumulates, per weight tensor, the column-wise sums of squared
 // activations (src1) that the tensor is multiplied with.
 //
 //   t         - the node being evaluated; t->src[0] / t->src[1] are its operands
 //   ask       - true: only report whether this node is of interest; false: do the collection
 //   user_data - unused
 //
 // Returns true when the node is of interest (ask == true) or after collecting (ask == false).
 // May terminate the process via exit(1) if a tensor is seen with an inconsistent row size.
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
    GGML_UNUSED(user_data);
    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];
    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
    if (ask) {
        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
        if (t->op != GGML_OP_MUL_MAT) return false;
        // skip multiplications with fewer than 16 src1 rows or with non-F32 src1
        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
        // only tensors named "blk.*" are collected, plus "output.weight" when explicitly enabled
        if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
        return true;
    }
    // everything below mutates shared state (m_stats, m_src1_data, m_ids, m_last_call)
    std::lock_guard<std::mutex> lock(m_mutex);
    // copy the data from the GPU memory if needed
    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
    if (!is_host) {
        m_src1_data.resize(ggml_nelements(src1));
        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
    }
    // NOTE(review): src1 is read as float below; the ask path only checks for F32 on the
    // GGML_OP_MUL_MAT branch, not for GGML_OP_MUL_MAT_ID - presumably always F32 there, confirm
    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
    if (t->op == GGML_OP_MUL_MAT_ID) {
        // idx: which column of the id table applies to this node; n_as: number of expert tensors
        const int idx  = ((int32_t *) t->op_params)[0];
        const int n_as = ((int32_t *) t->op_params)[1];
        // the top-k selected expert ids are stored in the src0 tensor
        // for simplicity, always copy src0 to host, because it is small
        // take into account that src0 is not contiguous!
        GGML_ASSERT(src0->ne[1] == src1->ne[1]);
        GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int)));
        m_ids.resize(ggml_nbytes(src0)/sizeof(int));
        ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
        // loop over all possible experts, regardless if they are used or not in the batch
        // this is necessary to guarantee equal number of "ncall" for each tensor
        for (int ex = 0; ex < n_as; ++ex) {
            // from here on src0 refers to the weight tensor of expert `ex`
            src0 = t->src[2 + ex];
            auto& e = m_stats[src0->name];
            if (e.values.empty()) {
                e.values.resize(src1->ne[0], 0);
            }
            else if (e.values.size() != (size_t)src1->ne[0]) {
                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
                exit(1); //GGML_ASSERT(false);
            }
            // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
            //       using the following line, we can correct for that if needed
            //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
            ++e.ncall;
            if (m_params.verbosity > 1) {
                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
            }
            // accumulate only the rows that were routed to expert `ex`
            for (int row = 0; row < (int)src1->ne[1]; ++row) {
                const int excur = m_ids[row*n_as + idx];
                GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
                if (excur != ex) continue;
                const float * x = data + row * src1->ne[0];
                for (int j = 0; j < (int)src1->ne[0]; ++j) {
                    e.values[j] += x[j]*x[j];
                }
            }
            // save/snapshot only when the global call counter advances
            if (e.ncall > m_last_call) {
                m_last_call = e.ncall;
                // NOTE(review): n_output_frequency == 0 would divide by zero here - confirm the argument parser rejects it
                if (m_last_call % m_params.n_output_frequency == 0) {
                    save_imatrix();
                }
                if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
                    keep_imatrix(m_last_call);
                }
            }
        }
    } else {
        auto& e = m_stats[src0->name];
        if (e.values.empty()) {
            e.values.resize(src1->ne[0], 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]) {
            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
            exit(1); //GGML_ASSERT(false);
        }
        ++e.ncall;
        if (m_params.verbosity > 1) {
            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
        }
        // add the squared value of every element of every src1 row to the matching column slot
        for (int row = 0; row < (int)src1->ne[1]; ++row) {
            const float * x = data + row * src1->ne[0];
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
                e.values[j] += x[j]*x[j];
            }
        }
        // save/snapshot only when the global call counter advances
        if (e.ncall > m_last_call) {
            m_last_call = e.ncall;
            // NOTE(review): n_output_frequency == 0 would divide by zero here - confirm the argument parser rejects it
            if (m_last_call % m_params.n_output_frequency == 0) {
                save_imatrix();
            }
            if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
                keep_imatrix(m_last_call);
            }
        }
    }
    return true;
 }
 void IMatrixCollector::save_imatrix() const {
    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
 }
 // Write a snapshot of the collected data to "<output file>.at_<ncall>".
 void IMatrixCollector::keep_imatrix(int ncall) const {
    // fall back to the default name when no output file is configured
    const std::string base     = m_params.ofile.empty() ? std::string("imatrix.dat") : m_params.ofile;
    const std::string snapshot = base + ".at_" + std::to_string(ncall);
    save_imatrix(snapshot.c_str());
 }
 // Serialize all collected statistics to `fname` in binary form.
 //
 // Layout: int n_entries, then for every entry:
 //   int len, `len` bytes of tensor name, int ncall, int nval, `nval` floats.
 //
 // Failures to open or write the file are reported on stderr; in that case the
 // "stored collected data" message is not printed.
 void IMatrixCollector::save_imatrix(const char * fname) const {
    std::ofstream out(fname, std::ios::binary);
    if (!out) {
        fprintf(stderr, "%s: failed to open %s for writing\n", __func__, fname);
        return;
    }
    int n_entries = m_stats.size();
    out.write((const char*)&n_entries, sizeof(n_entries));
    for (const auto& p : m_stats) {
        int len = p.first.size();
        out.write((const char*)&len, sizeof(len));
        out.write(p.first.c_str(), len);
        out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
        int nval = p.second.values.size();
        out.write((const char*)&nval, sizeof(nval));
        if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
    }
    // close explicitly so that buffered-write errors surface before success is reported
    out.close();
    if (out.fail()) {
        fprintf(stderr, "%s: failed to write data to %s\n", __func__, fname);
        return;
    }
    if (m_params.verbosity > 0) {
        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
    }
 }
 // Read statistics from the binary file `imatrix_file` into `imatrix_data`.
 //
 // Expected layout: int n_entries, then for every entry:
 //   int len, `len` bytes of tensor name, int ncall, int nval, `nval` floats.
 // Entries are inserted into (or overwrite values in) `imatrix_data` keyed by name.
 //
 // Returns false when the file cannot be opened or is malformed. When the value
 // count or the values of an entry cannot be read, `imatrix_data` is reset to empty.
 bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
    std::ifstream in(imatrix_file, std::ios::binary);
    if (!in) {
        printf("%s: failed to open %s\n",__func__,imatrix_file);
        return false;
    }
    int n_entries = 0;
    in.read((char*)&n_entries, sizeof(n_entries));
    if (in.fail() || n_entries < 1) {
        printf("%s: no data in file %s\n", __func__, imatrix_file);
        return false;
    }
    for (int i = 0; i < n_entries; ++i) {
        int len = 0;
        in.read((char *)&len, sizeof(len));
        // validate the length before it is used to size a buffer: a truncated or
        // corrupt file must not lead to a negative/garbage allocation size
        if (in.fail() || len < 0) {
            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
            return false;
        }
        // size computed in size_t so that len + 1 cannot overflow int
        std::vector<char> name_as_vec((size_t)len + 1);
        in.read((char *)name_as_vec.data(), len);
        if (in.fail()) {
            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
            return false;
        }
        name_as_vec[len] = 0;
        std::string name{name_as_vec.data()};
        auto& e = imatrix_data[std::move(name)];
        int ncall = 0;
        in.read((char*)&ncall, sizeof(ncall));
        int nval = 0;
        in.read((char *)&nval, sizeof(nval));
        if (in.fail() || nval < 1) {
            printf("%s: failed reading number of values for entry %d\n",__func__,i);
            imatrix_data = {};
            return false;
        }
        e.values.resize(nval);
        in.read((char*)e.values.data(), nval*sizeof(float));
        if (in.fail()) {
            printf("%s: failed reading data for entry %d\n",__func__,i);
            imatrix_data = {};
            return false;
        }
        e.ncall = ncall;
    }
    return true;
 }
 // Loads the given imatrix file into this collector's own statistics (m_stats).
 // When `add` is false the current statistics are discarded first; when it is
 // true they are kept and the file is loaded on top of them.
 // Returns the result of the two-argument load_imatrix overload.
 bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
    const bool start_fresh = !add;
    if (start_fresh) {
        m_stats.clear();
    }
    const bool loaded = load_imatrix(file_name, m_stats);
    return loaded;
 }
 // Single file-local collector instance; the callback below forwards to it.
 static IMatrixCollector g_collector;
 // Free-function adapter with the (tensor, ask, user_data) callback signature.
 // It simply forwards every argument to g_collector.collect_imatrix and returns
 // that call's result unchanged.
 static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
    return g_collector.collect_imatrix(t, ask, user_data);
 }
 // Per-token result bundle produced by log_softmax().
 struct results_log_softmax {
    // log of the softmax probability of the selected token
    double log_softmax;
    // raw (unnormalized) logit of the selected token
    float  logit;
    // softmax probability of the selected token
    float  prob;
 };
 // Computes the softmax of `logits` and returns one probability per input value.
 // The maximum logit is subtracted before exponentiation for numerical
 // stability. An empty input yields an empty result.
 static std::vector<float> softmax(const std::vector<float>& logits) {
    std::vector<float> probs(logits.size());
    // Guard: reading logits[0] below would be undefined for an empty vector.
    if (logits.empty()) {
        return probs;
    }
    float max_logit = logits[0];
    for (float v : logits) {
        max_logit = std::max(max_logit, v);
    }
    // Accumulate the normalizer in double to limit rounding error over long inputs.
    double sum_exp = 0.0;
    for (size_t i = 0; i < logits.size(); i++) {
        // Subtract the maximum logit value from the current logit value for numerical stability
        const float logit = logits[i] - max_logit;
        const float exp_logit = expf(logit);
        sum_exp += exp_logit;
        probs[i] = exp_logit;
    }
    for (size_t i = 0; i < probs.size(); i++) {
        probs[i] /= sum_exp;
    }
    return probs;
 }
 // Evaluates the softmax of `n_vocab` logits at a single index `tok`.
 // Returns, in order: the log-probability of `tok`, its raw logit, and its
 // probability. The largest logit is subtracted before exponentiating so that
 // expf() cannot overflow.
 static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
    // Largest logit, used as the stabilizing offset.
    float peak = logits[0];
    for (int idx = 1; idx < n_vocab; ++idx) {
        peak = std::max(peak, logits[idx]);
    }
    // Softmax normalizer, accumulated in double precision.
    double normalizer = 0.0;
    for (int idx = 0; idx < n_vocab; ++idx) {
        normalizer += expf(logits[idx] - peak);
    }
    results_log_softmax result = {
        logits[tok] - peak - log(normalizer),
        logits[tok],
        expf(logits[tok] - peak) / (float) normalizer
    };
    return result;
 }
 // Computes the negative log-likelihood contribution of `n_token` positions in
 // parallel and adds the totals to `nll` (sum of -log p) and `nll2` (sum of
 // squares of -log p).
 //
 //   logits        - n_token rows of n_vocab values; row i is scored against tokens[i+1]
 //   tokens        - token ids; element i+1 is the target for row i
 //   workers       - pre-sized pool; every element is overwritten with a new thread
 //                   and joined before returning (the calling thread also takes part)
 //   logit_history - receives the target token's raw logit for each position
 //   prob_history  - receives the target token's probability for each position
 static void process_logits(
    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
    double & nll, double & nll2, float * logit_history, float * prob_history
 ) {
    // The mutex guards both the shared work counter and the nll/nll2 accumulators.
    std::mutex mutex;
    int counter = 0;
    auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
        // Per-thread partial sums, merged into the shared totals once at the end.
        double local_nll  = 0;
        double local_nll2 = 0;
        while (true) {
            // Claim the next position index under the lock.
            std::unique_lock<std::mutex> lock(mutex);
            int i = counter++;
            if (i >= n_token) {
                // No work left: publish the partial sums while still holding the lock.
                nll += local_nll; nll2 += local_nll2;
                break;
            }
            // Release the lock before the expensive per-position computation.
            lock.unlock();
            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
            const double v = -results.log_softmax;
            local_nll += v;
            local_nll2 += v*v;
            // Each index i is claimed by exactly one thread, so these writes do not overlap.
            logit_history[i] = results.logit;
            prob_history[i]  = results.prob;
        }
    };
    for (auto & w : workers) {
        w = std::thread(compute);
    }
    // The calling thread works too, so an empty pool still processes everything.
    compute();
    for (auto & w : workers) {
        w.join();
    }
 }
 // Tokenizes params.prompt and evaluates it with llama_decode in chunks of
 // n_ctx tokens (each chunk split into batches of params.n_batch tokens).
 //
 //   ctx         - context to evaluate with
 //   params      - supplies the prompt, n_chunks (negative = all chunks) and n_batch
 //   compute_ppl - when true, also computes and prints a running perplexity,
 //                 scored on the second half of every chunk
 //   from_chunk  - number of leading n_ctx-sized chunks to drop from the input
 //
 // Returns false if there are too few tokens or if llama_decode fails.
 static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    const int n_ctx = llama_n_ctx(ctx);
    auto tim1 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
    auto tim2 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
    if (from_chunk > 0) {
        // Do the size arithmetic in size_t so a large from_chunk cannot overflow int
        // before the comparison.
        const size_t n_skip = (size_t) from_chunk * (size_t) n_ctx;
        if (n_skip + 2 * (size_t) n_ctx >= tokens.size()) {
            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
            return false;
        }
        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, (int) n_skip);
        tokens.erase(tokens.begin(), tokens.begin() + n_skip);
    }
    if (int(tokens.size()) < 2*n_ctx) {
        fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
                n_ctx);
        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return false;
    }
    std::vector<float> logit_history;
    std::vector<float> prob_history;
    if (compute_ppl) {
        logit_history.resize(tokens.size());
        prob_history.resize(tokens.size());
    }
    const int n_chunk_max = tokens.size() / n_ctx;
    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_batch = params.n_batch;
    int count = 0;
    double nll = 0.0;
    double nll2 = 0.0;
    fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
    // hardware_concurrency() may return 0 when the value is not computable; clamp
    // it so the unsigned subtraction cannot wrap around to a huge worker count.
    const unsigned n_hw_threads = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::thread> workers(n_hw_threads - 1);
    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
    // Accumulates the logits of all batches of one chunk when a chunk needs more
    // than one batch. Allocated once here and reused (cleared) for every chunk.
    std::vector<float> logits;
    if (compute_ppl && num_batches > 1) {
        logits.reserve((size_t)n_ctx * n_vocab);
    }
    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;
        const auto t_start = std::chrono::high_resolution_clock::now();
        // clear the KV cache
        llama_kv_cache_clear(ctx);
        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);
            // save original token and restore it after eval
            const auto token_org = tokens[batch_start];
            // add BOS token for the first batch of each chunk
            if (add_bos && j == 0) {
                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return false;
            }
            // restore the original token in case it was set to BOS
            tokens[batch_start] = token_org;
            if (compute_ppl && num_batches > 1) {
                const auto * batch_logits = llama_get_logits(ctx);
                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
            }
        }
        const auto t_end = std::chrono::high_resolution_clock::now();
        if (i == 0) {
            // Extrapolate the total run time from the duration of the first chunk.
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }
        if (compute_ppl) {
            // Only the second half of the chunk is scored.
            const int first = n_ctx/2;
            const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
            count += n_ctx - first - 1;
            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
            fflush(stdout);
            // Drop this chunk's logits but keep the capacity for the next chunk.
            logits.clear();
        }
    }
    printf("\n");
    if (compute_ppl) {
        // Turn the accumulated sums into mean and standard error of -log p.
        nll2 /= count;
        nll /= count;
        const double ppl = exp(nll);
        nll2 -= nll * nll;
        if (nll2 > 0) {
            nll2 = sqrt(nll2/(count-1));
            printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
        } else {
            printf("Unexpected negative standard deviation of log(prob)\n");
        }
    }
    return true;
 }
 // Entry point of the imatrix tool.
 //
 // Options handled here are stripped from the command line; everything else is
 // forwarded to gpt_params_parse. Two modes:
 //   --combine a,b,...  : load the listed imatrix files into the collector, save, exit
 //   otherwise          : load the model, run compute_imatrix with the collector
 //                        installed as eval callback, then save the result
 // Returns 0 on success and 1 on any failure.
 int main(int argc, char ** argv) {
    StatParams sparams;
    std::string prev_result_file;
    std::string combine_files;
    bool compute_ppl = true;
    int  from_chunk  = 0;
    // Arguments not consumed below, passed on to gpt_params_parse.
    std::vector<char*> args;
    args.push_back(argv[0]);
    int iarg = 1;
    // Stop one short of the end so that argv[++iarg] is always a valid element
    // for options that take a value.
    for (; iarg < argc-1; ++iarg) {
        std::string arg{argv[iarg]};
        if (arg == "-o" || arg == "--output-file") {
            sparams.ofile = argv[++iarg];
        }
        else if (arg == "-ofreq" || arg == "--output-frequency") {
            sparams.n_output_frequency = std::stoi(argv[++iarg]);
        }
        else if (arg == "-ow" || arg == "--output-weight") {
            sparams.collect_output_weight = std::stoi(argv[++iarg]);
        }
        else if (arg == "--verbosity") {
            sparams.verbosity = std::stoi(argv[++iarg]);
        } else if (arg == "--no-ppl") {
            compute_ppl = false;
        } else if (arg == "--keep-imatrix") {
            sparams.keep_every = std::stoi(argv[++iarg]);
        } else if (arg == "--continue-from") {
            prev_result_file = argv[++iarg];
        } else if (arg == "--combine") {
            combine_files = argv[++iarg];
        }
        else if (arg == "--from-chunk") {
            from_chunk = std::stoi(argv[++iarg]);
        } else {
            args.push_back(argv[iarg]);
        }
    }
    // A trailing argument that was not consumed as an option value: only the
    // value-less --no-ppl is recognized here, anything else is forwarded.
    if (iarg < argc) {
        std::string arg{argv[iarg]};
        if (arg == "--no-ppl") {
            compute_ppl = false;
        } else {
            args.push_back(argv[iarg]);
        }
    }
    g_collector.set_parameters(std::move(sparams));
    if (!combine_files.empty()) {
        // Split the comma separated list into individual file names.
        std::vector<std::string> files;
        size_t pos = 0;
        while (true) {
            auto new_pos = combine_files.find(',', pos);
            if (new_pos != std::string::npos) {
                files.emplace_back(combine_files.substr(pos, new_pos - pos));
                pos = new_pos + 1;
            } else {
                files.emplace_back(combine_files.substr(pos));
                break;
            }
        }
        if (files.size() < 2) {
            fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
            return 1;
        }
        printf("Combining the following %d files\n", int(files.size()));
        for (auto& file : files) {
            printf("    %s\n", file.c_str());
            if (!g_collector.load_imatrix(file.c_str(), true)) {
                fprintf(stderr, "Failed to load %s\n", file.c_str());
                return 1;
            }
        }
        g_collector.save_imatrix();
        return 0;
    }
    if (!prev_result_file.empty()) {
        if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
            fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
            return 1;
        }
    }
    gpt_params params;
    params.n_batch = 512;
    if (!gpt_params_parse(args.size(), args.data(), params)) {
        return 1;
    }
    params.logits_all = true;
    params.n_batch = std::min(params.n_batch, params.n_ctx);
    print_build_info();
    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }
    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
    std::mt19937 rng(params.seed);
    if (params.random_prompt) {
        params.prompt = gpt_random_prompt(rng);
    }
    llama_backend_init();
    llama_numa_init(params.numa);
    llama_model_params mparams = llama_model_params_from_gpt_params(params);
    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        // Release what was acquired so far before bailing out.
        llama_backend_free();
        return 1;
    }
    llama_context_params cparams = llama_context_params_from_gpt_params(params);
    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    cparams.cb_eval = ik_collect_imatrix;
    cparams.cb_eval_user_data = NULL;
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        fprintf(stderr, "%s: error: unable to create context\n", __func__);
        llama_free_model(model);
        llama_backend_free();
        return 1;
    }
    const int n_ctx_train = llama_n_ctx_train(model);
    if (params.n_ctx > n_ctx_train) {
        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, params.n_ctx);
    }
    // print system information
    {
        fprintf(stderr, "\n");
        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }
    bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
    if (!OK) {
        // Same teardown order as the success path below.
        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 1;
    }
    g_collector.save_imatrix();
    llama_print_timings(ctx);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
 }
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@ -202,7 +202,8 @@ int main(int argc, char ** argv) {
    std::mt19937 rng(params.seed);
    LOG("%s: llama backend init\n", __func__);
-    llama_backend_init(params.numa);
+    llama_backend_init();
    llama_numa_init(params.numa);
    llama_model * model;
    llama_context * ctx;
@ -241,7 +242,7 @@ int main(int argc, char ** argv) {
    LOG("add_bos: %d\n", add_bos);
    bool suff_rm_leading_spc = params.escape;
-    if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
+    if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
        params.input_suffix.erase(0, 1);
        suff_rm_leading_spc = false;
    }
@ -377,10 +378,10 @@ int main(int argc, char ** argv) {
    if (params.interactive) {
        const char *control_message;
        if (params.multiline_input) {
-            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+            control_message = " - To return control to LLaMA, end your input with '\\'.\n"
                              " - To return control without starting a new line, end your input with '/'.\n";
        } else {
-            control_message = " - Press Return to return control to LLaMa.\n"
+            control_message = " - Press Return to return control to LLaMA.\n"
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }
@ -446,8 +447,8 @@ int main(int argc, char ** argv) {
                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                    n_past, n_left, n_ctx, params.n_keep, n_discard);
-                llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
+                llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
-                llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+                llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
                n_past -= n_discard;
--- a/examples/json-schema-to-grammar.py
+++ b/examples/json-schema-to-grammar.py
@ -87,7 +87,21 @@ class SchemaConverter:
        elif schema_type == 'array' and 'items' in schema:
            # TODO `prefixItems` keyword
            item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
-            rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
+            list_item_operator = f'("," space {item_rule_name})'
            successive_items = ""
            min_items = schema.get("minItems", 0)
            if min_items > 0:
               first_item = f"({item_rule_name})"
               successive_items = list_item_operator * (min_items - 1)
               min_items -= 1
            else:
               first_item = f"({item_rule_name})?"
            max_items = schema.get("maxItems")
            if max_items is not None and max_items > min_items:
                successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
            else:
                successive_items += list_item_operator + "*"
            rule = f'"[" space {first_item} {successive_items} "]" space'
            return self._add_rule(rule_name, rule)
        else:
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@ -23,19 +23,22 @@ usage: ./llama-bench [options]
 options:
  -h, --help
-  -m, --model <filename>            (default: models/7B/ggml-model-q4_0.gguf)
+  -m, --model <filename>              (default: models/7B/ggml-model-q4_0.gguf)
-  -p, --n-prompt <n>                (default: 512)
+  -p, --n-prompt <n>                  (default: 512)
-  -n, --n-gen <n>                   (default: 128)
+  -n, --n-gen <n>                     (default: 128)
-  -b, --batch-size <n>              (default: 512)
+  -b, --batch-size <n>                (default: 512)
-  --memory-f32 <0|1>                (default: 0)
+  -ctk <t>, --cache-type-k <t>        (default: f16)
-  -t, --threads <n>                 (default: 16)
+  -ctv <t>, --cache-type-v <t>        (default: f16)
-  -ngl N, --n-gpu-layers <n>        (default: 99)
+  -t, --threads <n>                   (default: 112)
-  -mg i, --main-gpu <i>             (default: 0)
+  -ngl, --n-gpu-layers <n>            (default: 99)
-  -mmq, --mul-mat-q <0|1>           (default: 1)
+  -sm, --split-mode <none|layer|row>  (default: layer)
-  -ts, --tensor_split <ts0/ts1/..>
+  -mg, --main-gpu <i>                 (default: 0)
-  -r, --repetitions <n>             (default: 5)
+  -nkvo, --no-kv-offload <0|1>        (default: 0)
-  -o, --output <csv|json|md|sql>    (default: md)
+  -mmp, --mmap <0|1>                  (default: 1)
-  -v, --verbose                     (default: 0)
+  -ts, --tensor_split <ts0/ts1/..>    (default: 0)
  -r, --repetitions <n>               (default: 5)
  -o, --output <csv|json|md|sql>      (default: md)
  -v, --verbose                       (default: 0)
 Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
 ```
@ -51,6 +54,10 @@ Each test is repeated the number of times given by `-r`, and the results are ave
 For a description of the other options, see the [main example](../main/README.md).
 Note:
 - When using the SYCL backend, the benchmark may hang in some cases. If it does, run it with `-mmp 0`.
 ## Examples
 ### Text generation with different models
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -20,6 +20,7 @@
 #include "llama.h"
 #include "common.h"
 #include "ggml-cuda.h"
 #include "ggml-sycl.h"
 // utils
 static uint64_t get_time_ns() {
@ -53,6 +54,13 @@ static std::vector<T> split(const std::string & str, char delim) {
    return values;
 }
 // Maps every element of `values` through `f` and collects the results, in
 // order, as strings. `f` must return something convertible to std::string.
 template<typename T, typename F>
 static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
    std::vector<std::string> converted;
    for (auto it = values.begin(); it != values.end(); ++it) {
        converted.push_back(f(*it));
    }
    return converted;
 }
 template<typename T>
 static T avg(const std::vector<T> & v) {
    if (v.empty()) {
@ -113,6 +121,17 @@ static std::string get_gpu_info() {
            id += "/";
        }
    }
 #endif
 #ifdef GGML_USE_SYCL
    int count = ggml_backend_sycl_get_device_count();
    for (int i = 0; i < count; i++) {
        char buf[128];
        ggml_sycl_get_device_description(i, buf, sizeof(buf));
        id += buf;
        if (i < count - 1) {
            id += "/";
        }
    }
 #endif
    // TODO: other backends
    return id;
@ -121,17 +140,40 @@ static std::string get_gpu_info() {
 // command line params
 enum output_formats {CSV, JSON, MARKDOWN, SQL};
 // Returns the command-line spelling of an output format
 // ("csv", "json", "md" or "sql"); asserts on any other value.
 static const char * output_format_str(output_formats format) {
    if (format == CSV)      { return "csv";  }
    if (format == JSON)     { return "json"; }
    if (format == MARKDOWN) { return "md";   }
    if (format == SQL)      { return "sql";  }
    GGML_ASSERT(!"invalid output format");
 }
 // Returns the command-line spelling of a split mode
 // ("none", "layer" or "row"); asserts on any other value.
 static const char * split_mode_str(llama_split_mode mode) {
    if (mode == LLAMA_SPLIT_MODE_NONE)  { return "none";  }
    if (mode == LLAMA_SPLIT_MODE_LAYER) { return "layer"; }
    if (mode == LLAMA_SPLIT_MODE_ROW)   { return "row";   }
    GGML_ASSERT(!"invalid split mode");
 }
 struct cmd_params {
    std::vector<std::string> model;
    std::vector<int> n_prompt;
    std::vector<int> n_gen;
    std::vector<int> n_batch;
-    std::vector<bool> f32_kv;
+    std::vector<ggml_type> type_k;
    std::vector<ggml_type> type_v;
    std::vector<int> n_threads;
    std::vector<int> n_gpu_layers;
    std::vector<llama_split_mode> split_mode;
    std::vector<int> main_gpu;
-    std::vector<bool> mul_mat_q;
+    std::vector<bool> no_kv_offload;
-    std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
+    std::vector<std::vector<float>> tensor_split;
    std::vector<bool> use_mmap;
    std::vector<bool> embeddings;
    int reps;
    bool verbose;
    output_formats output_format;
@ -142,12 +184,16 @@ static const cmd_params cmd_params_defaults = {
    /* n_prompt      */ {512},
    /* n_gen         */ {128},
    /* n_batch       */ {512},
-    /* f32_kv        */ {false},
+    /* type_k        */ {GGML_TYPE_F16},
    /* type_v        */ {GGML_TYPE_F16},
    /* n_threads     */ {get_num_physical_cores()},
    /* n_gpu_layers  */ {99},
    /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
    /* main_gpu      */ {0},
-    /* mul_mat_q     */ {true},
+    /* no_kv_offload */ {false},
-    /* tensor_split  */ {{}},
+    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* use_mmap      */ {true},
    /* embeddings    */ {false},
    /* reps          */ 5,
    /* verbose       */ false,
    /* output_format */ MARKDOWN
@ -158,24 +204,51 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("\n");
    printf("options:\n");
    printf("  -h, --help\n");
-    printf("  -m, --model <filename>            (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    printf("  -m, --model <filename>              (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
-    printf("  -p, --n-prompt <n>                (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+    printf("  -p, --n-prompt <n>                  (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
-    printf("  -n, --n-gen <n>                   (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf("  -n, --n-gen <n>                     (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
-    printf("  -b, --batch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+    printf("  -b, --batch-size <n>                (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  --memory-f32 <0|1>                (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+    printf("  -ctk <t>, --cache-type-k <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf("  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    printf("  -ctv <t>, --cache-type-v <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
-    printf("  -ngl, --n-gpu-layers <n>          (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
-    printf("  -mg, --main-gpu <i>               (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+    printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-    printf("  -mmq, --mul-mat-q <0|1>           (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
+    printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
-    printf("  -ts, --tensor_split <ts0/ts1/..>               \n");
+    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    printf("  -r, --repetitions <n>             (default: %d)\n", cmd_params_defaults.reps);
+    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
-    printf("  -o, --output <csv|json|md|sql>    (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
+    printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
-    printf("  -v, --verbose                     (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
    printf("  -ts, --tensor_split <ts0/ts1/..>    (default: 0)\n");
    printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
    printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
    printf("  -v, --verbose                       (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
 }
 // Translates a cache type name given on the command line into its ggml_type.
 // Recognized names: f16, q8_0, q4_0, q4_1, q5_0, q5_1.
 // Returns GGML_TYPE_COUNT as the "unknown name" marker.
 static ggml_type ggml_type_from_name(const std::string & s) {
    struct name_to_type {
        const char * name;
        ggml_type    type;
    };
    static const name_to_type known_types[] = {
        { "f16",  GGML_TYPE_F16  },
        { "q8_0", GGML_TYPE_Q8_0 },
        { "q4_0", GGML_TYPE_Q4_0 },
        { "q4_1", GGML_TYPE_Q4_1 },
        { "q5_0", GGML_TYPE_Q5_0 },
        { "q5_1", GGML_TYPE_Q5_1 },
    };
    for (const name_to_type & entry : known_types) {
        if (s == entry.name) {
            return entry.type;
        }
    }
    return GGML_TYPE_COUNT;
 }
 static cmd_params parse_cmd_params(int argc, char ** argv) {
    cmd_params params;
    std::string arg;
@ -224,13 +297,38 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "--memory-f32") {
+        } else if (arg == "-ctk" || arg == "--cache-type-k") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<int>(argv[i], split_delim);
+            auto p = split<std::string>(argv[i], split_delim);
-            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
+            std::vector<ggml_type> types;
            for (const auto & t : p) {
                ggml_type gt = ggml_type_from_name(t);
                if (gt == GGML_TYPE_COUNT) {
                    invalid_param = true;
                    break;
                }
                types.push_back(gt);
            }
            params.type_k.insert(params.type_k.end(), types.begin(), types.end());
        } else if (arg == "-ctv" || arg == "--cache-type-v") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<std::string>(argv[i], split_delim);
            std::vector<ggml_type> types;
            for (const auto & t : p) {
                ggml_type gt = ggml_type_from_name(t);
                if (gt == GGML_TYPE_COUNT) {
                    invalid_param = true;
                    break;
                }
                types.push_back(gt);
            }
            params.type_v.insert(params.type_v.end(), types.begin(), types.end());
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
@ -245,19 +343,55 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
        } else if (arg == "-sm" || arg == "--split-mode") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<std::string>(argv[i], split_delim);
            std::vector<llama_split_mode> modes;
            for (const auto & m : p) {
                llama_split_mode mode;
                if (m == "none") {
                    mode = LLAMA_SPLIT_MODE_NONE;
                } else if (m == "layer") {
                    mode = LLAMA_SPLIT_MODE_LAYER;
                } else if (m == "row") {
                    mode = LLAMA_SPLIT_MODE_ROW;
                } else {
                    invalid_param = true;
                    break;
                }
                modes.push_back(mode);
            }
            params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
        } else if (arg == "-mg" || arg == "--main-gpu") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.main_gpu = split<int>(argv[i], split_delim);
-        } else if (arg == "-mmq" || arg == "--mul-mat-q") {
+        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<bool>(argv[i], split_delim);
-            params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
+            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
        } else if (arg == "-mmp" || arg == "--mmap") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<bool>(argv[i], split_delim);
            params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
        } else if (arg == "-embd" || arg == "--embeddings") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            auto p = split<bool>(argv[i], split_delim);
            params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
        } else if (arg == "-ts" || arg == "--tensor-split") {
            if (++i >= argc) {
                invalid_param = true;
@ -268,10 +402,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                const std::regex regex{R"([;/]+)"};
                std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
                std::vector<std::string> split_arg{it, {}};
-                GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+                GGML_ASSERT(split_arg.size() <= llama_max_devices());
-                std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+                std::vector<float> tensor_split(llama_max_devices());
-                for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                for (size_t i = 0; i < llama_max_devices(); ++i) {
                    if (i < split_arg.size()) {
                        tensor_split[i] = std::stof(split_arg[i]);
                    } else {
@ -321,11 +455,15 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
    if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
    if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.f32_kv.empty())       { params.f32_kv = cmd_params_defaults.f32_kv; }
+    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
    if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
-    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
+    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
    if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
    if (params.embeddings.empty())   { params.embeddings = cmd_params_defaults.embeddings; }
    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
    return params;
@ -336,19 +474,25 @@ struct cmd_params_instance {
    int n_prompt;
    int n_gen;
    int n_batch;
-    bool f32_kv;
+    ggml_type type_k;
    ggml_type type_v;
    int n_threads;
    int n_gpu_layers;
    llama_split_mode split_mode;
    int main_gpu;
-    bool mul_mat_q;
+    bool no_kv_offload;
-    std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+    std::vector<float> tensor_split;
    bool use_mmap;
    bool embeddings;
    // Builds the model-loading parameters for this benchmark instance: starts from
    // llama_model_default_params() and overrides only the fields set below.
    llama_model_params to_llama_mparams() const {
        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = n_gpu_layers;
        mparams.split_mode = split_mode;
        mparams.main_gpu = main_gpu;
        // Stores a pointer into this instance's tensor_split storage rather than a copy,
        // so this instance must stay alive while the returned params are in use.
        mparams.tensor_split = tensor_split.data();
        mparams.use_mmap = use_mmap;
        return mparams;
    }
@ -356,7 +500,9 @@ struct cmd_params_instance {
    // Returns true when `other` names the same model and has the same values for every
    // field that to_llama_mparams() copies into llama_model_params.
    // NOTE(review): presumably used to decide whether an already-loaded model can be
    // reused for the next instance — confirm against the caller.
    bool equal_mparams(const cmd_params_instance & other) const {
        return model == other.model &&
               n_gpu_layers == other.n_gpu_layers &&
               split_mode == other.split_mode &&
               main_gpu == other.main_gpu &&
               use_mmap == other.use_mmap &&
               tensor_split == other.tensor_split;
    }
@ -365,53 +511,30 @@ struct cmd_params_instance {
        cparams.n_ctx = n_prompt + n_gen;
        cparams.n_batch = n_batch;
-        cparams.f16_kv = !f32_kv;
+        cparams.type_k = type_k;
-        cparams.mul_mat_q = mul_mat_q;
+        cparams.type_v = type_v;
        cparams.offload_kqv = !no_kv_offload;
        cparams.embeddings = embeddings;
        return cparams;
    }
 };
 // Expands `params` into one cmd_params_instance per combination of model, n_gpu_layers,
 // main_gpu, tensor_split, n_batch, f32_kv, mul_mat_q and n_threads, all sharing the
 // given n_gen / n_prompt values.
 // NOTE(review): this helper still iterates params.f32_kv and params.mul_mat_q, which the
 // rest of this patch replaces with type_k/type_v/no_kv_offload — it looks like the
 // pre-change version of the code; confirm before relying on it.
 static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
    std::vector<cmd_params_instance> instances;
    // The stacked range-for statements form one nested loop; only the innermost has a body.
    for (const auto & m : params.model)
    for (const auto & nl : params.n_gpu_layers)
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & nb : params.n_batch)
    for (const auto & fk : params.f32_kv)
    for (const auto & mmq : params.mul_mat_q)
    for (const auto & nt : params.n_threads) {
        // Positional aggregate initialization: the order must match the member order
        // of cmd_params_instance (the comments name the intended members).
        cmd_params_instance instance = {
            /* .model        = */ m,
            /* .n_prompt     = */ n_prompt,
            /* .n_gen        = */ n_gen,
            /* .n_batch      = */ nb,
            /* .f32_kv       = */ fk,
            /* .n_threads    = */ nt,
            /* .n_gpu_layers = */ nl,
            /* .main_gpu     = */ mg,
            /* .mul_mat_q    = */ mmq,
            /* .tensor_split = */ ts,
        };
        instances.push_back(instance);
    }
    return instances;
 }
 static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
    std::vector<cmd_params_instance> instances;
 #if 1
    // this ordering minimizes the number of times that each model needs to be reloaded
    for (const auto & m : params.model)
    for (const auto & nl : params.n_gpu_layers)
    for (const auto & sm : params.split_mode)
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & mmp : params.use_mmap)
    for (const auto & embd : params.embeddings)
    for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
+    for (const auto & tk : params.type_k)
-    for (const auto & mmq : params.mul_mat_q)
+    for (const auto & tv : params.type_v)
    for (const auto & nkvo : params.no_kv_offload)
    for (const auto & nt : params.n_threads) {
        for (const auto & n_prompt : params.n_prompt) {
            if (n_prompt == 0) {
@ -422,12 +545,16 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_prompt     = */ n_prompt,
                /* .n_gen        = */ 0,
                /* .n_batch      = */ nb,
-                /* .f32_kv       = */ fk,
+                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
-                /* .mul_mat_q    = */ mmq,
+                /* .no_kv_offload= */ nkvo,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
            };
            instances.push_back(instance);
        }
@ -441,34 +568,20 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_prompt     = */ 0,
                /* .n_gen        = */ n_gen,
                /* .n_batch      = */ nb,
-                /* .f32_kv       = */ fk,
+                /* .type_k       = */ tk,
                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
-                /* .mul_mat_q    = */ mmq,
+                /* .no_kv_offload= */ nkvo,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
            };
            instances.push_back(instance);
        }
    }
 #else
    // this ordering separates the prompt and generation tests
    for (const auto & n_prompt : params.n_prompt) {
        if (n_prompt == 0) {
            continue;
        }
        auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt);
        instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end());
    }
    for (const auto & n_gen : params.n_gen) {
        if (n_gen == 0) {
            continue;
        }
        auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
        instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
    }
 #endif
    return instances;
 }
@ -478,7 +591,10 @@ struct test {
    static const int build_number;
    static const bool cuda;
    static const bool opencl;
    static const bool vulkan;
    static const bool kompute;
    static const bool metal;
    static const bool sycl;
    static const bool gpu_blas;
    static const bool blas;
    static const std::string cpu_info;
@ -489,11 +605,15 @@ struct test {
    uint64_t model_n_params;
    int n_batch;
    int n_threads;
-    bool f32_kv;
+    ggml_type type_k;
    ggml_type type_v;
    int n_gpu_layers;
    llama_split_mode split_mode;
    int main_gpu;
-    bool mul_mat_q;
+    bool no_kv_offload;
-    std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+    std::vector<float> tensor_split;
    bool use_mmap;
    bool embeddings;
    int n_prompt;
    int n_gen;
    std::string test_time;
@ -508,11 +628,15 @@ struct test {
        model_n_params = llama_model_n_params(lmodel);
        n_batch = inst.n_batch;
        n_threads = inst.n_threads;
-        f32_kv = inst.f32_kv;
+        type_k = inst.type_k;
        type_v = inst.type_v;
        n_gpu_layers = inst.n_gpu_layers;
        split_mode = inst.split_mode;
        main_gpu = inst.main_gpu;
-        mul_mat_q = inst.mul_mat_q;
+        no_kv_offload = inst.no_kv_offload;
        tensor_split = inst.tensor_split;
        use_mmap = inst.use_mmap;
        embeddings = inst.embeddings;
        n_prompt = inst.n_prompt;
        n_gen = inst.n_gen;
        // RFC 3339 date-time format
@ -553,26 +677,38 @@ struct test {
        if (opencl) {
            return "OpenCL";
        }
        if (vulkan) {
            return "Vulkan";
        }
        if (kompute) {
            return "Kompute";
        }
        if (metal) {
            return "Metal";
        }
        if (sycl) {
            return GGML_SYCL_NAME;
        }
        if (gpu_blas) {
            return "GPU BLAS";
        }
        if (blas) {
            return "BLAS";
        }
        return "CPU";
    }
    static const std::vector<std::string> & get_fields() {
        static const std::vector<std::string> fields = {
            "build_commit", "build_number",
-            "cuda", "opencl", "metal", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
-            "n_batch", "n_threads", "f16_kv",
+            "n_batch", "n_threads", "type_k", "type_v",
-            "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
+            "n_gpu_layers", "split_mode",
            "main_gpu", "no_kv_offload",
            "tensor_split", "use_mmap", "embeddings",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
            "avg_ts", "stddev_ts"
@ -590,8 +726,9 @@ struct test {
            field == "avg_ns" || field == "stddev_ns") {
            return INT;
        }
-        if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
+        if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
-            field == "f16_kv" || field == "mul_mat_q") {
+            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
            field == "use_mmap" || field == "embeddings") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@ -603,7 +740,7 @@ struct test {
    std::vector<std::string> get_values() const {
        std::string tensor_split_str;
        int max_nonzero = 0;
-        for (int i = 0; i < LLAMA_MAX_DEVICES; i++) {
+        for (size_t i = 0; i < llama_max_devices(); i++) {
            if (tensor_split[i] > 0) {
                max_nonzero = i;
            }
@ -618,11 +755,14 @@ struct test {
        }
        std::vector<std::string> values = {
            build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
+            // Order must match get_fields(): "cuda", "opencl", "vulkan", "kompute", ...
+            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(kompute),
            std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
-            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
+            std::to_string(n_gpu_layers), split_mode_str(split_mode),
            std::to_string(main_gpu), std::to_string(no_kv_offload),
            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
            std::to_string(avg_ts()), std::to_string(stdev_ts())
@ -644,9 +784,12 @@ const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
 const bool        test::cuda         = !!ggml_cpu_has_cublas();
 const bool        test::opencl       = !!ggml_cpu_has_clblast();
 const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
 const bool        test::kompute      = !!ggml_cpu_has_kompute();
 const bool        test::metal        = !!ggml_cpu_has_metal();
 const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
 const bool        test::blas         = !!ggml_cpu_has_blas();
 const bool        test::sycl         = !!ggml_cpu_has_sycl();
 const std::string test::cpu_info     = get_cpu_info();
 const std::string test::gpu_info     = get_gpu_info();
@ -777,11 +920,20 @@ struct markdown_printer : public printer {
        if (field == "n_gpu_layers") {
            return "ngl";
        }
        if (field == "split_mode") {
            return "sm";
        }
        if (field == "n_threads") {
            return "threads";
        }
-        if (field == "mul_mat_q") {
+        if (field == "no_kv_offload") {
-            return "mmq";
+            return "nkvo";
        }
        if (field == "use_mmap") {
            return "mmap";
        }
        if (field == "embeddings") {
            return "embd";
        }
        if (field == "tensor_split") {
            return "ts";
@ -791,34 +943,46 @@ struct markdown_printer : public printer {
    void print_header(const cmd_params & params) override {
        // select fields to print
-        fields.push_back("model");
+        fields.emplace_back("model");
-        fields.push_back("size");
+        fields.emplace_back("size");
-        fields.push_back("params");
+        fields.emplace_back("params");
-        fields.push_back("backend");
+        fields.emplace_back("backend");
        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
        if (!is_cpu_backend) {
-            fields.push_back("n_gpu_layers");
+            fields.emplace_back("n_gpu_layers");
        }
        if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
-            fields.push_back("n_threads");
+            fields.emplace_back("n_threads");
        }
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
-            fields.push_back("n_batch");
+            fields.emplace_back("n_batch");
        }
-        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
+        if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
-            fields.push_back("f16_kv");
+            fields.emplace_back("type_k");
        }
        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
            fields.emplace_back("type_v");
        }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
-            fields.push_back("main_gpu");
+            fields.emplace_back("main_gpu");
        }
-        if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
+        if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
-            fields.push_back("mul_mat_q");
+            fields.emplace_back("split_mode");
        }
        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
            fields.emplace_back("no_kv_offload");
        }
        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
-            fields.push_back("tensor_split");
+            fields.emplace_back("tensor_split");
        }
-        fields.push_back("test");
+        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
-        fields.push_back("t/s");
+            fields.emplace_back("use_mmap");
        }
        if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
            fields.emplace_back("embeddings");
        }
        fields.emplace_back("test");
        fields.emplace_back("t/s");
        fprintf(fout, "|");
        for (const auto & field : fields) {
@ -982,8 +1146,7 @@ int main(int argc, char ** argv) {
    if (!params.verbose) {
        llama_log_set(llama_null_log_callback, NULL);
    }
-    bool numa = false;
+    llama_backend_init();
    llama_backend_init(numa);
    // initialize printer
    std::unique_ptr<printer> p;
--- a/examples/llama.android/.gitignore
+++ b/examples/llama.android/.gitignore
@ -0,0 +1,33 @@
 # Gradle files
 .gradle/
 build/
 # Local configuration file (sdk path, etc)
 local.properties
 # Log/OS Files
 *.log
 # Android Studio generated files and folders
 captures/
 .externalNativeBuild/
 .cxx/
 *.apk
 output.json
 # IntelliJ
 *.iml
 .idea/
 misc.xml
 deploymentTargetDropDown.xml
 render.experimental.xml
 # Keystore files
 *.jks
 *.keystore
 # Google Services (e.g. APIs or Firebase)
 google-services.json
 # Android Profiling
 *.hprof
--- a/examples/llama.android/README.md
+++ b/examples/llama.android/README.md
--- a/examples/llama.android/app/.gitignore
+++ b/examples/llama.android/app/.gitignore
@ -0,0 +1 @@
 /build
--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@ -0,0 +1,88 @@
 // Gradle plugins applied to this module: the Android application plugin and the
 // Kotlin Android plugin.
 plugins {
    id("com.android.application")
    id("org.jetbrains.kotlin.android")
 }
 // Android build configuration for the sample app: SDK/NDK versions, the native (CMake)
 // build, Java/Kotlin targets and Jetpack Compose settings.
 android {
    namespace = "com.example.llama"
    compileSdk = 34
    ndkVersion = "26.1.10909125"
    defaultConfig {
        applicationId = "com.example.llama"
        minSdk = 33
        targetSdk = 34
        versionCode = 1
        versionName = "1.0"
        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
        vectorDrawables {
            useSupportLibrary = true
        }
        ndk {
            // Add NDK properties if wanted, e.g.
            // abiFilters += listOf("arm64-v8a")
        }
        externalNativeBuild {
            cmake {
                // Declared in defaultConfig, so the Release CMake build type is requested
                // for every build variant of the app.
                arguments += "-DCMAKE_BUILD_TYPE=Release"
                // The two empty lists below add nothing; they are placeholders for
                // extra compiler flags / CMake arguments.
                cppFlags += listOf()
                arguments += listOf()
            }
        }
    }
    buildTypes {
        release {
            // Code shrinking is disabled; the ProGuard files are listed but only take
            // effect if minification is turned on.
            isMinifyEnabled = false
            proguardFiles(
                getDefaultProguardFile("proguard-android-optimize.txt"),
                "proguard-rules.pro"
            )
        }
    }
    compileOptions {
        sourceCompatibility = JavaVersion.VERSION_1_8
        targetCompatibility = JavaVersion.VERSION_1_8
    }
    kotlinOptions {
        jvmTarget = "1.8"
    }
    buildFeatures {
        compose = true
    }
    composeOptions {
        kotlinCompilerExtensionVersion = "1.5.1"
    }
    packaging {
        resources {
            excludes += "/META-INF/{AL2.0,LGPL2.1}"
        }
    }
    // Points Gradle at the CMake script that builds the native library.
    externalNativeBuild {
        cmake {
            path = file("src/main/cpp/CMakeLists.txt")
            version = "3.22.1"
        }
    }
 }
 // Library dependencies. The Compose artifacts are declared without versions; the
 // compose-bom platform entries supply them.
 dependencies {
    implementation("androidx.core:core-ktx:1.12.0")
    implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
    implementation("androidx.activity:activity-compose:1.8.2")
    implementation(platform("androidx.compose:compose-bom:2023.08.00"))
    implementation("androidx.compose.ui:ui")
    implementation("androidx.compose.ui:ui-graphics")
    implementation("androidx.compose.ui:ui-tooling-preview")
    implementation("androidx.compose.material3:material3")
    // Local unit tests.
    testImplementation("junit:junit:4.13.2")
    // Instrumented (on-device) tests.
    androidTestImplementation("androidx.test.ext:junit:1.1.5")
    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
    androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
    androidTestImplementation("androidx.compose.ui:ui-test-junit4")
    // Debug-build-only tooling.
    debugImplementation("androidx.compose.ui:ui-tooling")
    debugImplementation("androidx.compose.ui:ui-test-manifest")
 }
--- a/examples/llama.android/app/proguard-rules.pro
+++ b/examples/llama.android/app/proguard-rules.pro
@ -0,0 +1,21 @@
 # Add project specific ProGuard rules here.
 # You can control the set of applied configuration files using the
 # proguardFiles setting in build.gradle.
 #
 # For more details, see
 #   http://developer.android.com/guide/developing/tools/proguard.html
 # If your project uses WebView with JS, uncomment the following
 # and specify the fully qualified class name to the JavaScript interface
 # class:
 #-keepclassmembers class fqcn.of.javascript.interface.for.webview {
 #   public *;
 #}
 # Uncomment this to preserve the line number information for
 # debugging stack traces.
 #-keepattributes SourceFile,LineNumberTable
 # If you keep the line number information, uncomment this to
 # hide the original source file name.
 #-renamesourcefileattribute SourceFile
--- a/examples/llama.android/app/src/main/AndroidManifest.xml
+++ b/examples/llama.android/app/src/main/AndroidManifest.xml
@ -0,0 +1,30 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
    xmlns:tools="http://schemas.android.com/tools">
    <uses-permission android:name="android.permission.INTERNET" />
    <application
        android:allowBackup="true"
        android:dataExtractionRules="@xml/data_extraction_rules"
        android:fullBackupContent="@xml/backup_rules"
        android:icon="@mipmap/ic_launcher"
        android:label="@string/app_name"
        android:roundIcon="@mipmap/ic_launcher_round"
        android:supportsRtl="true"
        android:theme="@style/Theme.LlamaAndroid"
        >
        <activity
            android:name=".MainActivity"
            android:exported="true"
            android:theme="@style/Theme.LlamaAndroid">
            <intent-filter>
                <action android:name="android.intent.action.MAIN" />
                <category android:name="android.intent.category.LAUNCHER" />
            </intent-filter>
        </activity>
    </application>
 </manifest>
--- a/examples/llama.android/app/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/app/src/main/cpp/CMakeLists.txt
@ -0,0 +1,50 @@
 # For more information about using CMake with Android Studio, read the
 # documentation: https://d.android.com/studio/projects/add-native-code.html.
 # For more examples on how to use CMake, see https://github.com/android/ndk-samples.
 # Sets the minimum CMake version required for this project.
 cmake_minimum_required(VERSION 3.22.1)
 # Declares the project name. The project name can be accessed via ${PROJECT_NAME}.
 # Since this is the top level CMakeLists.txt, the project name is also accessible
 # with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
 # build script scope).
 project("llama-android")
 include(FetchContent)
 # Declares llama.cpp as an external project to be fetched from GitHub at configure time.
 # NOTE(review): GIT_TAG is the moving "master" branch, so the fetched sources are not
 # pinned to a fixed revision — consider a tag or commit hash for reproducible builds.
 FetchContent_Declare(
        llama
        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
        GIT_TAG        master
 )
 # Also provides "common"
 FetchContent_MakeAvailable(llama)
 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
 # You can define multiple libraries, and CMake builds them for you.
 # Gradle automatically packages shared libraries with your APK.
 #
 # In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
 # the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
 # is preferred for the same purpose.
 #
 # In order to load a library into your app from Java/Kotlin, you must call
 # System.loadLibrary() and pass the name of the library defined here;
 # for GameActivity/NativeActivity derived applications, the same library name must be
 # used in the AndroidManifest.xml file.
 add_library(${CMAKE_PROJECT_NAME} SHARED
    # List C/C++ source files with relative paths to this CMakeLists.txt.
    llama-android.cpp)
 # Specifies libraries CMake should link to your target library. You
 # can link libraries from various origins, such as libraries defined in this
 # build script, prebuilt third-party libraries, or Android system libraries.
 target_link_libraries(${CMAKE_PROJECT_NAME}
    # List the libraries to link to the target library.
    llama
    common
    android
    log)
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp
@ -0,0 +1,394 @@
 #include <android/log.h>
 #include <jni.h>
 #include <iomanip>
 #include <math.h>
 #include <string>
 #include <unistd.h>
 #include "llama.h"
 #include "common/common.h"
 // Write C++ code here.
 //
 // Do not forget to dynamically load the C++ library into your application.
 //
 // For instance,
 //
 // In MainActivity.java:
 //    static {
 //       System.loadLibrary("llama-android");
 //    }
 //
 // Or, in MainActivity.kt:
 //    companion object {
 //      init {
 //         System.loadLibrary("llama-android")
 //      }
 //    }
 #define TAG "llama-android.cpp"
 #define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
 #define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)
 jclass la_int_var;
 jmethodID la_int_var_value;
 jmethodID la_int_var_inc;
 // Log sink installed via llama_log_set(): forwards one log message to the Android log
 // under TAG, mapping the ggml log level to the matching Android priority.
 //
 //   level: severity reported by the library
 //   fmt:   the message text to log
 //   data:  opaque user pointer supplied at registration (unused here)
 //
 // The message is printed through a fixed "%s" format. Passing it as the format string
 // itself (with `data` as the only argument) is undefined behaviour as soon as the text
 // contains a '%' conversion.
 static void log_callback(ggml_log_level level, const char * fmt, void * data) {
    (void) data;
    int priority = ANDROID_LOG_DEFAULT;
    if (level == GGML_LOG_LEVEL_ERROR) {
        priority = ANDROID_LOG_ERROR;
    } else if (level == GGML_LOG_LEVEL_INFO) {
        priority = ANDROID_LOG_INFO;
    } else if (level == GGML_LOG_LEVEL_WARN) {
        priority = ANDROID_LOG_WARN;
    }
    __android_log_print(priority, TAG, "%s", fmt);
 }
 // JNI: loads a model from the given file path with default model parameters.
 // Returns the native llama_model pointer as a jlong; on failure throws
 // IllegalStateException on the Java side and returns 0.
 extern "C"
 JNIEXPORT jlong JNICALL
 Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
    llama_model_params default_params = llama_model_default_params();
    const char * model_path = env->GetStringUTFChars(filename, nullptr);
    LOGi("Loading model from %s", model_path);
    llama_model * loaded = llama_load_model_from_file(model_path, default_params);
    // The UTF-8 copy of the path is only needed for the load call itself.
    env->ReleaseStringUTFChars(filename, model_path);
    if (loaded != nullptr) {
        return reinterpret_cast<jlong>(loaded);
    }
    LOGe("load_model() failed");
    jclass illegal_state = env->FindClass("java/lang/IllegalStateException");
    env->ThrowNew(illegal_state, "load_model() failed");
    return 0;
 }
 // JNI: releases a model previously returned to Java as a jlong handle.
 extern "C"
 JNIEXPORT void JNICALL
 Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
    llama_model * handle = reinterpret_cast<llama_model *>(model);
    llama_free_model(handle);
 }
 // JNI: creates an inference context for the given model handle.
 // Uses a fixed seed (1234), a 2048-token context and a thread count derived from the
 // number of online processors. Returns the llama_context pointer as a jlong; throws
 // IllegalArgumentException for a null model and IllegalStateException if context
 // creation fails (returning 0 in both cases).
 extern "C"
 JNIEXPORT jlong JNICALL
 Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
    llama_model * model_handle = reinterpret_cast<llama_model *>(jmodel);
    if (model_handle == nullptr) {
        LOGe("new_context(): model cannot be null");
        jclass illegal_argument = env->FindClass("java/lang/IllegalArgumentException");
        env->ThrowNew(illegal_argument, "Model cannot be null");
        return 0;
    }
    // Leave two processors free, then clamp the worker count to the range [1, 8].
    int n_threads = (int) sysconf(_SC_NPROCESSORS_ONLN) - 2;
    if (n_threads > 8) {
        n_threads = 8;
    }
    if (n_threads < 1) {
        n_threads = 1;
    }
    LOGi("Using %d threads", n_threads);
    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_threads_batch = n_threads;
    ctx_params.n_threads       = n_threads;
    ctx_params.n_ctx = 2048;
    ctx_params.seed  = 1234;
    llama_context * created = llama_new_context_with_model(model_handle, ctx_params);
    if (created != nullptr) {
        return reinterpret_cast<jlong>(created);
    }
    LOGe("llama_new_context_with_model() returned null)");
    env->ThrowNew(env->FindClass("java/lang/IllegalStateException"),
                  "llama_new_context_with_model() returned null)");
    return 0;
 }
 // JNI: releases a context previously returned to Java as a jlong handle.
 extern "C"
 JNIEXPORT void JNICALL
 Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
    llama_context * handle = reinterpret_cast<llama_context *>(context);
    llama_free(handle);
 }
 // JNI: shuts down the llama backend by calling llama_backend_free().
 extern "C"
 JNIEXPORT void JNICALL
 Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
    llama_backend_free();
 }
 // JNI: installs log_callback as the llama log sink (with no user data), so library
 // log messages are written to the Android log.
 extern "C"
 JNIEXPORT void JNICALL
 Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
    llama_log_set(log_callback, NULL);
 }
 // JNI: benchmarks prompt processing (pp) and text generation (tg) on an existing
 // context and batch, and returns the result as a Markdown table (mean ± sample
 // standard deviation, in tokens per second).
 //
 //   context_pointer / model_pointer / batch_pointer:
 //       native handles, cast back to llama_context* / llama_model* / llama_batch*
 //   pp: number of prompt tokens decoded in a single batch
 //   tg: number of generation steps
 //   pl: number of parallel sequences decoded per generation step
 //   nr: number of repetitions to average over
 //
 // NOTE(review): the batch is assumed to have been allocated with room for at least
 // max(pp, pl) tokens — nothing here can verify that; confirm against the caller.
 extern "C"
 JNIEXPORT jstring JNICALL
 Java_com_example_llama_Llm_bench_1model(
        JNIEnv *env,
        jobject,
        jlong context_pointer,
        jlong model_pointer,
        jlong batch_pointer,
        jint pp,
        jint tg,
        jint pl,
        jint nr
        ) {
    // Running sums of the per-run speeds and of their squares; turned into the mean
    // and standard deviation after the loop.
    auto pp_avg = 0.0;
    auto tg_avg = 0.0;
    auto pp_std = 0.0;
    auto tg_std = 0.0;
    const auto context = reinterpret_cast<llama_context *>(context_pointer);
    const auto model = reinterpret_cast<llama_model *>(model_pointer);
    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
    const int n_ctx = llama_n_ctx(context);
    LOGi("n_ctx = %d", n_ctx);
    for (int nri = 0; nri < nr; nri++) {
        LOGi("Benchmark prompt processing (pp)");
        llama_batch_clear(*batch);
        const int n_tokens = pp;
        for (int i = 0; i < n_tokens; i++) {
            llama_batch_add(*batch, 0, i, { 0 }, false);
        }
        // Request logits for the last prompt token only. With pp <= 0 the batch is
        // empty and index n_tokens - 1 would be an out-of-bounds write.
        if (batch->n_tokens > 0) {
            batch->logits[batch->n_tokens - 1] = true;
        }
        llama_kv_cache_clear(context);
        const auto t_pp_start = ggml_time_us();
        if (llama_decode(context, *batch) != 0) {
            LOGe("llama_decode() failed during prompt processing");
        }
        const auto t_pp_end = ggml_time_us();
        // bench text generation
        LOGi("Benchmark text generation (tg)");
        llama_kv_cache_clear(context);
        const auto t_tg_start = ggml_time_us();
        for (int i = 0; i < tg; i++) {
            llama_batch_clear(*batch);
            // One token per parallel sequence, all at position i.
            for (int j = 0; j < pl; j++) {
                llama_batch_add(*batch, 0, i, { j }, true);
            }
            LOGi("llama_decode() text generation: %d", i);
            if (llama_decode(context, *batch) != 0) {
                LOGe("llama_decode() failed during text generation");
            }
        }
        const auto t_tg_end = ggml_time_us();
        llama_kv_cache_clear(context);
        // ggml_time_us() values are in microseconds; convert to seconds.
        const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
        const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
        const auto speed_pp = double(pp) / t_pp;
        const auto speed_tg = double(pl * tg) / t_tg;
        pp_avg += speed_pp;
        tg_avg += speed_tg;
        pp_std += speed_pp * speed_pp;
        tg_std += speed_tg * speed_tg;
        LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg);
    }
    // With no repetitions there is nothing to average; keep the zero sums instead of
    // dividing by zero.
    if (nr > 0) {
        pp_avg /= double(nr);
        tg_avg /= double(nr);
    }
    if (nr > 1) {
        // Sample variance from the sums; clamp at zero because rounding can push it
        // slightly negative, which would make sqrt() return NaN.
        const double pp_var = pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1);
        const double tg_var = tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1);
        pp_std = sqrt(fmax(0.0, pp_var));
        tg_std = sqrt(fmax(0.0, tg_var));
    } else {
        pp_std = 0;
        tg_std = 0;
    }
    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
    const auto model_size     = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0;
    const auto model_n_params = double(llama_model_n_params(model)) / 1e9;
    const auto backend    = "(Android)"; // TODO: What should this be?
    std::stringstream result;
    result << std::setprecision(2);
    result << "| model | size | params | backend | test | t/s |\n";
    result << "| --- | --- | --- | --- | --- | --- |\n";
    result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n";
    result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n";
    return env->NewStringUTF(result.str().c_str());
 }
 // Releases a batch previously returned by new_batch().
 //
 // batch_pointer is the jlong handle produced by new_batch(); a 0 handle is
 // ignored so that a failed allocation can be "freed" safely.
 extern "C"
 JNIEXPORT void JNICALL
 Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
    auto *batch = reinterpret_cast<llama_batch *>(batch_pointer);
    if (batch == nullptr) {
        return;
    }
    // Frees the arrays owned by the batch.
    llama_batch_free(*batch);
    // new_batch() allocates the struct itself with `new`, so it has to be
    // deleted here as well; otherwise every load/unload cycle leaks it.
    delete batch;
 }
 // Allocates a heap-owned llama_batch with room for n_tokens entries.
 //
 // n_tokens   capacity of the batch (number of token slots)
 // embd       when non-zero, allocate an embeddings buffer of n_tokens * embd
 //            floats instead of a token-id buffer
 // n_seq_max  number of sequence ids that can be attached to each slot
 //
 // Returns the batch pointer as a jlong handle, or 0 when the arguments are
 // invalid or an allocation fails (the Kotlin caller treats 0 as failure).
 extern "C"
 JNIEXPORT jlong JNICALL
 Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
    // Negative counts would wrap to huge values once multiplied by sizeof().
    if (n_tokens < 0 || embd < 0 || n_seq_max < 0) {
        return 0;
    }
    const auto n_slots = static_cast<size_t>(n_tokens);
    // Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
    llama_batch *batch = new llama_batch {
        0,
        nullptr,
        nullptr,
        nullptr,
        nullptr,
        nullptr,
        nullptr,
        0,
        0,
        0,
    };
    if (embd) {
        batch->embd = (float *) malloc(sizeof(float) * n_slots * embd);
    } else {
        batch->token = (llama_token *) malloc(sizeof(llama_token) * n_slots);
    }
    batch->pos      = (llama_pos *)     malloc(sizeof(llama_pos) * n_slots);
    batch->n_seq_id = (int32_t *)       malloc(sizeof(int32_t)   * n_slots);
    batch->logits   = (int8_t *)        malloc(sizeof(int8_t)    * n_slots);
    // One extra, zero-initialised slot keeps the pointer table null-terminated.
    // NOTE(review): llama_batch_free() presumably walks seq_id until a nullptr
    // sentinel in recent llama.cpp versions - confirm; the extra slot is
    // harmless if it iterates by count instead.
    batch->seq_id   = (llama_seq_id **) calloc(n_slots + 1, sizeof(llama_seq_id *));

    bool ok = batch->seq_id != nullptr;
    if (ok && n_tokens > 0) {
        // malloc(0) may legitimately return nullptr, so only check real requests.
        const bool payload_ok = embd ? batch->embd != nullptr : batch->token != nullptr;
        ok = payload_ok
            && batch->pos      != nullptr
            && batch->n_seq_id != nullptr
            && batch->logits   != nullptr;
    }
    if (ok) {
        for (int i = 0; i < n_tokens; ++i) {
            batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
            if (batch->seq_id[i] == nullptr && n_seq_max > 0) {
                ok = false;
                break;
            }
        }
    }
    if (!ok) {
        // Roll back every partial allocation; free(nullptr) is a no-op.
        if (batch->seq_id != nullptr) {
            for (size_t i = 0; i < n_slots; ++i) {
                free(batch->seq_id[i]);
            }
            free(batch->seq_id);
        }
        free(batch->token);
        free(batch->embd);
        free(batch->pos);
        free(batch->n_seq_id);
        free(batch->logits);
        delete batch;
        return 0;
    }
    return reinterpret_cast<jlong>(batch);
 }
 // JNI entry point for Llm.backend_init(numa: Boolean).
 //
 // The Kotlin declaration passes a boolean, so the native signature accepts
 // it to keep both sides of the JNI contract in agreement. The value is not
 // used because llama_backend_init() is called without arguments here.
 extern "C"
 JNIEXPORT void JNICALL
 Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean /* numa */) {
    llama_backend_init();
 }
 // Returns the text produced by llama_print_system_info() as a Java string.
 extern "C"
 JNIEXPORT jstring JNICALL
 Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
    const auto info = llama_print_system_info();
    return env->NewStringUTF(info);
 }
 // Tokenizes the prompt, fills the batch with it and runs the first decode.
 //
 // context_pointer  handle of the llama_context to decode with
 // batch_pointer    handle of the llama_batch created by new_batch()
 // jtext            prompt text
 // n_len            requested total length, only used for the KV-size check
 //
 // Returns the number of tokens placed in the batch (0 when the prompt could
 // not be read or produced no tokens). Decode errors are logged, not thrown.
 extern "C"
 JNIEXPORT jint JNICALL
 Java_com_example_llama_Llm_completion_1init(
        JNIEnv *env,
        jobject,
        jlong context_pointer,
        jlong batch_pointer,
        jstring jtext,
        jint n_len
    ) {
    const auto context = reinterpret_cast<llama_context *>(context_pointer);
    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
    const auto text = env->GetStringUTFChars(jtext, 0);
    if (text == nullptr) {
        // JNI leaves an exception pending in this case; it surfaces in the
        // caller as soon as this native method returns.
        LOGe("GetStringUTFChars() failed");
        return 0;
    }
    const auto tokens_list = llama_tokenize(context, text, 1);
    // Only the token ids are needed from here on, so release the UTF chars
    // now instead of on every exit path.
    env->ReleaseStringUTFChars(jtext, text);
    auto n_ctx = llama_n_ctx(context);
    auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
    // Cast explicitly: the values are not plain ints and %d expects one.
    LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d",
         static_cast<int>(n_len), static_cast<int>(n_ctx), static_cast<int>(n_kv_req));
    if (n_kv_req > n_ctx) {
        LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough");
    }
    for (auto id : tokens_list) {
        LOGi("%s", llama_token_to_piece(context, id).c_str());
    }
    llama_batch_clear(*batch);
    if (tokens_list.empty()) {
        // Nothing to decode, and logits[n_tokens - 1] below would index
        // before the start of the array.
        LOGe("completion_init(): prompt produced no tokens");
        return batch->n_tokens;
    }
    // evaluate the initial prompt
    // NOTE(review): the batch capacity chosen in new_batch() is not known
    // here; a prompt with more tokens than that capacity is not guarded
    // against - confirm the caller bounds the prompt length.
    const int n_prompt = static_cast<int>(tokens_list.size());
    for (int i = 0; i < n_prompt; i++) {
        llama_batch_add(*batch, tokens_list[i], i, { 0 }, false);
    }
    // llama_decode will output logits only for the last token of the prompt
    batch->logits[batch->n_tokens - 1] = true;
    if (llama_decode(context, *batch) != 0) {
        LOGe("llama_decode() failed");
    }
    return batch->n_tokens;
 }
 // Samples one token greedily from the logits of the last decode, feeds it
 // back into the model and returns its text.
 //
 // context_pointer  handle of the llama_context
 // batch_pointer    handle of the llama_batch reused for the single new token
 // n_len            generation stops once the counter equals this value
 // intvar_ncur      Kotlin IntVar holding the current position; read through
 //                  getValue() and advanced through inc()
 //
 // Returns the token text, or an empty string on end-of-sequence / length
 // limit. Returns nullptr only when the IntVar accessors cannot be resolved,
 // in which case a Java exception is already pending.
 extern "C"
 JNIEXPORT jstring JNICALL
 Java_com_example_llama_Llm_completion_1loop(
        JNIEnv * env,
        jobject,
        jlong context_pointer,
        jlong batch_pointer,
        jint n_len,
        jobject intvar_ncur
 ) {
    const auto context = reinterpret_cast<llama_context *>(context_pointer);
    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
    const auto model = llama_get_model(context);
    // Resolve the IntVar accessors once and cache the method ids. The class
    // reference from GetObjectClass() is a JNI local reference that is only
    // valid during this call, so it is re-fetched whenever a lookup is still
    // outstanding instead of being reused from an earlier invocation.
    if (!la_int_var_value || !la_int_var_inc) {
        la_int_var = env->GetObjectClass(intvar_ncur);
        if (!la_int_var_value) {
            la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
        }
        // Do not issue another JNI call if the first lookup already failed.
        if (la_int_var_value && !la_int_var_inc) {
            la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
        }
        if (!la_int_var_value || !la_int_var_inc) {
            LOGe("completion_loop(): failed to resolve IntVar accessors");
            return nullptr;
        }
    }
    auto n_vocab = llama_n_vocab(model);
    auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
    // sample the most likely token
    const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
    if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
        return env->NewStringUTF("");
    }
    auto new_token_chars = llama_token_to_piece(context, new_token_id);
    LOGi("new_token_chars: `%s`", new_token_chars.c_str());
    // NOTE(review): NewStringUTF expects well-formed (modified) UTF-8; if a
    // token piece can end in the middle of a multi-byte character this may
    // need buffering across calls - confirm.
    auto new_token = env->NewStringUTF(new_token_chars.c_str());
    llama_batch_clear(*batch);
    llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
    env->CallVoidMethod(intvar_ncur, la_int_var_inc);
    if (llama_decode(context, *batch) != 0) {
        // llama_decode() reports failure through a non-zero status code.
        LOGe("llama_decode() failed");
    }
    return new_token;
 }
 // Clears the KV cache of the context identified by the given handle.
 extern "C"
 JNIEXPORT void JNICALL
 Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
    const auto ctx = reinterpret_cast<llama_context *>(context);
    llama_kv_cache_clear(ctx);
 }
--- a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt
@ -0,0 +1,119 @@
 package com.example.llama
 import android.app.DownloadManager
 import android.net.Uri
 import android.util.Log
 import androidx.compose.material3.Button
 import androidx.compose.material3.Text
 import androidx.compose.runtime.Composable
 import androidx.compose.runtime.getValue
 import androidx.compose.runtime.mutableDoubleStateOf
 import androidx.compose.runtime.mutableStateOf
 import androidx.compose.runtime.remember
 import androidx.compose.runtime.rememberCoroutineScope
 import androidx.compose.runtime.setValue
 import androidx.core.database.getLongOrNull
 import androidx.core.net.toUri
 import kotlinx.coroutines.delay
 import kotlinx.coroutines.launch
 import java.io.File
 /**
  * A model file that can be fetched through the system [DownloadManager].
  *
  * @property name display name shown on the button and in log messages
  * @property source remote location the file is downloaded from
  * @property destination local file the download is written to
  */
 data class Downloadable(val name: String, val source: Uri, val destination: File) {
    companion object {
        @JvmStatic
        private val tag: String? = this::class.qualifiedName
        /** UI state of a single download button. */
        sealed interface State
        data object Ready: State
        data class Downloading(val id: Long): State
        data class Downloaded(val downloadable: Downloadable): State
        data class Error(val message: String): State
        /**
         * Button that downloads [item] on first click and loads it into
         * [viewModel] once the file is present.
         */
        @JvmStatic
        @Composable
        fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) {
            var status: State by remember {
                mutableStateOf(
                    if (item.destination.exists()) Downloaded(item)
                    else Ready
                )
            }
            var progress by remember { mutableDoubleStateOf(0.0) }
            val coroutineScope = rememberCoroutineScope()
            /**
             * Polls the download identified by [result] once per second until
             * it finishes, fails or disappears, updating `progress` meanwhile.
             */
            suspend fun waitForDownload(result: Downloading, item: Downloadable): State {
                while (true) {
                    val cursor = dm.query(DownloadManager.Query().setFilterById(result.id))
                    if (cursor == null) {
                        Log.e(tag, "dm.query() returned null")
                        return Error("dm.query() returned null")
                    }
                    // `use` closes the cursor on every exit path, including
                    // the early returns below and any exception.
                    cursor.use { row ->
                        if (!row.moveToFirst() || row.count < 1) {
                            Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?")
                            return Ready
                        }
                        val six = row.getColumnIndex(DownloadManager.COLUMN_STATUS)
                        val dlStatus = if (six >= 0) row.getInt(six) else -1
                        if (dlStatus == DownloadManager.STATUS_FAILED) {
                            // Without this check a failed download would be
                            // polled forever because the byte counts never match.
                            val rix = row.getColumnIndex(DownloadManager.COLUMN_REASON)
                            val reason = if (rix >= 0) row.getInt(rix) else -1
                            Log.e(tag, "download failed, reason=$reason")
                            return Error("download failed, reason=$reason")
                        }
                        if (dlStatus == DownloadManager.STATUS_SUCCESSFUL) {
                            return Downloaded(item)
                        }
                        val pix = row.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR)
                        val tix = row.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES)
                        val sofar = row.getLongOrNull(pix) ?: 0L
                        val total = row.getLongOrNull(tix) ?: 1L
                        if (sofar == total) {
                            return Downloaded(item)
                        }
                        // The total may be unknown (non-positive) early on;
                        // keep the last progress value in that case.
                        if (total > 0) {
                            progress = (sofar * 1.0) / total
                        }
                    }
                    delay(1000L)
                }
            }
            fun onClick() {
                when (val s = status) {
                    is Downloaded -> {
                        viewModel.load(item.destination.path)
                    }
                    is Downloading -> {
                        coroutineScope.launch {
                            status = waitForDownload(s, item)
                        }
                    }
                    else -> {
                        // Ready or Error: (re)start the download from scratch.
                        item.destination.delete()
                        val request = DownloadManager.Request(item.source).apply {
                            setTitle("Downloading model")
                            setDescription("Downloading model: ${item.name}")
                            setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI)
                            setDestinationUri(item.destination.toUri())
                        }
                        viewModel.log("Saving ${item.name} to ${item.destination.path}")
                        Log.i(tag, "Saving ${item.name} to ${item.destination.path}")
                        val id = dm.enqueue(request)
                        status = Downloading(id)
                        // Re-enter with the new state to start polling.
                        onClick()
                    }
                }
            }
            Button(onClick = { onClick() }, enabled = status !is Downloading) {
                when (status) {
                    is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%")
                    is Downloaded -> Text("Load ${item.name}")
                    is Ready -> Text("Download ${item.name}")
                    is Error -> Text("Download ${item.name}")
                }
            }
        }
    }
 }
--- a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
@ -0,0 +1,172 @@
 package com.example.llama
 import android.util.Log
 import kotlinx.coroutines.CoroutineDispatcher
 import kotlinx.coroutines.asCoroutineDispatcher
 import kotlinx.coroutines.flow.Flow
 import kotlinx.coroutines.flow.flow
 import kotlinx.coroutines.flow.flowOn
 import kotlinx.coroutines.withContext
 import java.util.concurrent.Executors
 import kotlin.concurrent.thread
 /**
  * Kotlin facade over the native `llama-android` library.
  *
  * Every native call is dispatched onto one dedicated thread ([runLoop]); the
  * loaded model/context/batch handles are kept in a [ThreadLocal] that is
  * only read and written from that thread.
  */
 class Llm {
    private val tag: String? = this::class.simpleName
    private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
    private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor {
        thread(start = false, name = "Llm-RunLoop") {
            Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}")
            // No-op if called more than once.
            System.loadLibrary("llama-android")
            // Set llama log handler to Android
            log_to_android()
            backend_init(false)
            Log.d(tag, system_info())
            it.run()
        }.apply {
            uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable ->
                Log.e(tag, "Unhandled exception", exception)
            }
        }
    }.asCoroutineDispatcher()
    // Value passed as nLen to completion_init()/completion_loop().
    private val nlen: Int = 64
    private external fun log_to_android()
    private external fun load_model(filename: String): Long
    private external fun free_model(model: Long)
    private external fun new_context(model: Long): Long
    private external fun free_context(context: Long)
    private external fun backend_init(numa: Boolean)
    private external fun backend_free()
    private external fun free_batch(batch: Long)
    private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
    private external fun bench_model(
        context: Long,
        model: Long,
        batch: Long,
        pp: Int,
        tg: Int,
        pl: Int,
        nr: Int
    ): String
    private external fun system_info(): String
    private external fun completion_init(
        context: Long,
        batch: Long,
        text: String,
        nLen: Int
    ): Int
    private external fun completion_loop(
        context: Long,
        batch: Long,
        nLen: Int,
        ncur: IntVar
    ): String
    private external fun kv_cache_clear(context: Long)
    /**
     * Runs the native benchmark on the loaded model and returns its report.
     *
     * @throws IllegalStateException if no model is loaded
     */
    suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String {
        return withContext(runLoop) {
            when (val state = threadLocalState.get()) {
                is State.Loaded -> {
                    Log.d(tag, "bench(): $state")
                    bench_model(state.context, state.model, state.batch, pp, tg, pl, nr)
                }
                else -> throw IllegalStateException("No model loaded")
            }
        }
    }
    /**
     * Loads the model at [pathToModel] and creates its context and batch.
     *
     * @throws IllegalStateException if a model is already loaded or any of
     * the native allocation steps reports failure
     */
    suspend fun load(pathToModel: String) {
        withContext(runLoop) {
            when (threadLocalState.get()) {
                is State.Idle -> {
                    val model = load_model(pathToModel)
                    if (model == 0L)  throw IllegalStateException("load_model() failed")
                    val context = new_context(model)
                    if (context == 0L) {
                        // Release what was already created so a failed load
                        // does not leak the model.
                        free_model(model)
                        throw IllegalStateException("new_context() failed")
                    }
                    val batch = new_batch(512, 0, 1)
                    if (batch == 0L) {
                        free_context(context)
                        free_model(model)
                        throw IllegalStateException("new_batch() failed")
                    }
                    Log.i(tag, "Loaded model $pathToModel")
                    threadLocalState.set(State.Loaded(model, context, batch))
                }
                else -> throw IllegalStateException("Model already loaded")
            }
        }
    }
    /**
     * Generates a completion for [message], emitting one piece of text per
     * generated token. Emits nothing when no model is loaded.
     */
    fun send(message: String): Flow<String> = flow {
        when (val state = threadLocalState.get()) {
            is State.Loaded -> {
                try {
                    val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
                    while (ncur.value <= nlen) {
                        val str = completion_loop(state.context, state.batch, nlen, ncur)
                        if (str.isEmpty()) {
                            break
                        }
                        emit(str)
                    }
                } finally {
                    // Also reached on cancellation or failure, so the next
                    // prompt never starts on a stale cache. Skipped if the
                    // model was unloaded in the meantime (context freed).
                    if (threadLocalState.get() == state) {
                        kv_cache_clear(state.context)
                    }
                }
            }
            else -> {}
        }
    }.flowOn(runLoop)
    /**
     * Unloads the model and frees resources.
     *
     * This is a no-op if there's no model loaded.
     */
    suspend fun unload() {
        withContext(runLoop) {
            when (val state = threadLocalState.get()) {
                is State.Loaded -> {
                    free_context(state.context)
                    free_model(state.model)
                    free_batch(state.batch)
                    threadLocalState.set(State.Idle)
                }
                else -> {}
            }
        }
    }
    companion object {
        // Mutable counter handed to native code, which calls getValue()/inc().
        private class IntVar(value: Int) {
            @Volatile
            var value: Int = value
                private set
            fun inc() {
                synchronized(this) {
                    value += 1
                }
            }
        }
        private sealed interface State {
            data object Idle: State
            data class Loaded(val model: Long, val context: Long, val batch: Long): State
        }
        // Enforce only one instance of Llm.
        private val _instance: Llm = Llm()
        fun instance(): Llm = _instance
    }
 }
--- a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
@ -0,0 +1,154 @@
 package com.example.llama
 import android.app.ActivityManager
 import android.app.DownloadManager
 import android.content.ClipData
 import android.content.ClipboardManager
 import android.net.Uri
 import android.os.Bundle
 import android.os.StrictMode
 import android.os.StrictMode.VmPolicy
 import android.text.format.Formatter
 import androidx.activity.ComponentActivity
 import androidx.activity.compose.setContent
 import androidx.activity.viewModels
 import androidx.compose.foundation.layout.Box
 import androidx.compose.foundation.layout.Column
 import androidx.compose.foundation.layout.Row
 import androidx.compose.foundation.layout.fillMaxSize
 import androidx.compose.foundation.layout.padding
 import androidx.compose.foundation.lazy.LazyColumn
 import androidx.compose.foundation.lazy.items
 import androidx.compose.foundation.lazy.rememberLazyListState
 import androidx.compose.material3.Button
 import androidx.compose.material3.LocalContentColor
 import androidx.compose.material3.MaterialTheme
 import androidx.compose.material3.OutlinedTextField
 import androidx.compose.material3.Surface
 import androidx.compose.material3.Text
 import androidx.compose.runtime.Composable
 import androidx.compose.ui.Modifier
 import androidx.compose.ui.unit.dp
 import androidx.core.content.getSystemService
 import com.example.llama.ui.theme.LlamaAndroidTheme
 import java.io.File
 /**
  * Single activity of the sample app: logs memory information, builds the
  * list of downloadable models and hosts the Compose UI.
  *
  * The optional constructor arguments replace the corresponding system
  * services when they are non-null.
  */
 class MainActivity(
    activityManager: ActivityManager? = null,
    downloadManager: DownloadManager? = null,
    clipboardManager: ClipboardManager? = null,
 ): ComponentActivity() {
    private val tag: String? = this::class.simpleName
    private val activityManager by lazy { activityManager ?: getSystemService<ActivityManager>()!! }
    private val downloadManager by lazy { downloadManager ?: getSystemService<DownloadManager>()!! }
    private val clipboardManager by lazy { clipboardManager ?: getSystemService<ClipboardManager>()!! }
    private val viewModel: MainViewModel by viewModels()
    // Get a MemoryInfo object for the device's current memory status.
    private fun availableMemory(): ActivityManager.MemoryInfo {
        val snapshot = ActivityManager.MemoryInfo()
        activityManager.getMemoryInfo(snapshot)
        return snapshot
    }
    // Models offered for download; files are stored under [dir].
    private fun defaultModels(dir: File?): List<Downloadable> {
        val phi2 = Downloadable(
            "Phi-2 7B (Q4_0, 1.6 GiB)",
            Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
            File(dir, "phi-2-q4_0.gguf"),
        )
        val tinyLlama = Downloadable(
            "TinyLlama 1.1B (f16, 2.2 GiB)",
            Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
            File(dir, "tinyllama-1.1-f16.gguf"),
        )
        val phi2Dpo = Downloadable(
            "Phi 2 DPO (Q3_K_M, 1.48 GiB)",
            Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
            File(dir, "phi-2-dpo.Q3_K_M.gguf")
        )
        return listOf(phi2, tinyLlama, phi2Dpo)
    }
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        // Extend the current VM policy with leaked-closable detection.
        val leakPolicy = VmPolicy.Builder(StrictMode.getVmPolicy())
            .detectLeakedClosableObjects()
            .build()
        StrictMode.setVmPolicy(leakPolicy)
        val free = Formatter.formatFileSize(this, availableMemory().availMem)
        val total = Formatter.formatFileSize(this, availableMemory().totalMem)
        viewModel.log("Current memory: $free / $total")
        viewModel.log("Downloads directory: ${getExternalFilesDir(null)}")
        val models = defaultModels(getExternalFilesDir(null))
        setContent {
            LlamaAndroidTheme {
                // A surface container using the 'background' color from the theme
                Surface(
                    modifier = Modifier.fillMaxSize(),
                    color = MaterialTheme.colorScheme.background
                ) {
                    MainCompose(
                        viewModel,
                        clipboardManager,
                        downloadManager,
                        models,
                    )
                }
            }
        }
    }
 }
/**
 * Root screen: the message list, the input field, the action buttons and one
 * download/load button per entry in [models].
 */
@Composable
 fun MainCompose(
    viewModel: MainViewModel,
    clipboard: ClipboardManager,
    dm: DownloadManager,
    models: List<Downloadable>
 ) {
    Column {
        val scrollState = rememberLazyListState()
        // The message list takes all vertical space left by the controls.
        Box(modifier = Modifier.weight(1f)) {
            LazyColumn(state = scrollState) {
                items(viewModel.messages) { line ->
                    Text(
                        text = line,
                        style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current),
                        modifier = Modifier.padding(16.dp)
                    )
                }
            }
        }
        OutlinedTextField(
            value = viewModel.message,
            onValueChange = { typed -> viewModel.updateMessage(typed) },
            label = { Text("Message") },
        )
        Row {
            Button(onClick = { viewModel.send() }) { Text("Send") }
            Button(onClick = { viewModel.bench(8, 4, 1) }) { Text("Bench") }
            Button(onClick = { viewModel.clear() }) { Text("Clear") }
            Button(onClick = {
                // Copy the whole console, one message per line.
                val joined = viewModel.messages.joinToString("\n")
                clipboard.setPrimaryClip(ClipData.newPlainText("", joined))
            }) { Text("Copy") }
        }
        Column {
            models.forEach { model ->
                Downloadable.Button(viewModel, dm, model)
            }
        }
    }
 }
--- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
@ -0,0 +1,104 @@
 package com.example.llama
 import android.util.Log
 import androidx.compose.runtime.getValue
 import androidx.compose.runtime.mutableStateOf
 import androidx.compose.runtime.setValue
 import androidx.lifecycle.ViewModel
 import androidx.lifecycle.viewModelScope
 import kotlinx.coroutines.flow.catch
 import kotlinx.coroutines.launch
 /**
  * Holds the console messages and the current input text, and forwards user
  * actions (send, bench, load) to [llm].
  */
 class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
    companion object {
        @JvmStatic
        private val NanosPerSecond = 1_000_000_000.0
    }
    private val tag: String? = this::class.simpleName
    var messages by mutableStateOf(listOf("Initializing..."))
        private set
    var message by mutableStateOf("")
        private set
    override fun onCleared() {
        super.onCleared()
        // viewModelScope is cancelled as part of clearing the ViewModel, so a
        // coroutine launched on it here is not expected to run. Use a
        // detached scope so the native resources are actually released.
        kotlinx.coroutines.CoroutineScope(kotlinx.coroutines.Dispatchers.Default).launch {
            try {
                llm.unload()
            } catch (exc: IllegalStateException) {
                // The UI is gone at this point; just record the failure.
                Log.e(tag, "unload() failed", exc)
            }
        }
    }
    /** Sends the current input to the model and streams the reply into [messages]. */
    fun send() {
        val text = message
        message = ""
        // Add to messages console.
        messages += text
        messages += ""
        viewModelScope.launch {
            llm.send(text)
                .catch {
                    Log.e(tag, "send() failed", it)
                    messages += it.message ?: it.toString()
                }
                .collect { piece ->
                    // clear() may have emptied the list while generating, so
                    // do not assume there is a last entry to append to.
                    val current = messages
                    messages = current.dropLast(1) + (current.lastOrNull().orEmpty() + piece)
                }
        }
    }
    /**
     * Runs a short warm-up benchmark with the given parameters and, if it
     * finishes within five seconds, the full benchmark.
     */
    fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) {
        viewModelScope.launch {
            try {
                val start = System.nanoTime()
                val warmupResult = llm.bench(pp, tg, pl, nr)
                val end = System.nanoTime()
                messages += warmupResult
                val warmup = (end - start).toDouble() / NanosPerSecond
                messages += "Warm up time: $warmup seconds, please wait..."
                if (warmup > 5.0) {
                    messages += "Warm up took too long, aborting benchmark"
                    return@launch
                }
                messages += llm.bench(512, 128, 1, 3)
            } catch (exc: IllegalStateException) {
                Log.e(tag, "bench() failed", exc)
                messages += exc.message ?: exc.toString()
            }
        }
    }
    /** Loads the model stored at [pathToModel] and reports the outcome in [messages]. */
    fun load(pathToModel: String) {
        viewModelScope.launch {
            try {
                llm.load(pathToModel)
                messages += "Loaded $pathToModel"
            } catch (exc: IllegalStateException) {
                Log.e(tag, "load() failed", exc)
                messages += exc.message ?: exc.toString()
            }
        }
    }
    fun updateMessage(newMessage: String) {
        message = newMessage
    }
    fun clear() {
        messages = listOf()
    }
    fun log(message: String) {
        messages += message
    }
 }
--- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt
@ -0,0 +1,11 @@
 package com.example.llama.ui.theme
 import androidx.compose.ui.graphics.Color
 // Colours referenced by the dark colour scheme in Theme.kt.
 val Purple80: Color = Color(0xFFD0BCFF)
 val PurpleGrey80: Color = Color(0xFFCCC2DC)
 val Pink80: Color = Color(0xFFEFB8C8)
 // Colours referenced by the light colour scheme in Theme.kt.
 val Purple40: Color = Color(0xFF6650A4)
 val PurpleGrey40: Color = Color(0xFF625B71)
 val Pink40: Color = Color(0xFF7D5260)
--- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt
@ -0,0 +1,70 @@
 package com.example.llama.ui.theme
 import android.app.Activity
 import android.os.Build
 import androidx.compose.foundation.isSystemInDarkTheme
 import androidx.compose.material3.MaterialTheme
 import androidx.compose.material3.darkColorScheme
 import androidx.compose.material3.dynamicDarkColorScheme
 import androidx.compose.material3.dynamicLightColorScheme
 import androidx.compose.material3.lightColorScheme
 import androidx.compose.runtime.Composable
 import androidx.compose.runtime.SideEffect
 import androidx.compose.ui.graphics.toArgb
 import androidx.compose.ui.platform.LocalContext
 import androidx.compose.ui.platform.LocalView
 import androidx.core.view.WindowCompat
 // Static palette used when the dark theme is active and dynamic colour is not.
 private val DarkColorScheme = darkColorScheme(
    primary = Purple80,
    secondary = PurpleGrey80,
    tertiary = Pink80,
 )
 // Static palette used when the light theme is active and dynamic colour is not.
 private val LightColorScheme = lightColorScheme(
    primary = Purple40,
    secondary = PurpleGrey40,
    tertiary = Pink40,
    /* Other default colors to override
    background = Color(0xFFFFFBFE),
    surface = Color(0xFFFFFBFE),
    onPrimary = Color.White,
    onSecondary = Color.White,
    onTertiary = Color.White,
    onBackground = Color(0xFF1C1B1F),
    onSurface = Color(0xFF1C1B1F),
    */
 )
/**
 * App theme wrapper.
 *
 * Picks a dynamic colour scheme when [dynamicColor] is set and the device
 * runs API level S or newer, otherwise the static dark or light scheme, and
 * applies the scheme's primary colour to the status bar.
 */
@Composable
 fun LlamaAndroidTheme(
    darkTheme: Boolean = isSystemInDarkTheme(),
    // Dynamic color is available on Android 12+
    dynamicColor: Boolean = true,
    content: @Composable () -> Unit
 ) {
    val colorScheme =
        if (dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
            val ctx = LocalContext.current
            when (darkTheme) {
                true -> dynamicDarkColorScheme(ctx)
                false -> dynamicLightColorScheme(ctx)
            }
        } else if (darkTheme) {
            DarkColorScheme
        } else {
            LightColorScheme
        }
    val view = LocalView.current
    // Skipped when the view reports edit mode.
    if (view.isInEditMode.not()) {
        SideEffect {
            val hostWindow = (view.context as Activity).window
            hostWindow.statusBarColor = colorScheme.primary.toArgb()
            val insets = WindowCompat.getInsetsController(hostWindow, view)
            insets.isAppearanceLightStatusBars = darkTheme
        }
    }
    MaterialTheme(
        colorScheme = colorScheme,
        typography = Typography,
        content = content
    )
 }
--- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt
@ -0,0 +1,34 @@
 package com.example.llama.ui.theme
 import androidx.compose.material3.Typography
 import androidx.compose.ui.text.TextStyle
 import androidx.compose.ui.text.font.FontFamily
 import androidx.compose.ui.text.font.FontWeight
 import androidx.compose.ui.unit.sp
 // Material typography used by the app theme; only bodyLarge is customised,
 // every other style keeps the library default.
 val Typography = Typography(
    bodyLarge = TextStyle(
        fontSize = 16.sp,
        lineHeight = 24.sp,
        letterSpacing = 0.5.sp,
        fontFamily = FontFamily.Default,
        fontWeight = FontWeight.Normal,
    )
    /* Other default text styles to override
    titleLarge = TextStyle(
        fontFamily = FontFamily.Default,
        fontWeight = FontWeight.Normal,
        fontSize = 22.sp,
        lineHeight = 28.sp,
        letterSpacing = 0.sp
    ),
    labelSmall = TextStyle(
        fontFamily = FontFamily.Default,
        fontWeight = FontWeight.Medium,
        fontSize = 11.sp,
        lineHeight = 16.sp,
        letterSpacing = 0.5.sp
    )
    */
 )
--- a/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml
+++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml
@ -0,0 +1,170 @@
 <?xml version="1.0" encoding="utf-8"?>
 <vector xmlns:android="http://schemas.android.com/apk/res/android"
    android:width="108dp"
    android:height="108dp"
    android:viewportWidth="108"
    android:viewportHeight="108">
    <path
        android:fillColor="#3DDC84"
        android:pathData="M0,0h108v108h-108z" />
    <path
        android:fillColor="#00000000"
        android:pathData="M9,0L9,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,0L19,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M29,0L29,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M39,0L39,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M49,0L49,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M59,0L59,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M69,0L69,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M79,0L79,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M89,0L89,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M99,0L99,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,9L108,9"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,19L108,19"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,29L108,29"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,39L108,39"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,49L108,49"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,59L108,59"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,69L108,69"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,79L108,79"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,89L108,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,99L108,99"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,29L89,29"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,39L89,39"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,49L89,49"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,59L89,59"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,69L89,69"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,79L89,79"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M29,19L29,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M39,19L39,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M49,19L49,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M59,19L59,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M69,19L69,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M79,19L79,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
 </vector>
--- a/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml
+++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml
@ -0,0 +1,30 @@
 <vector xmlns:android="http://schemas.android.com/apk/res/android"
    xmlns:aapt="http://schemas.android.com/aapt"
    android:width="108dp"
    android:height="108dp"
    android:viewportWidth="108"
    android:viewportHeight="108">
    <path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
        <aapt:attr name="android:fillColor">
            <gradient
                android:endX="85.84757"
                android:endY="92.4963"
                android:startX="42.9492"
                android:startY="49.59793"
                android:type="linear">
                <item
                    android:color="#44000000"
                    android:offset="0.0" />
                <item
                    android:color="#00000000"
                    android:offset="1.0" />
            </gradient>
        </aapt:attr>
    </path>
    <path
        android:fillColor="#FFFFFF"
        android:fillType="nonZero"
        android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
        android:strokeWidth="1"
        android:strokeColor="#00000000" />
 </vector>
--- a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml
+++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml
@ -0,0 +1,6 @@
 <?xml version="1.0" encoding="utf-8"?>
 <adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
    <background android:drawable="@drawable/ic_launcher_background" />
    <foreground android:drawable="@drawable/ic_launcher_foreground" />
    <monochrome android:drawable="@drawable/ic_launcher_foreground" />
 </adaptive-icon>
--- a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml
+++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml
@ -0,0 +1,6 @@
 <?xml version="1.0" encoding="utf-8"?>
 <adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
    <background android:drawable="@drawable/ic_launcher_background" />
    <foreground android:drawable="@drawable/ic_launcher_foreground" />
    <monochrome android:drawable="@drawable/ic_launcher_foreground" />
 </adaptive-icon>
--- a/Show more
+++ b/Show more