Merge branch 'master' of https://github.com/ggerganov/llama.cpp

commit 6c46cb1da4

58 changed files with 1982 additions and 885 deletions
@@ -14,7 +14,8 @@ ARG CUDA_DOCKER_ARCH=all
 RUN apt-get update && \
     apt-get install -y build-essential python3 python3-pip git

 COPY requirements.txt requirements.txt
+COPY requirements   requirements

 RUN pip install --upgrade pip setuptools wheel \
     && pip install -r requirements.txt
@@ -23,7 +23,8 @@ ARG ROCM_DOCKER_ARCH=\
     gfx1101 \
     gfx1102

 COPY requirements.txt requirements.txt
+COPY requirements   requirements

 RUN pip install --upgrade pip setuptools wheel \
     && pip install -r requirements.txt
@@ -5,7 +5,8 @@ FROM ubuntu:$UBUNTU_VERSION as build
 RUN apt-get update && \
     apt-get install -y build-essential python3 python3-pip git

 COPY requirements.txt requirements.txt
+COPY requirements   requirements

 RUN pip install --upgrade pip setuptools wheel \
     && pip install -r requirements.txt
@@ -23,7 +23,8 @@ ARG ROCM_DOCKER_ARCH=\
     gfx1101 \
     gfx1102

 COPY requirements.txt requirements.txt
+COPY requirements   requirements

 RUN pip install --upgrade pip setuptools wheel \
     && pip install -r requirements.txt
.devops/nix/apps.nix (new file, 22 lines):

{
  perSystem =
    { config, lib, ... }:
    {
      apps =
        let
          inherit (config.packages) default;
          binaries = [
            "llama"
            "llama-embedding"
            "llama-server"
            "quantize"
            "train-text-from-scratch"
          ];
          mkApp = name: {
            type = "app";
            program = "${default}/bin/${name}";
          };
        in
        lib.genAttrs binaries mkApp;
    };
}
.devops/nix/devshells.nix (new file, 13 lines):

{
  perSystem =
    { config, lib, ... }:
    {
      devShells =
        lib.concatMapAttrs
          (name: package: {
            ${name} = package.passthru.shell;
            ${name + "-extra"} = package.passthru.shell-extra;
          })
          config.packages;
    };
}
.devops/nix/jetson-support.nix (new file, 39 lines):

{ inputs, ... }:
{
  perSystem =
    {
      config,
      system,
      lib,
      pkgsCuda,
      ...
    }:
    {
      legacyPackages =
        let
          caps.llamaPackagesXavier = "7.2";
          caps.llamaPackagesOrin = "8.7";
          caps.llamaPackagesTX2 = "6.2";
          caps.llamaPackagesNano = "5.3";

          pkgsFor =
            cap:
            import inputs.nixpkgs {
              inherit system;
              config = {
                cudaSupport = true;
                cudaCapabilities = [ cap ];
                cudaEnableForwardCompat = false;
                inherit (pkgsCuda.config) allowUnfreePredicate;
              };
            };
        in
        builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;

      packages = lib.optionalAttrs (system == "aarch64-linux") {
        jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
        jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
        jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
      };
    };
}
.devops/nix/nixpkgs-instances.nix (new file, 35 lines):

{ inputs, ... }:
{
  # The _module.args definitions are passed on to modules as arguments. E.g.
  # the module `{ pkgs ... }: { /* config */ }` implicitly uses
  # `_module.args.pkgs` (defined in this case by flake-parts).
  perSystem =
    { system, ... }:
    {
      _module.args = {
        pkgsCuda = import inputs.nixpkgs {
          inherit system;
          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
          # and ucx are built with CUDA support)
          config.cudaSupport = true;
          config.allowUnfreePredicate =
            p:
            builtins.all
              (
                license:
                license.free
                || builtins.elem license.shortName [
                  "CUDA EULA"
                  "cuDNN EULA"
                ]
              )
              (p.meta.licenses or [ p.meta.license ]);
        };
        # Ensure dependencies use ROCm consistently
        pkgsRocm = import inputs.nixpkgs {
          inherit system;
          config.rocmSupport = true;
        };
      };
    };
}
.devops/nix/package.nix (new file, 265 lines):

{
  lib,
  config,
  stdenv,
  mkShell,
  cmake,
  ninja,
  pkg-config,
  git,
  python3,
  mpi,
  openblas, # TODO: Use the generic `blas` so users could switch betwen alternative implementations
  cudaPackages,
  darwin,
  rocmPackages,
  clblast,
  useBlas ? builtins.all (x: !x) [
    useCuda
    useMetalKit
    useOpenCL
    useRocm
  ],
  useCuda ? config.cudaSupport,
  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
  useMpi ? false, # Increases the runtime closure size by ~700M
  useOpenCL ? false,
  useRocm ? config.rocmSupport,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
}@inputs:

let
  inherit (lib)
    cmakeBool
    cmakeFeature
    optionals
    strings
    versionOlder
    ;

  # It's necessary to consistently use backendStdenv when building with CUDA support,
  # otherwise we get libstdc++ errors downstream.
  stdenv = throw "Use effectiveStdenv instead";
  effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;

  suffices =
    lib.optionals useBlas [ "BLAS" ]
    ++ lib.optionals useCuda [ "CUDA" ]
    ++ lib.optionals useMetalKit [ "MetalKit" ]
    ++ lib.optionals useMpi [ "MPI" ]
    ++ lib.optionals useOpenCL [ "OpenCL" ]
    ++ lib.optionals useRocm [ "ROCm" ];

  pnameSuffix =
    strings.optionalString (suffices != [ ])
      "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
  descriptionSuffix =
    strings.optionalString (suffices != [ ])
      ", accelerated with ${strings.concatStringsSep ", " suffices}";

  # TODO: package the Python in this repository in a Nix-like way.
  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
  # https://peps.python.org/pep-0517/
  llama-python = python3.withPackages (
    ps: [
      ps.numpy
      ps.sentencepiece
    ]
  );

  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
  llama-python-extra = python3.withPackages (
    ps: [
      ps.numpy
      ps.sentencepiece
      ps.torchWithoutCuda
      ps.transformers
    ]
  );

  # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
  # separately
  darwinBuildInputs =
    with darwin.apple_sdk.frameworks;
    [
      Accelerate
      CoreVideo
      CoreGraphics
    ]
    ++ optionals useMetalKit [ MetalKit ];

  cudaBuildInputs = with cudaPackages; [
    cuda_cccl.dev # <nv/target>

    # A temporary hack for reducing the closure size, remove once cudaPackages
    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
    cuda_cudart.dev
    cuda_cudart.lib
    cuda_cudart.static
    libcublas.dev
    libcublas.lib
    libcublas.static
  ];

  rocmBuildInputs = with rocmPackages; [
    clr
    hipblas
    rocblas
  ];
in

effectiveStdenv.mkDerivation (
  finalAttrs: {
    pname = "llama-cpp${pnameSuffix}";
    version = llamaVersion;

    src = lib.cleanSourceWith {
      filter =
        name: type:
        !(builtins.any (_: _) [
          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
          (name == "README.md") # Ignore *.md changes whe computing outPaths
          (lib.hasPrefix "." name) # Skip hidden files and directories
        ]);
      src = lib.cleanSource ../../.;
    };

    postPatch = ''
      substituteInPlace ./ggml-metal.m \
        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"

      # TODO: Package up each Python script or service appropriately.
      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
      # we could make those *.py into setuptools' entrypoints
      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
    '';

    nativeBuildInputs =
      [
        cmake
        ninja
        pkg-config
        git
      ]
      ++ optionals useCuda [
        cudaPackages.cuda_nvcc

        # TODO: Replace with autoAddDriverRunpath
        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
        cudaPackages.autoAddOpenGLRunpathHook
      ];

    buildInputs =
      optionals effectiveStdenv.isDarwin darwinBuildInputs
      ++ optionals useCuda cudaBuildInputs
      ++ optionals useMpi [ mpi ]
      ++ optionals useOpenCL [ clblast ]
      ++ optionals useRocm rocmBuildInputs;

    cmakeFlags =
      [
        (cmakeBool "LLAMA_NATIVE" true)
        (cmakeBool "LLAMA_BUILD_SERVER" true)
        (cmakeBool "BUILD_SHARED_LIBS" true)
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
        (cmakeBool "LLAMA_BLAS" useBlas)
        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
        (cmakeBool "LLAMA_CUBLAS" useCuda)
        (cmakeBool "LLAMA_HIPBLAS" useRocm)
        (cmakeBool "LLAMA_METAL" useMetalKit)
        (cmakeBool "LLAMA_MPI" useMpi)
      ]
      ++ optionals useCuda [
        (
          with cudaPackages.flags;
          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
          )
        )
      ]
      ++ optionals useRocm [
        (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
        (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")

        # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
        # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
        # and select the line that matches the current nixpkgs version of rocBLAS.
        # Should likely use `rocmPackages.clr.gpuTargets`.
        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
      ]
      ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
      ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];

    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
    # if they haven't been added yet.
    postInstall = ''
      mv $out/bin/main $out/bin/llama
      mv $out/bin/server $out/bin/llama-server
      mkdir -p $out/include
      cp $src/llama.h $out/include/
    '';

    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
    passthru = {
      inherit
        useBlas
        useCuda
        useMetalKit
        useMpi
        useOpenCL
        useRocm
        ;

      shell = mkShell {
        name = "shell-${finalAttrs.finalPackage.name}";
        description = "contains numpy and sentencepiece";
        buildInputs = [ llama-python ];
        inputsFrom = [ finalAttrs.finalPackage ];
      };

      shell-extra = mkShell {
        name = "shell-extra-${finalAttrs.finalPackage.name}";
        description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
        buildInputs = [ llama-python-extra ];
        inputsFrom = [ finalAttrs.finalPackage ];
      };
    };

    meta = {
      # Configurations we don't want even the CI to evaluate. Results in the
      # "unsupported platform" messages. This is mostly a no-op, because
      # cudaPackages would've refused to evaluate anyway.
      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;

      # Configurations that are known to result in build failures. Can be
      # overridden by importing Nixpkgs with `allowBroken = true`.
      broken = (useMetalKit && !effectiveStdenv.isDarwin);

      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
      homepage = "https://github.com/ggerganov/llama.cpp/";
      license = lib.licenses.mit;

      # Accommodates `nix run` and `lib.getExe`
      mainProgram = "llama";

      # These people might respond, on the best effort basis, if you ping them
      # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
      # Consider adding yourself to this list if you want to ensure this flake
      # stays maintained and you're willing to invest your time. Do not add
      # other people without their consent. Consider removing people after
      # they've been unreachable for long periods of time.

      # Note that lib.maintainers is defined in Nixpkgs, but you may just add
      # an attrset following the same format as in
      # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
      maintainers = with lib.maintainers; [
        philiptaron
        SomeoneSerge
      ];

      # Extend `badPlatforms` instead
      platforms = lib.platforms.all;
    };
  }
)
.devops/nix/scope.nix (new file, 12 lines):

{
  lib,
  newScope,
  llamaVersion ? "0.0.0",
}:

lib.makeScope newScope (
  self: {
    inherit llamaVersion;
    llama-cpp = self.callPackage ./package.nix { };
  }
)
.github/workflows/build.yml (1 change):

@@ -515,7 +515,6 @@ jobs:
       - name: Build Xcode project
         run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build

-
 #  freeBSD-latest:
 #    runs-on: macos-12
 #    steps:
.github/workflows/nix-ci.yml (new file, 112 lines):

name: Nix CI

on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']

jobs:
  nix-eval:
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: List all flake outputs
        run: nix flake show --all-systems
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
  nix-build:
    if: ${{ vars.CACHIX_NAME != '' }}
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, macos-latest ]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: ${{ vars.CACHIX_NAME }}
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --flake
          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
  nix-build-aarch64:
    if: ${{ vars.CACHIX_NAME != '' }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install QEMU
        # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
        run: |
          sudo apt-get install -y qemu-user-static qemu-system-aarch64
          sudo usermod -a -G kvm $USER
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@v9
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          extra-conf: |
            extra-platforms = aarch64-linux
            extra-system-features = nixos-test kvm
            extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
            extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
      - uses: DeterminateSystems/magic-nix-cache-action@v2
        with:
          upstream-cache: https://${{ matrix.cachixName }}.cachix.org
      - name: Set-up cachix to push the results to
        uses: cachix/cachix-action@v13
        with:
          authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
          name: ${{ vars.CACHIX_NAME }}
      - name: Show all output paths
        run: >
          nix run github:nix-community/nix-eval-jobs
          -- --gc-roots-dir gcroot
          --flake
          ".#packages.aarch64-linux"
      - name: Build
        run: >
          nix run github:Mic92/nix-fast-build
          -- --skip-cached --no-nom
          --systems aarch64-linux
          --flake
          ".#checks.aarch64-linux"
.github/workflows/nix-flake-update.yml (new file, 22 lines):

name: update-flake-lock
on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00

jobs:
  lockfile:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install Nix
        uses: DeterminateSystems/nix-installer-action@main
      - name: Update flake.lock
        uses: DeterminateSystems/update-flake-lock@main
        with:
          pr-title: "nix: update flake.lock"
          pr-labels: |
            nix
          pr-reviewers: philiptaron,SomeoneSerge
          token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/nix-publish-flake.yml (new file, 36 lines):

# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
  push:
    tags:
      - "*"
  workflow_dispatch:
    inputs:
      tag:
        description: "The existing tag to publish"
        type: "string"
        required: true
jobs:
  flakestry-publish:
    runs-on: ubuntu-latest
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: flakestry/flakestry-publish@main
        with:
          version: "${{ inputs.tag || github.ref_name }}"
  flakehub-publish:
    runs-on: "ubuntu-latest"
    permissions:
      id-token: "write"
      contents: "read"
    steps:
      - uses: "actions/checkout@v4"
        with:
          ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
      - uses: "DeterminateSystems/nix-installer-action@main"
      - uses: "DeterminateSystems/flakehub-push@main"
        with:
          visibility: "public"
          tag: "${{ inputs.tag }}"
.github/workflows/python-check-requirements.yml (new file, 29 lines):

name: Python check requirements.txt

on:
  push:
    paths:
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
      - 'requirements.txt'
      - 'requirements/*.txt'
  pull_request:
    paths:
      - 'scripts/check-requirements.sh'
      - 'convert*.py'
      - 'requirements.txt'
      - 'requirements/*.txt'

jobs:
  python-check-requirements:
    runs-on: ubuntu-latest
    name: check-requirements
    steps:
      - name: Check out source repository
        uses: actions/checkout@v3
      - name: Set up Python environment
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: Run check-requirements.sh script
        run: bash scripts/check-requirements.sh nocleanup
README.md (31 changes):

@@ -103,6 +103,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
 - [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
+- [x] [GPT-2](https://huggingface.co/gpt2)

 **Multimodal models:**

@@ -384,16 +385,30 @@ Building the program with BLAS support may lead to some performance improvements

   Check [BLIS.md](docs/BLIS.md) for more information.

-- #### Intel MKL
+- #### Intel oneMKL
+  - Using manual oneAPI installation:
+    By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
+    ```bash
+    mkdir build
+    cd build
+    source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-runtime docker image, only required for manual installation
+    cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
+    cmake --build . --config Release
+    ```

-  By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
+  - Using oneAPI docker image:
+    If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-runtime](https://hub.docker.com/r/intel/oneapi-runtime)

-  ```bash
-  mkdir build
-  cd build
-  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-  cmake --build . --config Release
-  ```
+    ```bash
+    mkdir build
+    cd build
+    cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
+    cmake --build . --config Release
+    ```
+
+    Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni.
+
+    Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.

 - #### cuBLAS

|
@ -65,4 +65,4 @@ endif()
|
||||||
|
|
||||||
target_include_directories(${TARGET} PUBLIC .)
|
target_include_directories(${TARGET} PUBLIC .)
|
||||||
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
||||||
target_link_libraries(${TARGET} PRIVATE llama build_info)
|
target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
|
||||||
|
|
|
@@ -1394,6 +1394,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
     fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
     fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
+    fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
     fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
     fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
     fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
@@ -182,6 +182,8 @@ class Model:
             return QwenModel
         if model_architecture == "MixtralForCausalLM":
             return MixtralModel
+        if model_architecture == "GPT2LMHeadModel":
+            return GPT2Model
         if model_architecture == "PhiForCausalLM":
             return Phi2Model
         if model_architecture == "PlamoForCausalLM":
@@ -225,6 +227,8 @@ class Model:
             return gguf.MODEL_ARCH.QWEN
         if arch == "MixtralForCausalLM":
             return gguf.MODEL_ARCH.LLAMA
+        if arch == "GPT2LMHeadModel":
+            return gguf.MODEL_ARCH.GPT2
         if arch == "PhiForCausalLM":
             return gguf.MODEL_ARCH.PHI2
         if arch == "PlamoForCausalLM":
@@ -238,7 +242,7 @@ class Model:
         tokens: list[bytearray] = []
         toktypes: list[int] = []

-        from transformers import AutoTokenizer  # type: ignore[attr-defined]
+        from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model)
         vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
@@ -852,7 +856,7 @@ class StableLMModel(Model):
         hparams = self.hparams
         block_count = hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(dir_model.name)
+        self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
@@ -898,7 +902,7 @@ class QwenModel(Model):
         tokens: list[bytearray] = []
         toktypes: list[int] = []

-        from transformers import AutoTokenizer  # type: ignore[attr-defined]
+        from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
         vocab_size = hparams["vocab_size"]
         assert max(tokenizer.get_vocab().values()) < vocab_size
@@ -993,6 +997,68 @@ class QwenModel(Model):
             self.gguf_writer.add_tensor(new_name, data)


+class GPT2Model(Model):
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_context_length(self.hparams["n_ctx"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")):
+                continue
+
+            if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
+                data_torch = data_torch.transpose(1, 0)
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+            # note: GPT2 output is tied to (same as) wte in original model
+            if new_name == "token_embd.weight":
+                print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+                self.gguf_writer.add_tensor("output.weight", data)
+
+
 class Phi2Model(Model):
     def set_gguf_parameters(self):
         block_count = self.hparams["n_layer"]
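The transpose(1, 0) in the new GPT2Model.write_tensors is the GPT-2-specific step: Hugging Face's GPT-2 stores its projection weights in Conv1D modules laid out as (n_in, n_out), while the GGUF writer expects the usual nn.Linear layout (n_out, n_in). A minimal, self-contained sketch with toy sizes (illustrative only, not code from this diff) of why the transpose preserves the computation:

```python
# Toy illustration of the Conv1D -> Linear weight layout conversion assumed above.
import torch

n_embd = 8
w_conv1d = torch.randn(n_embd, 3 * n_embd)  # HF GPT-2 Conv1D layout: (n_in, n_out)
w_linear = w_conv1d.transpose(1, 0)         # Linear/GGUF layout: (n_out, n_in)

x = torch.randn(1, n_embd)
# Conv1D applies x @ W, a Linear layer applies x @ W.T; after the transpose both agree.
assert torch.allclose(x @ w_conv1d, x @ w_linear.T)
```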
@@ -1119,57 +1185,62 @@ def parse_args() -> argparse.Namespace:
     return parser.parse_args()

The module-level conversion code that used to follow parse_args() is wrapped in a main() function guarded by `if __name__ == '__main__':`; apart from the added indentation its body is unchanged:

def main() -> None:
    args = parse_args()

    dir_model = args.model

    if args.awq_path:
        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
        from awq.apply_awq import add_scale_weights
        tmp_model_path = args.model / "weighted_model"
        dir_model = tmp_model_path
        if tmp_model_path.is_dir():
            print(f"{tmp_model_path} exists as a weighted model.")
        else:
            tmp_model_path.mkdir(parents=True, exist_ok=True)
            print("Saving new weighted model ...")
            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
            print(f"Saved weighted model at {tmp_model_path}.")

    if not dir_model.is_dir():
        print(f'Error: {args.model} is not a directory', file=sys.stderr)
        sys.exit(1)

    ftype_map = {
        "f32": gguf.GGMLQuantizationType.F32,
        "f16": gguf.GGMLQuantizationType.F16,
    }

    if args.outfile is not None:
        fname_out = args.outfile
    else:
        # output in the same directory as the model by default
        fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'

    print(f"Loading model: {dir_model.name}")

    hparams = Model.load_hparams(dir_model)

    with torch.inference_mode():
        model_class = Model.from_model_architecture(hparams["architectures"][0])
        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)

        print("Set model parameters")
        model_instance.set_gguf_parameters()

        print("Set model tokenizer")
        model_instance.set_vocab()

        if args.vocab_only:
            print(f"Exporting model vocab to '{fname_out}'")
            model_instance.write_vocab()
        else:
            print(f"Exporting model to '{fname_out}'")
            model_instance.write()

        print(f"Model successfully exported to '{fname_out}'")


if __name__ == '__main__':
    main()
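For illustration, here is the same main()-plus-guard pattern in a minimal standalone form (a hypothetical file, not part of the diff); the point of the refactor is that importing the module no longer parses sys.argv or starts a conversion as a side effect:

```python
# tiny_cli.py -- minimal sketch of the pattern the converter now follows (hypothetical example).
import argparse


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="toy CLI using the main() guard pattern")
    parser.add_argument("--name", default="world")
    return parser.parse_args()


def main() -> None:
    args = parse_args()
    # All work happens here, so `import tiny_cli` has no side effects.
    print(f"hello, {args.name}")


if __name__ == '__main__':
    main()
```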
@@ -47,95 +47,96 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
     fout.seek((fout.tell() + 31) & -32)

The module-level script body that follows write_tensor_header() is wrapped in an `if __name__ == '__main__':` guard; apart from the added indentation it is unchanged:

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(f"Usage: python {sys.argv[0]} <path> [arch]")
        print(
            "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
        )
        print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
        sys.exit(1)

    input_json = os.path.join(sys.argv[1], "adapter_config.json")
    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

    model = torch.load(input_model, map_location="cpu")
    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"

    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
        print(f"Error: unsupported architecture {arch_name}")
        sys.exit(1)

    arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
    name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone

    with open(input_json, "r") as f:
        params = json.load(f)

    if params["peft_type"] != "LORA":
        print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
        sys.exit(1)

    if params["fan_in_fan_out"] is True:
        print("Error: param fan_in_fan_out is not supported")
        sys.exit(1)

    if params["bias"] is not None and params["bias"] != "none":
        print("Error: param bias is not supported")
        sys.exit(1)

    # TODO: these seem to be layers that have been trained but without lora.
    # doesn't seem widely used but eventually should be supported
    if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
        print("Error: param modules_to_save is not supported")
        sys.exit(1)

    with open(output_path, "wb") as fout:
        fout.truncate()

        write_file_header(fout, params)
        for k, v in model.items():
            orig_k = k
            if k.endswith(".default.weight"):
                k = k.replace(".default.weight", ".weight")
            if k in ["llama_proj.weight", "llama_proj.bias"]:
                continue
            if k.endswith("lora_A.weight"):
                if v.dtype != torch.float16 and v.dtype != torch.float32:
                    v = v.float()
                v = v.T
            else:
                v = v.float()

            t = v.detach().numpy()

            prefix = "base_model.model."
            if k.startswith(prefix):
                k = k[len(prefix) :]

            lora_suffixes = (".lora_A.weight", ".lora_B.weight")
            if k.endswith(lora_suffixes):
                suffix = k[-len(lora_suffixes[0]):]
                k = k[: -len(lora_suffixes[0])]
            else:
                print(f"Error: unrecognized tensor name {orig_k}")
                sys.exit(1)

            tname = name_map.get_name(k)
            if tname is None:
                print(f"Error: could not map tensor name {orig_k}")
                print(" Note: the arch parameter must be specified if the model is not llama")
                sys.exit(1)

            if suffix == ".lora_A.weight":
                tname += ".weight.loraA"
            elif suffix == ".lora_B.weight":
                tname += ".weight.loraB"
            else:
                assert False

            print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
            write_tensor_header(fout, tname, t.shape, t.dtype)
            t.tofile(fout)

        print(f"Converted {input_json} and {input_model} to {output_path}")
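The tensor-name handling inside that loop is pure string manipulation; a small standalone sketch (hypothetical tensor name, not from the diff, and skipping the gguf.TensorNameMap lookup the real script performs before appending the suffix) of the prefix/suffix splitting:

```python
# Hypothetical example mirroring the LoRA tensor-name splitting in the loop above.
k = "base_model.model.layers.0.self_attn.q_proj.lora_A.weight"  # made-up PEFT key
prefix = "base_model.model."
lora_suffixes = (".lora_A.weight", ".lora_B.weight")

if k.startswith(prefix):
    k = k[len(prefix):]                  # "layers.0.self_attn.q_proj.lora_A.weight"

suffix = k[-len(lora_suffixes[0]):]      # ".lora_A.weight"
base = k[: -len(lora_suffixes[0])]       # "layers.0.self_attn.q_proj"
tname = base + (".weight.loraA" if suffix == ".lora_A.weight" else ".weight.loraB")
print(tname)                             # layers.0.self_attn.q_proj.weight.loraA
```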
convert-persimmon-to-gguf.py (1 change; mode changed from Normal file to Executable file):

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import torch
 import os
 from pprint import pprint
@@ -1,5 +1,7 @@
 import Foundation

+// To use this in your own project, add llama.cpp as a swift package dependency
+// and uncomment this import line.
 // import llama

 enum LlamaError: Error {
@@ -4,6 +4,7 @@ import Foundation
 class LlamaState: ObservableObject {
     @Published var messageLog = ""
     @Published var cacheCleared = false
+    let NS_PER_S = 1_000_000_000.0

     private var llamaContext: LlamaContext?
     private var defaultModelUrl: URL? {
@@ -20,12 +21,12 @@ class LlamaState: ObservableObject {
     }

     func loadModel(modelUrl: URL?) throws {
-        messageLog += "Loading model...\n"
         if let modelUrl {
+            messageLog += "Loading model...\n"
             llamaContext = try LlamaContext.create_context(path: modelUrl.path())
             messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
         } else {
-            messageLog += "Could not locate model\n"
+            messageLog += "Load a model from the list below\n"
         }
     }

@@ -34,15 +35,29 @@ class LlamaState: ObservableObject {
             return
         }

+        let t_start = DispatchTime.now().uptimeNanoseconds
         await llamaContext.completion_init(text: text)
+        let t_heat_end = DispatchTime.now().uptimeNanoseconds
+        let t_heat = Double(t_heat_end - t_start) / NS_PER_S
+
         messageLog += "\(text)"

-        while await llamaContext.n_cur <= llamaContext.n_len {
+        while await llamaContext.n_cur < llamaContext.n_len {
             let result = await llamaContext.completion_loop()
             messageLog += "\(result)"
         }
+
+        let t_end = DispatchTime.now().uptimeNanoseconds
+        let t_generation = Double(t_end - t_heat_end) / NS_PER_S
+        let tokens_per_second = Double(await llamaContext.n_len) / t_generation
+
         await llamaContext.clear()
-        messageLog += "\n\ndone\n"
+        messageLog += """
+            \n
+            Done
+            Heat up took \(t_heat)s
+            Generated \(tokens_per_second) t/s\n
+            """
     }

     func bench() async {
@@ -56,10 +71,10 @@ class LlamaState: ObservableObject {
         messageLog += await llamaContext.model_info() + "\n"

         let t_start = DispatchTime.now().uptimeNanoseconds
-        await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
+        let _ = await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
         let t_end = DispatchTime.now().uptimeNanoseconds

-        let t_heat = Double(t_end - t_start) / 1_000_000_000.0
+        let t_heat = Double(t_end - t_start) / NS_PER_S
         messageLog += "Heat up time: \(t_heat) seconds, please wait...\n"

         // if more than 5 seconds, then we're probably running on a slow device
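The timing added here is plain wall-clock arithmetic over nanosecond timestamps; the same bookkeeping in a toy Python form (sleep calls stand in for real work, and all values are made up for illustration):

```python
# Toy illustration of the heat-up / tokens-per-second arithmetic added above.
import time

NS_PER_S = 1_000_000_000
n_len = 32                                   # assumed number of generated tokens

t_start = time.monotonic_ns()
time.sleep(0.01)                             # stands in for prompt processing ("heat up")
t_heat_end = time.monotonic_ns()
time.sleep(0.10)                             # stands in for generating n_len tokens
t_end = time.monotonic_ns()

t_heat = (t_heat_end - t_start) / NS_PER_S
tokens_per_second = n_len / ((t_end - t_heat_end) / NS_PER_S)
print(f"Heat up took {t_heat:.3f}s, generated {tokens_per_second:.1f} t/s")
```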
@ -42,46 +42,27 @@ struct ContentView: View {
|
||||||
Button("Send") {
|
Button("Send") {
|
||||||
sendText()
|
sendText()
|
||||||
}
|
}
|
||||||
.padding(8)
|
|
||||||
.background(Color.blue)
|
|
||||||
.foregroundColor(.white)
|
|
||||||
.cornerRadius(8)
|
|
||||||
|
|
||||||
Button("Bench") {
|
Button("Bench") {
|
||||||
bench()
|
bench()
|
||||||
}
|
}
|
||||||
.padding(8)
|
|
||||||
.background(Color.blue)
|
|
||||||
.foregroundColor(.white)
|
|
||||||
.cornerRadius(8)
|
|
||||||
|
|
||||||
Button("Clear") {
|
Button("Clear") {
|
||||||
clear()
|
clear()
|
||||||
}
|
}
|
||||||
.padding(8)
|
|
||||||
.background(Color.blue)
|
|
||||||
.foregroundColor(.white)
|
|
||||||
.cornerRadius(8)
|
|
||||||
|
|
||||||
Button("Copy") {
|
Button("Copy") {
|
||||||
UIPasteboard.general.string = llamaState.messageLog
|
UIPasteboard.general.string = llamaState.messageLog
|
||||||
}
|
}
|
||||||
-                    .padding(8)
-                    .background(Color.blue)
-                    .foregroundColor(.white)
-                    .cornerRadius(8)
-                }
+                }.buttonStyle(.bordered)
 
-            VStack {
+            VStack(alignment: .leading) {
                 DownloadButton(
                     llamaState: llamaState,
                     modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",
                     modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
                     filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
                 )
-                .font(.system(size: 12))
-                .padding(.top, 4)
-                .frame(maxWidth: .infinity, alignment: .leading)
 
                 DownloadButton(
                     llamaState: llamaState,
@@ -89,7 +70,6 @@ struct ContentView: View {
                     modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
                     filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
                 )
-                .font(.system(size: 12))
 
                 DownloadButton(
                     llamaState: llamaState,
@@ -97,8 +77,6 @@ struct ContentView: View {
                     modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
                     filename: "tinyllama-1.1b-f16.gguf"
                 )
-                .font(.system(size: 12))
-                .frame(maxWidth: .infinity, alignment: .leading)
 
                 DownloadButton(
                     llamaState: llamaState,
@@ -106,7 +84,6 @@ struct ContentView: View {
                     modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
                     filename: "phi-2-q4_0.gguf"
                 )
-                .font(.system(size: 12))
 
                 DownloadButton(
                     llamaState: llamaState,
@@ -114,8 +91,6 @@ struct ContentView: View {
                     modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
                     filename: "phi-2-q8_0.gguf"
                 )
-                .font(.system(size: 12))
-                .frame(maxWidth: .infinity, alignment: .leading)
 
                 DownloadButton(
                     llamaState: llamaState,
@@ -123,15 +98,15 @@ struct ContentView: View {
                     modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
                     filename: "mistral-7b-v0.1.Q4_0.gguf"
                 )
-                .font(.system(size: 12))
 
                 Button("Clear downloaded models") {
                     ContentView.cleanupModelCaches()
                     llamaState.cacheCleared = true
                 }
-                .padding(8)
-                .font(.system(size: 12))
             }
+            .padding(.top, 4)
+            .font(.system(size: 12))
+            .frame(maxWidth: .infinity, alignment: .leading)
         }
         .padding()
     }
@@ -93,7 +93,7 @@ struct DownloadButton: View {
                 print("Error: \(err.localizedDescription)")
             }
         }) {
-            Text("\(modelName) (Downloaded)")
+            Text("Load \(modelName)")
         }
     } else {
         Text("Unknown status")
@@ -24,7 +24,8 @@ endif()
 
 if (NOT MSVC)
     target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
 
 if(TARGET BUILD_INFO)
     add_dependencies(llava BUILD_INFO)
 endif()
@@ -32,5 +33,5 @@ endif()
 set(TARGET llava-cli)
 add_executable(llava-cli llava-cli.cpp)
 install(TARGETS llava-cli RUNTIME)
-target_link_libraries(llava-cli PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(llava PRIVATE cxx_std_11)
@@ -16,12 +16,19 @@
 #include "clip.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
 
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
 
-#define CLIP_DEBUG
-
 static std::string format(const char * fmt, ...) {
     va_list ap;
     va_list ap2;
@@ -139,6 +146,27 @@ static std::string get_ftype(int ftype) {
     }
 }
 
+//
+// image data
+//
+
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
 //
 // clip layers
 //
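Editor's note: the hunk above moves the image containers into the implementation and switches them from raw pointers to std::vector storage, so no manual new/delete is needed anymore. A small usage sketch, not part of the patch, mirroring the fill pattern that build_clip_img_from_data uses further down (the helper name fill_image_u8 is illustrative):

// Sketch: populate a clip_image_u8 from an interleaved RGB byte buffer.
#include <cstdint>
#include <cstring>
#include <vector>

static void fill_image_u8(clip_image_u8 & img, const uint8_t * rgb, int nx, int ny) {
    img.nx = nx;
    img.ny = ny;
    img.buf.resize(3 * nx * ny);              // RGBRGB... layout, owned by the vector
    std::memcpy(img.buf.data(), rgb, img.buf.size());
}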
@@ -196,39 +224,31 @@ struct clip_vision_model {
     struct ggml_tensor * mm_2_b;
 };
 
-// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
-struct clip_buffer {
-    uint8_t * data = NULL;
-    size_t size = 0;
-
-    void resize(size_t size) {
-        delete[] data;
-        data = new uint8_t[size];
-        this->size = size;
-    }
-
-    ~clip_buffer() { delete[] data; }
-};
-
 struct clip_ctx {
     bool has_text_encoder = false;
     bool has_vision_encoder = false;
     bool has_llava_projector = false;
 
     struct clip_vision_model vision_model;
 
     float image_mean[3];
     float image_std[3];
     bool use_gelu = false;
     int32_t ftype = 1;
-    struct ggml_context * ctx;
     struct gguf_context * ctx_gguf;
+    struct ggml_context * ctx_data;
+
+    std::vector<uint8_t> buf_compute_meta;
+
     // memory buffers to evaluate the model
-    clip_buffer buf_compute;
-    clip_buffer buf_alloc;
-    ggml_allocr * alloc = NULL;
+    ggml_backend_buffer_t params_buffer = NULL;
+    ggml_backend_buffer_t compute_buffer = NULL;
+    ggml_backend_t backend = NULL;
+    ggml_allocr * compute_alloc = NULL;
 };
 
-static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return nullptr;
@ -249,28 +269,24 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||||
//const int projection_dim = hparams.projection_dim;
|
//const int projection_dim = hparams.projection_dim;
|
||||||
const float eps = hparams.eps;
|
const float eps = hparams.eps;
|
||||||
int batch_size = imgs->size;
|
int batch_size = imgs->size;
|
||||||
if(ctx->has_llava_projector) {
|
if (ctx->has_llava_projector) {
|
||||||
GGML_ASSERT(batch_size == 1);
|
GGML_ASSERT(batch_size == 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto & buf_compute = ctx->buf_compute;
|
|
||||||
|
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
/*.mem_size =*/ buf_compute.size,
|
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
||||||
/*.mem_buffer =*/ buf_compute.data,
|
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
|
||||||
/*.no_alloc =*/ false,
|
/*.no_alloc =*/ true,
|
||||||
};
|
};
|
||||||
|
|
||||||
params.no_alloc = true;
|
|
||||||
|
|
||||||
struct ggml_context * ctx0 = ggml_init(params);
|
struct ggml_context * ctx0 = ggml_init(params);
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||||
|
|
||||||
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
|
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
|
||||||
ggml_allocr_alloc(ctx->alloc, inp_raw);
|
ggml_allocr_alloc(ctx->compute_alloc, inp_raw);
|
||||||
|
|
||||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||||
float * data = (float *)ggml_get_data(inp_raw);
|
float * data = (float *)malloc(ggml_nbytes(inp_raw));
|
||||||
|
|
||||||
for (size_t i = 0; i < imgs->size; i++) {
|
for (size_t i = 0; i < imgs->size; i++) {
|
||||||
const int nx = imgs->data[i].nx;
|
const int nx = imgs->data[i].nx;
|
||||||
|
@ -283,12 +299,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||||
for (int k = 0; k < 3; k++) {
|
for (int k = 0; k < 3; k++) {
|
||||||
for (int y = 0; y < ny; y++) {
|
for (int y = 0; y < ny; y++) {
|
||||||
for (int x = 0; x < nx; x++) {
|
for (int x = 0; x < nx; x++) {
|
||||||
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].data[3 * (y * nx + x) + k];
|
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
|
||||||
|
free(data);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||||
|
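Editor's note: with no_alloc graphs the input tensors may live in device memory, so the hunk above stops writing through ggml_get_data and instead stages the pixel data in a host buffer and uploads it with ggml_backend_tensor_set. A condensed sketch of that pattern, not part of the patch (the helper name set_tensor_from_host is illustrative; the same sequence is used for inp_raw, embeddings, positions and patches in this file):

// Sketch: upload host data into a graph tensor that may reside in a backend buffer.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <cstdlib>
#include <cstring>

static void set_tensor_from_host(ggml_allocr * alloc, struct ggml_tensor * t, const void * src) {
    if (ggml_allocr_is_measure(alloc)) {
        return;                                   // measure pass: no real buffer behind the tensor yet
    }
    void * staging = malloc(ggml_nbytes(t));      // host-side staging buffer
    memcpy(staging, src, ggml_nbytes(t));
    ggml_backend_tensor_set(t, staging, 0, ggml_nbytes(t));   // copy into the backend buffer
    free(staging);
}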
@ -298,36 +316,39 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||||
|
|
||||||
// concat class_embeddings and patch_embeddings
|
// concat class_embeddings and patch_embeddings
|
||||||
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||||
ggml_allocr_alloc(ctx->alloc, embeddings);
|
ggml_allocr_alloc(ctx->compute_alloc, embeddings);
|
||||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||||
ggml_set_zero(embeddings);
|
void* zero_mem = malloc(ggml_nbytes(embeddings));
|
||||||
|
memset(zero_mem, 0, ggml_nbytes(embeddings));
|
||||||
|
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
|
||||||
|
free(zero_mem);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size);
|
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
||||||
ggml_allocr_alloc(ctx->alloc, temp);
|
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
||||||
|
|
||||||
embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1],
|
embeddings = ggml_acc(ctx0, embeddings, inp,
|
||||||
embeddings->nb[2], embeddings->nb[3], 0);
|
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||||
embeddings =
|
|
||||||
ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
|
||||||
|
|
||||||
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
||||||
ggml_allocr_alloc(ctx->alloc, positions);
|
ggml_allocr_alloc(ctx->compute_alloc, positions);
|
||||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||||
|
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
||||||
for (int i = 0; i < num_positions; i++) {
|
for (int i = 0; i < num_positions; i++) {
|
||||||
ggml_set_i32_1d(positions, i, i);
|
positions_data[i] = i;
|
||||||
}
|
}
|
||||||
|
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||||
|
free(positions_data);
|
||||||
}
|
}
|
||||||
|
|
||||||
embeddings =
|
embeddings =
|
||||||
ggml_add(ctx0, embeddings, ggml_repeat(ctx0, ggml_get_rows(ctx0, model.position_embeddings, positions), embeddings));
|
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
|
||||||
|
|
||||||
// pre-layernorm
|
// pre-layernorm
|
||||||
{
|
{
|
||||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||||
|
|
||||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.pre_ln_w, embeddings), embeddings),
|
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
|
||||||
ggml_repeat(ctx0, model.pre_ln_b, embeddings));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// loop over layers
|
// loop over layers
|
||||||
|
@ -340,15 +361,15 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||||
{
|
{
|
||||||
cur = ggml_norm(ctx0, cur, eps);
|
cur = ggml_norm(ctx0, cur, eps);
|
||||||
|
|
||||||
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_w, cur), cur),
|
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
|
||||||
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
model.layers[il].ln_1_b);
|
||||||
}
|
}
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
|
|
||||||
struct ggml_tensor * Q =
|
struct ggml_tensor * Q =
|
||||||
ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), ggml_mul_mat(ctx0, model.layers[il].q_w, cur));
|
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
|
||||||
|
|
||||||
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
|
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
|
||||||
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
|
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
|
||||||
|
@ -356,14 +377,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||||
Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
|
Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
|
||||||
|
|
||||||
struct ggml_tensor * K =
|
struct ggml_tensor * K =
|
||||||
ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), ggml_mul_mat(ctx0, model.layers[il].k_w, cur));
|
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
|
||||||
|
|
||||||
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
|
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
|
||||||
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
||||||
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
|
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
|
||||||
|
|
||||||
struct ggml_tensor * V =
|
struct ggml_tensor * V =
|
||||||
ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), ggml_mul_mat(ctx0, model.layers[il].v_w, cur));
|
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
|
||||||
|
|
||||||
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
|
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
|
||||||
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
||||||
|
@ -379,7 +400,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||||
}
|
}
|
||||||
|
|
||||||
// attention output
|
// attention output
|
||||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].o_b, cur), ggml_mul_mat(ctx0, model.layers[il].o_w, cur));
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
|
||||||
|
|
||||||
// re-add the layer input, e.g., residual
|
// re-add the layer input, e.g., residual
|
||||||
cur = ggml_add(ctx0, cur, embeddings);
|
cur = ggml_add(ctx0, cur, embeddings);
|
||||||
|
@ -390,12 +411,11 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||||
{
|
{
|
||||||
cur = ggml_norm(ctx0, cur, eps);
|
cur = ggml_norm(ctx0, cur, eps);
|
||||||
|
|
||||||
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_w, cur), cur),
|
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
|
||||||
ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
|
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
|
||||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), cur);
|
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
|
||||||
|
|
||||||
if (ctx->use_gelu) {
|
if (ctx->use_gelu) {
|
||||||
cur = ggml_gelu_inplace(ctx0, cur);
|
cur = ggml_gelu_inplace(ctx0, cur);
|
||||||
|
@ -404,7 +424,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||||
}
|
}
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
|
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
|
||||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), cur);
|
cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
|
||||||
|
|
||||||
// residual 2
|
// residual 2
|
||||||
cur = ggml_add(ctx0, embeddings, cur);
|
cur = ggml_add(ctx0, embeddings, cur);
|
||||||
|
@ -417,23 +437,26 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||||
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
||||||
|
|
||||||
struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
|
struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
|
||||||
ggml_allocr_alloc(ctx->alloc, patches);
|
ggml_allocr_alloc(ctx->compute_alloc, patches);
|
||||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||||
for (int i = 0; i < num_patches; ++i) {
|
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||||
ggml_set_i32_1d(patches, i, i+1);
|
for (int i = 0; i < num_patches; i++) {
|
||||||
|
patches_data[i] = i + 1;
|
||||||
}
|
}
|
||||||
|
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
||||||
|
free(patches_data);
|
||||||
}
|
}
|
||||||
|
|
||||||
embeddings = ggml_get_rows(ctx0, embeddings, patches);
|
embeddings = ggml_get_rows(ctx0, embeddings, patches);
|
||||||
|
|
||||||
// mm projection 0
|
// mm projection 0
|
||||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||||
embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_0_b, embeddings), embeddings);
|
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||||
|
|
||||||
embeddings = ggml_gelu(ctx0, embeddings);
|
embeddings = ggml_gelu(ctx0, embeddings);
|
||||||
|
|
||||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||||
embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_2_b, embeddings), embeddings);
|
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||||
}
|
}
|
||||||
|
|
||||||
// build the graph
|
// build the graph
|
||||||
|
@ -446,7 +469,6 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
||||||
|
|
||||||
// read and create ggml_context containing the tensors and their data
|
// read and create ggml_context containing the tensors and their data
|
||||||
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
|
|
||||||
struct ggml_context * meta = NULL;
|
struct ggml_context * meta = NULL;
|
||||||
|
|
||||||
struct gguf_init_params params = {
|
struct gguf_init_params params = {
|
||||||
|
@ -479,7 +501,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
printf("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
printf("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||||
// kv
|
// kv
|
||||||
if (verbosity >= 3) {
|
if (verbosity >= 3) {
|
||||||
const int n_kv = gguf_get_n_kv(ctx);
|
const int n_kv = gguf_get_n_kv(ctx);
|
||||||
|
@@ -493,27 +515,38 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
     // data
-    size_t ctx_size = 0;
+    size_t buffer_size = 0;
     {
-        const int n_tensors = gguf_get_n_tensors(ctx);
-
         for (int i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name(ctx, i);
             const size_t offset = gguf_get_tensor_offset(ctx, i);
 
             struct ggml_tensor * cur = ggml_get_tensor(meta, name);
-            ctx_size += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
             size_t tensor_size = ggml_nbytes(cur);
-            size_t padded_size = ggml_nbytes_pad(cur);
-            ctx_size += padded_size;
+            buffer_size += tensor_size;
             if (verbosity >= 3) {
-                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i,
-                       ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset);
+                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i,
+                       ggml_n_dims(cur), cur->name, tensor_size, offset);
             }
         }
     }
 
+    buffer_size += n_tensors * 128 /* CLIP PADDING */;
+
     clip_ctx * new_clip = new clip_ctx;
+
+#ifdef GGML_USE_CUBLAS
+    new_clip->backend = ggml_backend_cuda_init(0);
+    printf("%s: CLIP using CUDA backend\n", __func__);
+#endif
+
+#ifdef GGML_USE_METAL
+    new_clip->backend = ggml_backend_metal_init();
+    printf("%s: CLIP using Metal backend\n", __func__);
+#endif
+
+    if (!new_clip->backend) {
+        new_clip->backend = ggml_backend_cpu_init();
+        printf("%s: CLIP using CPU backend\n", __func__);
+    }
+
     // model size and capabilities
     {
|
@ -539,21 +572,24 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||||
printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||||
printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||||
printf("%s: model size: %.2f MB\n", __func__, (ctx_size / 1024.0 / 1024.0));
|
printf("%s: model size: %.2f MB\n", __func__, buffer_size / 1024.0 / 1024.0);
|
||||||
printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, buffer_size / (1024.0 * 1024.0), n_tensors);
|
||||||
|
|
||||||
// load tensors
|
// load tensors
|
||||||
{
|
{
|
||||||
|
std::vector<uint8_t> read_buf;
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
/*.mem_size =*/ ctx_size,
|
/*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(),
|
||||||
/*.mem_buffer =*/ NULL,
|
/*.mem_buffer =*/ NULL,
|
||||||
/*.no_alloc =*/ false,
|
/*.no_alloc =*/ true,
|
||||||
};
|
};
|
||||||
|
|
||||||
new_clip->ctx = ggml_init(params);
|
new_clip->ctx_data = ggml_init(params);
|
||||||
if (!new_clip->ctx) {
|
if (!new_clip->ctx_data) {
|
||||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||||
clip_free(new_clip);
|
clip_free(new_clip);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
@ -566,13 +602,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
// add tensors to context
|
||||||
for (int i = 0; i < n_tensors; ++i) {
|
for (int i = 0; i < n_tensors; ++i) {
|
||||||
const char * name = gguf_get_tensor_name(ctx, i);
|
const char * name = gguf_get_tensor_name(ctx, i);
|
||||||
struct ggml_tensor * t = ggml_get_tensor(meta, name);
|
struct ggml_tensor * t = ggml_get_tensor(meta, name);
|
||||||
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t);
|
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
|
||||||
ggml_set_name(cur, name);
|
ggml_set_name(cur, name);
|
||||||
|
}
|
||||||
|
|
||||||
|
// alloc memory and offload data
|
||||||
|
new_clip->params_buffer = ggml_backend_alloc_buffer(new_clip->backend, buffer_size);
|
||||||
|
ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer);
|
||||||
|
for (int i = 0; i < n_tensors; ++i) {
|
||||||
|
const char * name = gguf_get_tensor_name(ctx, i);
|
||||||
|
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
|
||||||
|
ggml_allocr_alloc(alloc, cur);
|
||||||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||||
fin.seekg(offset, std::ios::beg);
|
fin.seekg(offset, std::ios::beg);
|
||||||
if (!fin) {
|
if (!fin) {
|
||||||
|
@@ -580,10 +624,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                 clip_free(new_clip);
                 return nullptr;
             }
-            fin.read(reinterpret_cast<char *>(cur->data), ggml_nbytes(t));
+            int num_bytes = ggml_nbytes(cur);
+            if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
+                // for the CPU and Metal backend, we can read directly into the tensor
+                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
+            } else {
+                // read into a temporary buffer first, then copy to device memory
+                read_buf.resize(num_bytes);
+                fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
+                ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
+            }
         }
+        ggml_allocr_free(alloc);
         fin.close();
     }
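Editor's note: the two loading hunks above allocate a params buffer on the selected backend and then either read tensor data straight into host-visible memory or stage it and upload with ggml_backend_tensor_set. A self-contained sketch of that per-tensor read, not part of the patch (the helper name load_tensor is illustrative; fin, params_buffer and read_buf correspond to the objects in the hunks above):

// Sketch: stream one tensor's data from a GGUF file into a backend buffer.
#include "ggml.h"
#include "ggml-backend.h"
#include <fstream>
#include <vector>

static void load_tensor(std::ifstream & fin, size_t offset, struct ggml_tensor * cur,
                        ggml_backend_buffer_t params_buffer, std::vector<uint8_t> & read_buf) {
    fin.seekg(offset, std::ios::beg);
    const size_t num_bytes = ggml_nbytes(cur);
    if (ggml_backend_buffer_is_host(params_buffer)) {
        // CPU/Metal: the tensor memory is mapped on the host, read in place
        fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
    } else {
        // device backends (e.g. CUDA): stage on the host, then upload
        read_buf.resize(num_bytes);
        fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
        ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
    }
}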
|
|
||||||
|
@ -592,20 +644,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
// load vision model
|
// load vision model
|
||||||
auto & vision_model = new_clip->vision_model;
|
auto & vision_model = new_clip->vision_model;
|
||||||
auto & hparams = vision_model.hparams;
|
auto & hparams = vision_model.hparams;
|
||||||
hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
|
hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
|
||||||
hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
|
hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
|
||||||
hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision"));
|
hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision"));
|
||||||
hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
|
hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
|
||||||
hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
|
hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
|
||||||
hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
|
hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
|
||||||
hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
|
hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
|
||||||
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
|
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
|
||||||
|
|
||||||
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
|
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
|
||||||
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
|
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
|
new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
|
||||||
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
|
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (verbosity >= 2) {
|
if (verbosity >= 2) {
|
||||||
|
@ -619,35 +671,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
printf("v_n_layer %d\n", hparams.n_layer);
|
printf("v_n_layer %d\n", hparams.n_layer);
|
||||||
}
|
}
|
||||||
|
|
||||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx, TN_PATCH_EMBD);
|
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||||
vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD);
|
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
|
||||||
vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v"));
|
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||||
vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight"));
|
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
|
||||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias"));
|
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
|
||||||
vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight"));
|
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||||
vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias"));
|
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
|
||||||
vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight"));
|
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||||
vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias"));
|
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||||
|
|
||||||
vision_model.layers.resize(hparams.n_layer);
|
vision_model.layers.resize(hparams.n_layer);
|
||||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||||
auto & layer = vision_model.layers[il];
|
auto & layer = vision_model.layers[il];
|
||||||
layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "weight"));
|
layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
|
||||||
layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "weight"));
|
layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight"));
|
||||||
layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "weight"));
|
layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight"));
|
||||||
layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "weight"));
|
layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
|
||||||
layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "weight"));
|
layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight"));
|
||||||
layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "weight"));
|
layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight"));
|
||||||
layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "weight"));
|
layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight"));
|
||||||
layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "weight"));
|
layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight"));
|
||||||
layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "bias"));
|
layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias"));
|
||||||
layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "bias"));
|
layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias"));
|
||||||
layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "bias"));
|
layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias"));
|
||||||
layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "bias"));
|
layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
|
||||||
layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "bias"));
|
layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias"));
|
||||||
layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "bias"));
|
layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias"));
|
||||||
layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "bias"));
|
layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias"));
|
||||||
layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "bias"));
|
layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -655,45 +707,45 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
     new_clip->ctx_gguf = ctx;
 
     // measure mem requirement and allocate
     {
-        static const size_t tensor_alignment = 32;
-        new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
-        new_clip->alloc = ggml_allocr_new_measure(tensor_alignment);
+        new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
+        new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
         clip_image_f32_batch batch;
         batch.size = 1;
         ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
-        size_t alloc_size = ggml_allocr_alloc_graph(new_clip->alloc, gf) + tensor_alignment;
-        ggml_allocr_free(new_clip->alloc);
-        new_clip->buf_alloc.resize(alloc_size);
-        new_clip->alloc = ggml_allocr_new(new_clip->buf_alloc.data, new_clip->buf_alloc.size, tensor_alignment);
+        size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(new_clip->compute_alloc, gf);
+        ggml_allocr_free(new_clip->compute_alloc);
+        new_clip->compute_buffer = ggml_backend_alloc_buffer(new_clip->backend, compute_memory_buffer_size);
+        new_clip->compute_alloc = ggml_allocr_new_from_buffer(new_clip->compute_buffer);
 
-        printf("%s: total allocated memory: %.2f MB\n", __func__, (new_clip->buf_compute.size + alloc_size)/1024.0/1024.0);
+        printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
     }
 
     return new_clip;
 }
 
-clip_image_u8 * make_clip_image_u8() {
-    auto img = new clip_image_u8();
-    return img;
+struct clip_image_u8 * clip_image_u8_init() {
+    return new clip_image_u8();
 }
-clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }
 
-void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; }
-void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; }
+struct clip_image_f32 * clip_image_f32_init() {
+    return new clip_image_f32();
+}
+
+void clip_image_u8_free (struct clip_image_u8 * img) { delete img; }
+void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
 
 static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
     img->nx = nx;
     img->ny = ny;
-    img->size = nx * ny * 3;
-    img->data = new uint8_t[img->size]();
-    memcpy(img->data, data, img->size);
+    img->buf.resize(3 * nx * ny);
+    memcpy(img->buf.data(), data, img->buf.size());
 }
 
 bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
     int nx, ny, nc;
-    auto data = stbi_load(fname, &nx, &ny, &nc, 3);
+    auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
     if (!data) {
         fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
         return false;
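Editor's note: the measure block above swaps the fixed-alignment measuring allocator for a backend-aware one and backs the compute graph with a real backend buffer. A minimal sketch of that measure-then-allocate pattern, not part of the patch (create_compute_alloc and build_graph are illustrative names standing in for the clip-specific code):

// Sketch: size a compute buffer by dry-running the graph, then allocate it on the backend.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static ggml_allocr * create_compute_alloc(ggml_backend_t backend,
                                          ggml_cgraph * (*build_graph)(void),
                                          ggml_backend_buffer_t * out_buffer) {
    // 1) measure: walk the graph with a measuring allocator to learn the required size
    ggml_allocr * measure = ggml_allocr_new_measure_from_backend(backend);
    const size_t mem_size = ggml_allocr_alloc_graph(measure, build_graph());
    ggml_allocr_free(measure);

    // 2) allocate: back future graph allocations with a real backend buffer of that size
    *out_buffer = ggml_backend_alloc_buffer(backend, mem_size);
    return ggml_allocr_new_from_buffer(*out_buffer);
}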
|
@ -705,7 +757,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
||||||
|
|
||||||
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
|
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
|
||||||
int nx, ny, nc;
|
int nx, ny, nc;
|
||||||
auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||||
if (!data) {
|
if (!data) {
|
||||||
fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
|
fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
|
@ -717,7 +769,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
||||||
|
|
||||||
// normalize: x = (x - mean) / std
|
// normalize: x = (x - mean) / std
|
||||||
// TODO: implement bicubic interpolation instead of linear.
|
// TODO: implement bicubic interpolation instead of linear.
|
||||||
bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
|
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
printf("This gguf file seems to have no vision encoder\n");
|
printf("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
|
@ -726,18 +778,17 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||||
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
|
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
|
||||||
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
|
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
|
||||||
|
|
||||||
clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily
|
clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
|
||||||
if (pad2square && img->nx != img->ny) {
|
if (pad2square && img->nx != img->ny) {
|
||||||
int longer_side = std::max(img->nx, img->ny);
|
int longer_side = std::max(img->nx, img->ny);
|
||||||
temp->nx = longer_side;
|
temp->nx = longer_side;
|
||||||
temp->ny = longer_side;
|
temp->ny = longer_side;
|
||||||
temp->size = 3 * longer_side * longer_side;
|
temp->buf.resize(3 * longer_side * longer_side);
|
||||||
temp->data = new uint8_t[temp->size]();
|
const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
|
||||||
uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
|
|
||||||
|
|
||||||
// fill with background color
|
// fill with background color
|
||||||
for (size_t i = 0; i < temp->size; i++) {
|
for (size_t i = 0; i < temp->buf.size(); i++) {
|
||||||
temp->data[i] = bc[i % 3];
|
temp->buf[i] = bc[i % 3];
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy from the input image
|
// copy from the input image
|
||||||
|
@ -745,17 +796,16 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||||
for (int x = 0; x < img->nx; x++) {
|
for (int x = 0; x < img->nx; x++) {
|
||||||
const int i = 3 * (y * img->nx + x);
|
const int i = 3 * (y * img->nx + x);
|
||||||
const int j = 3 * (y * temp->nx + x);
|
const int j = 3 * (y * temp->nx + x);
|
||||||
temp->data[j] = img->data[i];
|
temp->buf[j] = img->buf[i];
|
||||||
temp->data[j+1] = img->data[i+1];
|
temp->buf[j+1] = img->buf[i+1];
|
||||||
temp->data[j+2] = img->data[i+2];
|
temp->buf[j+2] = img->buf[i+2];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
temp->nx = img->nx;
|
temp->nx = img->nx;
|
||||||
temp->ny = img->ny;
|
temp->ny = img->ny;
|
||||||
temp->size = img->size;
|
temp->buf.resize(img->buf.size());
|
||||||
temp->data = new uint8_t[temp->size]();
|
memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
|
||||||
memcpy(&temp->data[0], &img->data[0], temp->size); // copy
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const int nx = temp->nx;
|
const int nx = temp->nx;
|
||||||
|
@ -766,8 +816,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||||
|
|
||||||
res->nx = nx2;
|
res->nx = nx2;
|
||||||
res->ny = ny2;
|
res->ny = ny2;
|
||||||
res->size = 3 * nx2 * ny2;
|
res->buf.resize(3 * nx2 * ny2);
|
||||||
res->data = new float[res->size]();
|
|
||||||
|
|
||||||
const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
|
const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
|
||||||
|
|
||||||
|
@ -798,10 +847,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||||
const int j10 = 3 * (y1 * nx + x0) + c;
|
const int j10 = 3 * (y1 * nx + x0) + c;
|
||||||
const int j11 = 3 * (y1 * nx + x1) + c;
|
const int j11 = 3 * (y1 * nx + x1) + c;
|
||||||
|
|
||||||
const float v00 = temp->data[j00];
|
const float v00 = temp->buf[j00];
|
||||||
const float v01 = temp->data[j01];
|
const float v01 = temp->buf[j01];
|
||||||
const float v10 = temp->data[j10];
|
const float v10 = temp->buf[j10];
|
||||||
const float v11 = temp->data[j11];
|
const float v11 = temp->buf[j11];
|
||||||
|
|
||||||
const float v0 = v00 * (1.0f - dx) + v01 * dx;
|
const float v0 = v00 * (1.0f - dx) + v01 * dx;
|
||||||
const float v1 = v10 * (1.0f - dx) + v11 * dx;
|
const float v1 = v10 * (1.0f - dx) + v11 * dx;
|
||||||
|
@ -812,7 +861,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||||
|
|
||||||
const int i = 3 * (y * nx3 + x) + c;
|
const int i = 3 * (y * nx3 + x) + c;
|
||||||
|
|
||||||
res->data[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
|
res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -822,12 +871,13 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||||
}
|
}
|
||||||
|
|
||||||
void clip_free(clip_ctx * ctx) {
|
void clip_free(clip_ctx * ctx) {
|
||||||
ggml_free(ctx->ctx);
|
ggml_free(ctx->ctx_data);
|
||||||
gguf_free(ctx->ctx_gguf);
|
gguf_free(ctx->ctx_gguf);
|
||||||
|
|
||||||
delete ctx;
|
delete ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
printf("This gguf file seems to have no vision encoder\n");
|
printf("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
|
@ -839,8 +889,7 @@ bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32
|
||||||
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
|
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
||||||
|
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
printf("This gguf file seems to have no vision encoder\n");
|
printf("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
|
@ -852,29 +901,29 @@ bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const cl
|
||||||
}
|
}
|
||||||
|
|
||||||
// reset alloc buffer to clean the memory from previous invocations
|
// reset alloc buffer to clean the memory from previous invocations
|
||||||
ggml_allocr_reset(ctx->alloc);
|
ggml_allocr_reset(ctx->compute_alloc);
|
||||||
|
|
||||||
// build the inference graph
|
// build the inference graph
|
||||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
||||||
ggml_allocr_alloc_graph(ctx->alloc, gf);
|
ggml_allocr_alloc_graph(ctx->compute_alloc, gf);
|
||||||
|
|
||||||
struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
|
if (ggml_backend_is_cpu(ctx->backend)) {
|
||||||
if (plan.work_size > 0) {
|
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
||||||
plan.work_data = (uint8_t *)malloc(plan.work_size);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_graph_compute(gf, &plan);
|
#ifdef GGML_USE_METAL
|
||||||
|
if (ggml_backend_is_metal(ctx->backend)) {
|
||||||
|
ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ggml_backend_graph_compute(ctx->backend, gf);
|
||||||
|
|
||||||
// the last node is the embedding tensor
|
// the last node is the embedding tensor
|
||||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
||||||
|
|
||||||
// copy the embeddings to the location passed by the user
|
// copy the embeddings to the location passed by the user
|
||||||
memcpy(vec, ggml_get_data_f32(embeddings), ggml_nbytes(embeddings));
|
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||||
|
|
||||||
if (plan.work_size > 0) {
|
|
||||||
free(plan.work_data);
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
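Editor's note: the encode hunk above drops the manual ggml_cplan/work-buffer handling and runs the graph through ggml_backend_graph_compute, setting the thread count per backend and reading the result back with ggml_backend_tensor_get. A sketch of that execution path, not part of the patch (run_graph and its parameters are illustrative names):

// Sketch: run a built graph on whichever backend was initialized, then read back the output.
#include "ggml.h"
#include "ggml-backend.h"
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

static void run_graph(ggml_backend_t backend, ggml_cgraph * gf, int n_threads, float * out) {
    if (ggml_backend_is_cpu(backend)) {
        ggml_backend_cpu_set_n_threads(backend, n_threads);   // replaces the old ggml_cplan work buffer
    }
#ifdef GGML_USE_METAL
    if (ggml_backend_is_metal(backend)) {
        ggml_backend_metal_set_n_cb(backend, n_threads);
    }
#endif
    ggml_backend_graph_compute(backend, gf);

    struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];  // the last node holds the embeddings
    ggml_backend_tensor_get(result, out, 0, ggml_nbytes(result));
}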
|
@ -883,31 +932,32 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||||
ggml_type type = GGML_TYPE_Q4_1;
|
ggml_type type = GGML_TYPE_Q4_1;
|
||||||
|
|
||||||
switch (itype) {
|
switch (itype) {
|
||||||
case 2:
|
case 2:
|
||||||
type = GGML_TYPE_Q4_0;
|
type = GGML_TYPE_Q4_0;
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3:
|
||||||
type = GGML_TYPE_Q4_1;
|
type = GGML_TYPE_Q4_1;
|
||||||
break;
|
break;
|
||||||
case 6:
|
case 6:
|
||||||
type = GGML_TYPE_Q5_0;
|
type = GGML_TYPE_Q5_0;
|
||||||
break;
|
break;
|
||||||
case 7:
|
case 7:
|
||||||
type = GGML_TYPE_Q5_1;
|
type = GGML_TYPE_Q5_1;
|
||||||
break;
|
break;
|
||||||
case 8:
|
case 8:
|
||||||
type = GGML_TYPE_Q8_0;
|
type = GGML_TYPE_Q8_0;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
|
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
|
|
||||||
auto ctx_clip = clip_model_load(fname_inp, 2);
|
auto * ctx_clip = clip_model_load(fname_inp, 2);
|
||||||
const auto & ctx_src = ctx_clip->ctx_gguf;
|
|
||||||
const auto & ctx_data = ctx_clip->ctx;
|
|
||||||
|
|
||||||
auto ctx_out = gguf_init_empty();
|
const auto & ctx_src = ctx_clip->ctx_gguf;
|
||||||
|
const auto & ctx_data = ctx_clip->ctx_data;
|
||||||
|
|
||||||
|
auto * ctx_out = gguf_init_empty();
|
||||||
gguf_set_kv(ctx_out, ctx_src);
|
gguf_set_kv(ctx_out, ctx_src);
|
||||||
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
||||||
gguf_set_val_u32(ctx_out, "general.file_type", itype);
|
gguf_set_val_u32(ctx_out, "general.file_type", itype);
|
||||||
|
@ -1045,8 +1095,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||||
gguf_free(ctx_out);
|
gguf_free(ctx_out);
|
||||||
|
|
||||||
{
|
{
|
||||||
printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
||||||
printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
||||||
|
|
||||||
int64_t sum_all = 0;
|
int64_t sum_all = 0;
|
||||||
for (size_t i = 0; i < hist_all.size(); ++i) {
|
for (size_t i = 0; i < hist_all.size(); ++i) {
|
||||||
|
|
|
@@ -35,31 +35,14 @@ struct clip_vision_hparams {
     float eps;
 };
 
-/** load mmproj model */
-CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
-/** free mmproj model */
+CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
+
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
-size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-int clip_n_patches(const struct clip_ctx * ctx);
-int clip_n_mmproj_embd(const struct clip_ctx * ctx);
-
-// RGB uint8 image
-struct clip_image_u8 {
-    int nx;
-    int ny;
-    uint8_t * data = NULL;
-    size_t size;
-};
-
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-struct clip_image_f32 {
-    int nx;
-    int ny;
-    float * data = NULL;
-    size_t size;
-};
+CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+
+CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
+CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
@@ -71,21 +54,22 @@ struct clip_image_f32_batch {
     size_t size;
 };
 
-struct clip_image_u8 * make_clip_image_u8();
-struct clip_image_f32 * make_clip_image_f32();
-CLIP_API void clip_image_u8_free(clip_image_u8 * img);
-CLIP_API void clip_image_f32_free(clip_image_f32 * img);
+CLIP_API struct clip_image_u8 * clip_image_u8_init ();
+CLIP_API struct clip_image_f32 * clip_image_f32_init();
+
+CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
+CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
 
 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
 
-bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
-bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
-
-bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
-                             float * vec);
-
-bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
+CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square);
+CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
+CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+
+CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
 #ifdef __cplusplus
 }
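Editor's note: the header hunks above rename the constructors (make_* becomes *_init) and export the full pipeline through CLIP_API. A minimal end-to-end usage sketch of the renamed C API, not part of the patch; the model path, image path, thread count and the printf wording are placeholders, and error handling is kept to the bare minimum:

// Sketch: load a mmproj model, preprocess an image and encode it to an embedding.
#include "clip.h"
#include <cstdio>
#include <cstdlib>

int main() {
    struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", /*verbosity=*/1);
    if (!ctx) return 1;

    struct clip_image_u8 * img = clip_image_u8_init();
    struct clip_image_f32 * res = clip_image_f32_init();

    if (clip_image_load_from_file("image.jpg", img) &&
        clip_image_preprocess(ctx, img, res, /*pad2square=*/true)) {
        float * vec = (float *) malloc(clip_embd_nbytes(ctx));
        clip_image_encode(ctx, /*n_threads=*/4, res, vec);
        printf("encoded %d patches into a %d-dim projection\n",
               clip_n_patches(ctx), clip_n_mmproj_embd(ctx));
        free(vec);
    }

    clip_image_f32_free(res);
    clip_image_u8_free(img);
    clip_free(ctx);
    return 0;
}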
@@ -39,73 +39,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
     return true;
 }

-static llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
-    auto & sparams = params.sparams;
-
-    // out of user input, sample next token
-    const float   temp      = sparams.temp;
-    const int32_t top_k     = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
-    const float   top_p     = sparams.top_p;
-    const float   tfs_z     = sparams.tfs_z;
-    const float   typical_p = sparams.typical_p;
-    // const int32_t repeat_last_n   = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
-    // const float   repeat_penalty  = sparams.repeat_penalty;
-    // const float   alpha_presence  = sparams.presence_penalty;
-    // const float   alpha_frequency = sparams.frequency_penalty;
-    const int     mirostat     = sparams.mirostat;
-    const float   mirostat_tau = sparams.mirostat_tau;
-    const float   mirostat_eta = sparams.mirostat_eta;
-    // const bool    penalize_nl    = sparams.penalize_nl;
-
-    llama_token id = 0;
-    {
-        auto logits  = llama_get_logits(ctx_llama);
-        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
-
-        // Apply params.logit_bias map
-        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
-            logits[it->first] += it->second;
-        }
-
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-        if (temp <= 0) {
-            // Greedy sampling
-            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
-        } else {
-            if (mirostat == 1) {
-                static float mirostat_mu = 2.0f * mirostat_tau;
-                const int mirostat_m = 100;
-                llama_sample_temp(ctx_llama, &candidates_p, temp);
-                id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-            } else if (mirostat == 2) {
-                static float mirostat_mu = 2.0f * mirostat_tau;
-                llama_sample_temp(ctx_llama, &candidates_p, temp);
-                id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-            } else {
-                // Temperature sampling
-                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
-                llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
-                llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
-                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
-                llama_sample_temp(ctx_llama, &candidates_p, temp);
-                id = llama_sample_token(ctx_llama, &candidates_p);
-            }
-        }
-    }
-
-    return id;
-}
-
-static const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
-    int id = sample_id(ctx_llama, params);
+// TODO: use common/sampling.h
+static const char * sample(struct llama_sampling_context * ctx_sampling,
+                           struct llama_context * ctx_llama,
+                           int * n_past) {
+    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
+    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
     static std::string ret;
     if (id == llama_token_eos(llama_get_model(ctx_llama))) {
         ret = "</s>";

@@ -174,8 +112,8 @@ struct llava_context {
 };

 static void show_additional_info(int /*argc*/, char ** argv) {
-    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+    fprintf(stderr, "\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    fprintf(stderr, "  note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

 static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {

@@ -185,7 +123,7 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
     auto prompt = params->prompt;
     if (prompt_contains_image(prompt)) {
         if (!params->image.empty()) {
-            printf("using base64 encoded image instead of command line image path\n");
+            fprintf(stderr, "using base64 encoded image instead of command line image path\n");
         }
         embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
         if (!embed) {

@@ -217,16 +155,19 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     // generate the response

-    printf("\n");
+    fprintf(stderr, "\n");
+
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);

     for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
+        const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
         if (strcmp(tmp, "</s>") == 0) break;

         printf("%s", tmp);
         fflush(stdout);
     }

+    llama_sampling_free(ctx_sampling);
     printf("\n");
 }
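Taken together, the llava-cli hunks above replace the hand-rolled sampler with the shared helpers from common/sampling.h. A consolidated sketch of the new flow follows; the token-to-text step is an assumption (that part of `sample()` is outside the hunk), and `llama_token_to_piece()` from common is the likely helper. The real helper presumably also feeds the sampled token back into the context (hence the `n_past` pointer), which is elided here.

```cpp
// Minimal sketch of the refactored generation loop, not a complete program.
// Assumes ctx_llava, params, max_tgt_len and n_past are set up as in llava-cli.
static const char * sample(struct llama_sampling_context * ctx_sampling,
                           struct llama_context * ctx_llama, int * n_past) {
    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
    static std::string ret;
    ret = (id == llama_token_eos(llama_get_model(ctx_llama)))
        ? "</s>"
        : llama_token_to_piece(ctx_llama, id); // assumed detokenization helper
    return ret.c_str();
}

// ... inside process_prompt():
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
for (int i = 0; i < max_tgt_len; i++) {
    const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
    if (strcmp(tmp, "</s>") == 0) break;   // stop on end-of-sequence
    printf("%s", tmp);
    fflush(stdout);
}
llama_sampling_free(ctx_sampling);         // releases sampler/grammar state
```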
@@ -10,7 +10,7 @@
 #include "base64.hpp"

 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
-    clip_image_f32 * img_res = make_clip_image_f32();
+    clip_image_f32 * img_res = clip_image_f32_init();
     if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
         fprintf(stderr, "%s: unable to preprocess image\n", __func__);
         clip_image_f32_free(img_res);

@@ -86,7 +86,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
 }

 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
-    clip_image_u8 * img = make_clip_image_u8();
+    clip_image_u8 * img = clip_image_u8_init();
     if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
         clip_image_u8_free(img);
         fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
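The renamed constructors (`clip_image_u8_init` / `clip_image_f32_init`) pair with the existing `*_free` functions. A minimal sketch of the intended allocate → load → preprocess → encode → free lifecycle, pieced together from the calls visible in this diff; the sizing of `image_embd` is an assumption and is left out.

```cpp
// Sketch assembled from the clip API calls shown above; error reporting trimmed.
clip_image_u8  * img_u8  = clip_image_u8_init();
clip_image_f32 * img_f32 = clip_image_f32_init();

if (clip_image_load_from_bytes(image_bytes, image_bytes_length, img_u8) &&
    clip_image_preprocess(ctx_clip, img_u8, img_f32, /*pad2square =*/ true)) {
    // image_embd must be allocated for the projector output (not shown here)
    clip_image_encode(ctx_clip, n_threads, img_f32, image_embd);
}

clip_image_f32_free(img_f32);
clip_image_u8_free(img_u8);
```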
@@ -7,28 +7,13 @@ find_package(Llama 0.0.1 REQUIRED)
 # Bake common functionality in with target. Because applications
 # using the relocatable Llama package should be outside of the
 # source tree, main-cmake-pkg pretends the dependencies are built-in.

 set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
-add_library(common OBJECT
-    ${_common_path}/common.h
-    ${_common_path}/common.cpp
-    ${_common_path}/console.h
-    ${_common_path}/console.cpp
-    ${_common_path}/grammar-parser.h
-    ${_common_path}/grammar-parser.cpp
-    ${_common_path}/sampling.h
-    ${_common_path}/sampling.cpp
-    )
-
-# WARNING: because build-info.h is auto-generated, it will only
-# be available after the user has built the llama.cpp sources.
-#
-configure_file(${_common_path}/../build-info.h
-    ${CMAKE_CURRENT_BINARY_DIR}/build-info.h
-    COPYONLY)
-
-target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
-    ${CMAKE_CURRENT_BINARY_DIR})
+add_library(common OBJECT)
+file(GLOB _common_files
+    "${_common_path}/*.h"
+    "${_common_path}/*.cpp"
+)
+target_sources(common PRIVATE ${_common_files})

 # If the common project was part of "main-cmake-pkg" the transient
 # defines would automatically be attached. Because the common func-
@@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
@@ -166,7 +166,7 @@ node index.js

     `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)

-    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image id 12 in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

     *Result JSON:*
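For illustration, a `/completion` request body that exercises this option might look as follows; the prompt text and the truncated base64 payload are placeholders, not values from the change itself.

```json
{
  "prompt": "USER:[img-12]Describe the image in detail.\nASSISTANT:",
  "n_predict": 128,
  "image_data": [
    { "data": "<BASE64_STRING>", "id": 12 }
  ]
}
```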
@@ -224,6 +224,8 @@ node index.js

     `content`: Set the text to process.

+    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+
 -   **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.

     *Options:*
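A matching `/embedding` request, again with placeholder content and base64 data:

```json
{
  "content": "Image: [img-21].\nCaption: This is a picture of a house",
  "image_data": [
    { "data": "<BASE64_STRING>", "id": 21 }
  ]
}
```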
@@ -25,6 +25,7 @@
 #include <thread>
 #include <mutex>
 #include <chrono>
+#include <condition_variable>

 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1

@@ -81,7 +82,7 @@ static inline bool is_base64(uint8_t c)
     return (isalnum(c) || (c == '+') || (c == '/'));
 }

-static std::vector<uint8_t> base64_decode(std::string const &encoded_string)
+static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
 {
     int i = 0;
     int j = 0;

@@ -208,10 +209,10 @@ struct slot_image
     int32_t id;

     bool request_encode_image = false;
-    float* image_embedding = nullptr;
+    float * image_embedding = nullptr;
     int32_t image_tokens = 0;

-    clip_image_u8 img_data;
+    clip_image_u8 * img_data;

     std::string prefix_prompt; // before of this image
 };

@@ -433,15 +434,16 @@ struct llama_client_slot

         generated_token_probs.clear();

-        for (slot_image &img : images)
+        for (slot_image & img : images)
         {
             free(img.image_embedding);
-            delete[] img.img_data.data;
+            if (img.img_data) {
+                clip_image_u8_free(img.img_data);
+            }
             img.prefix_prompt = "";
         }

         images.clear();
-        // llama_set_rng_seed(ctx, params.seed); in batched the seed matter???????
     }

     bool has_budget(gpt_params &global_params) {

@@ -542,7 +544,9 @@ struct llama_server_context
     std::vector<task_result> queue_results;
     std::vector<task_multi>  queue_multitasks;
     std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
+    std::condition_variable condition_tasks;
     std::mutex mutex_results;
+    std::condition_variable condition_results;

     ~llama_server_context()
     {
@@ -849,24 +853,17 @@ struct llama_server_context
        {
            for (const auto &img : *images_data)
            {
-               std::string data_b64 = img["data"].get<std::string>();
+               const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
+
                slot_image img_sl;
                img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
-               int width, height, channels;
-               std::vector<uint8_t> image_buffer = base64_decode(data_b64);
-               data_b64.clear();
-               auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3);
-               if (!data) {
+               img_sl.img_data = clip_image_u8_init();
+               if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
+               {
                    LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
                    return false;
                }
-               LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height);
-               img_sl.img_data.nx = width;
-               img_sl.img_data.ny = height;
-               img_sl.img_data.size = width * height * 3;
-               img_sl.img_data.data = new uint8_t[width * height * 3]();
-               memcpy(img_sl.img_data.data, data, width * height * 3);
-               stbi_image_free(data);
+               LOG_TEE("slot %i - loaded image\n", slot->id);
                img_sl.request_encode_image = true;
                slot->images.push_back(img_sl);
            }

@@ -921,6 +918,7 @@ struct llama_server_context
            llama_sampling_free(slot->ctx_sampling);
        }
        slot->ctx_sampling = llama_sampling_init(slot->sparams);
+       llama_set_rng_seed(ctx, slot->params.seed);
        slot->command = LOAD_PROMPT;

        all_slots_are_idle = false;

@@ -1140,8 +1138,8 @@ struct llama_server_context
            {
                continue;
            }
-           clip_image_f32 img_res;
-           if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true))
+           clip_image_f32 * img_res = clip_image_f32_init();
+           if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
            {
                LOG_TEE("Error processing the given image");
                clip_free(clp_ctx);

@@ -1156,11 +1154,12 @@ struct llama_server_context
                return false;
            }
            LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
-           if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding))
+           if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
            {
                LOG_TEE("Unable to encode image\n");
                return false;
            }
+           clip_image_f32_free(img_res);
            img.request_encode_image = false;
        }
@@ -1169,7 +1168,7 @@ struct llama_server_context

    void send_error(task_server& task, std::string error)
    {
-       std::lock_guard<std::mutex> lock(mutex_results);
+       std::unique_lock<std::mutex> lock(mutex_results);
        task_result res;
        res.id = task.id;
        res.multitask_id = task.multitask_id;

@@ -1177,6 +1176,7 @@ struct llama_server_context
        res.error = true;
        res.result_json = { { "content", error } };
        queue_results.push_back(res);
+       condition_results.notify_all();
    }

    void add_multi_task(int id, std::vector<int>& sub_ids)

@@ -1186,6 +1186,7 @@ struct llama_server_context
        multi.id = id;
        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
        queue_multitasks.push_back(multi);
+       condition_tasks.notify_one();
    }

    void update_multi_task(int multitask_id, int subtask_id, task_result& result)

@@ -1197,6 +1198,7 @@ struct llama_server_context
            {
                multitask.subtasks_remaining.erase(subtask_id);
                multitask.results.push_back(result);
+               condition_tasks.notify_one();
            }
        }
    }

@@ -1215,7 +1217,7 @@ struct llama_server_context
            {"n_ctx",       slot.n_ctx},
            {"model",       params.model_alias},
            {"seed",        slot.params.seed},
-           {"temp",        slot.sparams.temp},
+           {"temperature", slot.sparams.temp},
            {"top_k",       slot.sparams.top_k},
            {"top_p",       slot.sparams.top_p},
            {"min_p",       slot.sparams.min_p},

@@ -1244,7 +1246,7 @@ struct llama_server_context

    void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
    {
-       std::lock_guard<std::mutex> lock(mutex_results);
+       std::unique_lock<std::mutex> lock(mutex_results);
        task_result res;
        res.id = slot.task_id;
        res.multitask_id = slot.multitask_id;

@@ -1280,11 +1282,12 @@ struct llama_server_context
        }

        queue_results.push_back(res);
+       condition_results.notify_all();
    }

    void send_final_response(llama_client_slot &slot)
    {
-       std::lock_guard<std::mutex> lock(mutex_results);
+       std::unique_lock<std::mutex> lock(mutex_results);
        task_result res;
        res.id = slot.task_id;
        res.multitask_id = slot.multitask_id;

@@ -1340,11 +1343,12 @@ struct llama_server_context
        }

        queue_results.push_back(res);
+       condition_results.notify_all();
    }

    void send_embedding(llama_client_slot &slot)
    {
-       std::lock_guard<std::mutex> lock(mutex_results);
+       std::unique_lock<std::mutex> lock(mutex_results);
        task_result res;
        res.id = slot.task_id;
        res.multitask_id = slot.multitask_id;

@@ -1372,6 +1376,7 @@ struct llama_server_context
            };
        }
        queue_results.push_back(res);
+       condition_results.notify_all();
    }

    int request_completion(json data, bool infill, bool embedding, int multitask_id)

@@ -1395,6 +1400,7 @@ struct llama_server_context

        // otherwise, it's a single-prompt task, we actually queue it
        queue_tasks.push_back(task);
+       condition_tasks.notify_one();
        return task.id;
    }
@@ -1402,13 +1408,10 @@ struct llama_server_context
    {
        while (true)
        {
-           std::this_thread::sleep_for(std::chrono::microseconds(5));
-           std::lock_guard<std::mutex> lock(mutex_results);
-
-           if (queue_results.empty())
-           {
-               continue;
-           }
+           std::unique_lock<std::mutex> lock(mutex_results);
+           condition_results.wait(lock, [&]{
+               return !queue_results.empty();
+           });

            for (int i = 0; i < (int) queue_results.size(); i++)
            {
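The pattern above — take a `std::unique_lock`, then wait on a predicate — is the standard C++ producer/consumer idiom that replaces the previous sleep-and-poll loop. A self-contained sketch of the same idea, independent of the server types:

```cpp
#include <condition_variable>
#include <mutex>
#include <queue>

static std::mutex              mtx;
static std::condition_variable cv;
static std::queue<int>         results;

void produce(int value) {
    {
        std::lock_guard<std::mutex> lock(mtx);
        results.push(value);
    }
    cv.notify_all();                                 // wake any waiting consumer
}

int consume() {
    std::unique_lock<std::mutex> lock(mtx);
    cv.wait(lock, [] { return !results.empty(); });  // sleeps instead of busy-waiting
    int value = results.front();
    results.pop();
    return value;
}
```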
@@ -1504,12 +1507,13 @@ struct llama_server_context

    void request_cancel(int task_id)
    {
-       std::lock_guard<std::mutex> lock(mutex_tasks);
+       std::unique_lock<std::mutex> lock(mutex_tasks);
        task_server task;
        task.id = id_gen++;
        task.type = CANCEL_TASK;
        task.target_id = task_id;
        queue_tasks.push_back(task);
+       condition_tasks.notify_one();
    }

    int split_multiprompt_task(task_server& multiprompt_task)

@@ -1535,7 +1539,7 @@ struct llama_server_context

    void process_tasks()
    {
-       std::lock_guard<std::mutex> lock(mutex_tasks);
+       std::unique_lock<std::mutex> lock(mutex_tasks);
        while (!queue_tasks.empty())
        {
            task_server task = queue_tasks.front();

@@ -1607,6 +1611,7 @@ struct llama_server_context

                std::lock_guard<std::mutex> lock(mutex_results);
                queue_results.push_back(aggregate_result);
+               condition_results.notify_all();

                queue_iterator = queue_multitasks.erase(queue_iterator);
            }

@@ -1637,8 +1642,10 @@ struct llama_server_context
                LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
                kv_cache_clear();
            }
-           // avoid 100% usage of cpu all time
-           std::this_thread::sleep_for(std::chrono::milliseconds(5));
+           std::unique_lock<std::mutex> lock(mutex_tasks);
+           condition_tasks.wait(lock, [&]{
+               return !queue_tasks.empty();
+           });
        }

        for (llama_client_slot &slot : slots)
@@ -2437,26 +2444,33 @@ json oaicompat_completion_params_parse(
    llama_params["__oaicompat"] = true;

    // Map OpenAI parameters to llama.cpp parameters
+   //
+   // For parameters that are defined by the OpenAI documentation (e.g.
+   // temperature), we explicitly specify OpenAI's intended default; we
+   // need to do that because sometimes OpenAI disagrees with llama.cpp
+   //
+   // https://platform.openai.com/docs/api-reference/chat/create
+   llama_sampling_params default_sparams;
    llama_params["model"]             = json_value(body, "model", std::string("uknown"));
    llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
-   llama_params["temperature"]       = json_value(body, "temperature", 0.8);
-   llama_params["top_k"]             = json_value(body, "top_k", 40);
-   llama_params["top_p"]             = json_value(body, "top_p", 0.95);
+   llama_params["temperature"]       = json_value(body, "temperature", 0.0);
+   llama_params["top_k"]             = json_value(body, "top_k", default_sparams.top_k);
+   llama_params["top_p"]             = json_value(body, "top_p", 1.0);
    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
    llama_params["logit_bias"]        = json_value(body, "logit_bias",json::object());
    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
-   llama_params["seed"]              = json_value(body, "seed", 0);
+   llama_params["seed"]              = json_value(body, "seed", LLAMA_DEFAULT_SEED);
    llama_params["stream"]            = json_value(body, "stream", false);
-   llama_params["mirostat"]          = json_value(body, "mirostat", false);
-   llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", 0.0);
-   llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", 0.0);
-   llama_params["penalize_nl"]       = json_value(body, "penalize_nl", false);
-   llama_params["typical_p"]         = json_value(body, "typical_p", 0.0);
-   llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", 0);
+   llama_params["mirostat"]          = json_value(body, "mirostat", default_sparams.mirostat);
+   llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
+   llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
+   llama_params["penalize_nl"]       = json_value(body, "penalize_nl", default_sparams.penalize_nl);
+   llama_params["typical_p"]         = json_value(body, "typical_p", default_sparams.typical_p);
+   llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
-   llama_params["tfs_z"]             = json_value(body, "tfs_z", 0.0);
+   llama_params["tfs_z"]             = json_value(body, "tfs_z", default_sparams.tfs_z);

    if (body.count("grammar") != 0) {
        llama_params["grammar"] = json_value(body, "grammar", json::object());
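As a concrete illustration of this mapping (not part of the change itself; the model alias and messages are placeholders), an OpenAI-style request such as the one below is translated into the llama.cpp parameters listed above, with omitted fields falling back to the stated defaults — e.g. temperature 0.0, top_p 1.0 and seed LLAMA_DEFAULT_SEED:

```json
{
  "model": "<model-alias>",
  "messages": [
    { "role": "user", "content": "Say hello." }
  ],
  "max_tokens": 64,
  "temperature": 0.7,
  "stream": false
}
```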
@@ -3070,7 +3084,17 @@ int main(int argc, char **argv)
        {
            prompt = "";
        }
-       const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
+
+       json image_data;
+       if (body.count("image_data") != 0) {
+           image_data = body["image_data"];
+       }
+       else
+       {
+           image_data = "";
+       }
+
+       const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
        task_result result = llama.next_result(task_id);
        return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
    });
flake.lock (generated)
@@ -1,30 +1,30 @@
 {
   "nodes": {
-    "flake-utils": {
+    "flake-parts": {
       "inputs": {
-        "systems": "systems"
+        "nixpkgs-lib": "nixpkgs-lib"
       },
       "locked": {
-        "lastModified": 1694529238,
-        "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
+        "lastModified": 1701473968,
+        "narHash": "sha256-YcVE5emp1qQ8ieHUnxt1wCZCC3ZfAS+SRRWZ2TMda7E=",
+        "owner": "hercules-ci",
+        "repo": "flake-parts",
+        "rev": "34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5",
         "type": "github"
       },
       "original": {
-        "owner": "numtide",
-        "repo": "flake-utils",
+        "owner": "hercules-ci",
+        "repo": "flake-parts",
         "type": "github"
       }
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1698318101,
-        "narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=",
+        "lastModified": 1703637592,
+        "narHash": "sha256-8MXjxU0RfFfzl57Zy3OfXCITS0qWDNLzlBAdwxGZwfY=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "63678e9f3d3afecfeafa0acead6239cdb447574c",
+        "rev": "cfc3698c31b1fb9cdcf10f36c9643460264d0ca8",
         "type": "github"
       },
       "original": {

@@ -34,26 +34,29 @@
         "type": "github"
       }
     },
-    "root": {
-      "inputs": {
-        "flake-utils": "flake-utils",
-        "nixpkgs": "nixpkgs"
-      }
-    },
-    "systems": {
+    "nixpkgs-lib": {
       "locked": {
-        "lastModified": 1681028828,
-        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
-        "owner": "nix-systems",
-        "repo": "default",
-        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "dir": "lib",
+        "lastModified": 1701253981,
+        "narHash": "sha256-ztaDIyZ7HrTAfEEUt9AtTDNoCYxUdSd6NrRHaYOIxtk=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "e92039b55bcd58469325ded85d4f58dd5a4eaf58",
         "type": "github"
       },
       "original": {
-        "owner": "nix-systems",
-        "repo": "default",
+        "dir": "lib",
+        "owner": "NixOS",
+        "ref": "nixos-unstable",
+        "repo": "nixpkgs",
         "type": "github"
       }
+    },
+    "root": {
+      "inputs": {
+        "flake-parts": "flake-parts",
+        "nixpkgs": "nixpkgs"
+      }
     }
   },
   "root": "root",
flake.nix
@@ -1,139 +1,144 @@
 {
+  description = "Port of Facebook's LLaMA model in C/C++";
+
   inputs = {
     nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
-    flake-utils.url = "github:numtide/flake-utils";
+    flake-parts.url = "github:hercules-ci/flake-parts";
   };
-  outputs = { self, nixpkgs, flake-utils }:
-    flake-utils.lib.eachDefaultSystem (system:
-      let
-        name = "llama.cpp";
-        src = ./.;
-        meta.mainProgram = "llama";
-        inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
-        buildInputs = with pkgs; [ openmpi ];
-        osSpecific = with pkgs; buildInputs ++ (
-          if isAarch64 && isDarwin then
-            with pkgs.darwin.apple_sdk_11_0.frameworks; [
-              Accelerate
-              MetalKit
-            ]
-          else if isAarch32 && isDarwin then
-            with pkgs.darwin.apple_sdk.frameworks; [
-              Accelerate
-              CoreGraphics
-              CoreVideo
-            ]
-          else if isDarwin then
-            with pkgs.darwin.apple_sdk.frameworks; [
-              Accelerate
-              CoreGraphics
-              CoreVideo
-            ]
-          else
-            with pkgs; [ openblas ]
-        );
-        pkgs = import nixpkgs { inherit system; };
-        nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ];
-        cudatoolkit_joined = with pkgs; symlinkJoin {
-          # HACK(Green-Sky): nix currently has issues with cmake findcudatoolkit
-          # see https://github.com/NixOS/nixpkgs/issues/224291
-          # copied from jaxlib
-          name = "${cudaPackages.cudatoolkit.name}-merged";
-          paths = [
-            cudaPackages.cudatoolkit.lib
-            cudaPackages.cudatoolkit.out
-          ] ++ lib.optionals (lib.versionOlder cudaPackages.cudatoolkit.version "11") [
-            # for some reason some of the required libs are in the targets/x86_64-linux
-            # directory; not sure why but this works around it
-            "${cudaPackages.cudatoolkit}/targets/${system}"
-          ];
-        };
-        llama-python =
-          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
-        # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
-        llama-python-extra =
-          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]);
-        postPatch = ''
-          substituteInPlace ./ggml-metal.m \
-            --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-          substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
-        '';
-        postInstall = ''
-          mv $out/bin/main $out/bin/llama
-          mv $out/bin/server $out/bin/llama-server
-          mkdir -p $out/include
-          cp ${src}/llama.h $out/include/
-        '';
-        cmakeFlags = [ "-DLLAMA_NATIVE=OFF" "-DLLAMA_BUILD_SERVER=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
-      in
-      {
-        packages.default = pkgs.stdenv.mkDerivation {
-          inherit name src meta postPatch nativeBuildInputs postInstall;
-          buildInputs = osSpecific;
-          cmakeFlags = cmakeFlags
-            ++ (if isAarch64 && isDarwin then [
-              "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-              "-DLLAMA_METAL=ON"
-            ] else [
-              "-DLLAMA_BLAS=ON"
-              "-DLLAMA_BLAS_VENDOR=OpenBLAS"
-            ]);
-        };
-        packages.opencl = pkgs.stdenv.mkDerivation {
-          inherit name src meta postPatch nativeBuildInputs postInstall;
-          buildInputs = with pkgs; buildInputs ++ [ clblast ];
-          cmakeFlags = cmakeFlags ++ [
-            "-DLLAMA_CLBLAST=ON"
-          ];
-        };
-        packages.cuda = pkgs.stdenv.mkDerivation {
-          inherit name src meta postPatch nativeBuildInputs postInstall;
-          buildInputs = with pkgs; buildInputs ++ [ cudatoolkit_joined ];
-          cmakeFlags = cmakeFlags ++ [
-            "-DLLAMA_CUBLAS=ON"
-          ];
-        };
-        packages.rocm = pkgs.stdenv.mkDerivation {
-          inherit name src meta postPatch nativeBuildInputs postInstall;
-          buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ];
-          cmakeFlags = cmakeFlags ++ [
-            "-DLLAMA_HIPBLAS=1"
-            "-DCMAKE_C_COMPILER=hipcc"
-            "-DCMAKE_CXX_COMPILER=hipcc"
-            # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
-            # in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
-            # and select the line that matches the current nixpkgs version of rocBLAS.
-            "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
-          ];
-        };
-        apps.llama-server = {
-          type = "app";
-          program = "${self.packages.${system}.default}/bin/llama-server";
-        };
-        apps.llama-embedding = {
-          type = "app";
-          program = "${self.packages.${system}.default}/bin/embedding";
-        };
-        apps.llama = {
-          type = "app";
-          program = "${self.packages.${system}.default}/bin/llama";
-        };
-        apps.quantize = {
-          type = "app";
-          program = "${self.packages.${system}.default}/bin/quantize";
-        };
-        apps.train-text-from-scratch = {
-          type = "app";
-          program = "${self.packages.${system}.default}/bin/train-text-from-scratch";
-        };
-        apps.default = self.apps.${system}.llama;
-        devShells.default = pkgs.mkShell {
-          buildInputs = [ llama-python ];
-          packages = nativeBuildInputs ++ osSpecific;
-        };
-        devShells.extra = pkgs.mkShell {
-          buildInputs = [ llama-python-extra ];
-          packages = nativeBuildInputs ++ osSpecific;
-        };
-      });
+
+  # Optional binary cache
+  nixConfig = {
+    extra-substituters = [
+      # Populated by the CI in ggerganov/llama.cpp
+      "https://llama-cpp.cachix.org"
+
+      # A development cache for nixpkgs imported with `config.cudaSupport = true`.
+      # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
+      # This lets one skip building e.g. the CUDA-enabled openmpi.
+      # TODO: Replace once nix-community obtains an official one.
+      "https://cuda-maintainers.cachix.org"
+    ];
+
+    # Verify these are the same keys as published on
+    # - https://app.cachix.org/cache/llama-cpp
+    # - https://app.cachix.org/cache/cuda-maintainers
+    extra-trusted-public-keys = [
+      "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
+      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
+    ];
+  };
+
+  # For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
+  #
+  # ```bash
+  # ❯ nix repl
+  # nix-repl> :lf github:ggerganov/llama.cpp
+  # Added 13 variables.
+  # nix-repl> outputs.apps.x86_64-linux.quantize
+  # { program = "/nix/store/00000000000000000000000000000000-llama.cpp/bin/quantize"; type = "app"; }
+  # ```
+  outputs =
+    { self, flake-parts, ... }@inputs:
+    let
+      # We could include the git revisions in the package names but those would
+      # needlessly trigger rebuilds:
+      # llamaVersion = self.dirtyShortRev or self.shortRev;
+
+      # Nix already uses cryptographic hashes for versioning, so we'll just fix
+      # the fake semver for now:
+      llamaVersion = "0.0.0";
+    in
+    flake-parts.lib.mkFlake { inherit inputs; }
+
+      {
+
+        imports = [
+          .devops/nix/nixpkgs-instances.nix
+          .devops/nix/apps.nix
+          .devops/nix/devshells.nix
+          .devops/nix/jetson-support.nix
+        ];
+
+        # An overlay can be used to have a more granular control over llama-cpp's
+        # dependencies and configuration, than that offered by the `.override`
+        # mechanism. Cf. https://nixos.org/manual/nixpkgs/stable/#chap-overlays.
+        #
+        # E.g. in a flake:
+        # ```
+        # { nixpkgs, llama-cpp, ... }:
+        # let pkgs = import nixpkgs {
+        #     overlays = [ (llama-cpp.overlays.default) ];
+        #     system = "aarch64-linux";
+        #     config.allowUnfree = true;
+        #     config.cudaSupport = true;
+        #     config.cudaCapabilities = [ "7.2" ];
+        #     config.cudaEnableForwardCompat = false;
+        # }; in {
+        #     packages.aarch64-linux.llamaJetsonXavier = pkgs.llamaPackages.llama-cpp;
+        # }
+        # ```
+        #
+        # Cf. https://nixos.org/manual/nix/unstable/command-ref/new-cli/nix3-flake.html?highlight=flake#flake-format
+        flake.overlays.default =
+          (final: prev: {
+            llamaPackages = final.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
+            inherit (final.llamaPackages) llama-cpp;
+          });
+
+        systems = [
+          "aarch64-darwin"
+          "aarch64-linux"
+          "x86_64-darwin" # x86_64-darwin isn't tested (and likely isn't relevant)
+          "x86_64-linux"
+        ];
+
+        perSystem =
+          {
+            config,
+            lib,
+            system,
+            pkgs,
+            pkgsCuda,
+            pkgsRocm,
+            ...
+          }:
+          {
+            # Unlike `.#packages`, legacyPackages may contain values of
+            # arbitrary types (including nested attrsets) and may even throw
+            # exceptions. This attribute isn't recursed into by `nix flake
+            # show` either.
+            #
+            # You can add arbitrary scripts to `.devops/nix/scope.nix` and
+            # access them as `nix build .#llamaPackages.${scriptName}` using
+            # the same path you would with an overlay.
+            legacyPackages = {
+              llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
+              llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
+              llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
+            };
+
+            # We don't use the overlay here so as to avoid making too many instances of nixpkgs,
+            # cf. https://zimbatm.com/notes/1000-instances-of-nixpkgs
+            packages =
+              {
+                default = config.legacyPackages.llamaPackages.llama-cpp;
+              }
+              // lib.optionalAttrs pkgs.stdenv.isLinux {
+                opencl = config.packages.default.override { useOpenCL = true; };
+                cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;

+                mpi-cpu = config.packages.default.override { useMpi = true; };
+                mpi-cuda = config.packages.default.override { useMpi = true; };
+              }
+              // lib.optionalAttrs (system == "x86_64-linux") {
+                rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;
+              };
+
+            # Packages exposed in `.#checks` will be built by the CI and by
+            # `nix flake check`. Currently we expose all packages, but we could
+            # make more granular choices
+            checks = config.packages;
+          };
+      };
 }
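As a hedged illustration of how the reworked flake is meant to be consumed (not part of the change itself), the per-system outputs above can be built directly; only the package names that actually appear in the `packages` attrset — `default`, `opencl`, `cuda`, `rocm`, `mpi-cpu`, `mpi-cuda` — are assumed here.

```bash
# Build the default package
nix build .#default

# Build the CUDA or ROCm variants (Linux / x86_64-linux respectively)
nix build .#cuda
nix build .#rocm

# Inspect everything the flake exposes
nix flake show github:ggerganov/llama.cpp
```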
@@ -614,10 +614,14 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
 }

 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    return true;
+    switch (op->op) {
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        default:
+            return true;
+    }

     GGML_UNUSED(backend);
-    GGML_UNUSED(op);
 }

 static struct ggml_backend_i cpu_backend_i = {
ggml-cuda.cu
@@ -119,7 +119,9 @@
 #define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA      700
 #define CC_OFFSET_AMD 1000000
+#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
 #define CC_RDNA2      (CC_OFFSET_AMD + 1030)
+#define CC_RDNA3      (CC_OFFSET_AMD + 1100)

 #define GGML_CUDA_MAX_NODES 8192

@@ -133,7 +135,6 @@
 // TODO: improve this to be correct for more hardware
 //       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-//       probably other such cases, and not sure what happens on AMD hardware
 #if !defined(GGML_CUDA_FORCE_MMQ)
 #define CUDA_USE_TENSOR_CORES
 #endif

@@ -6662,7 +6663,7 @@ static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
 // pool with virtual memory
 static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
 static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
-static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
+static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

 static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);

@@ -7485,6 +7486,8 @@ static void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;

+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_CUDA_F16
     cuda_pool_alloc<half> src1_dfloat_a;

@@ -7577,6 +7580,7 @@ static void ggml_cuda_op_mul_mat_cublas(
     const int compute_capability = g_device_caps[id].cc;

     if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+        //printf("this branch\n");
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         cuda_pool_alloc<half> src0_as_f16;
         if (src0->type != GGML_TYPE_F16) {

@@ -7614,9 +7618,9 @@ static void ggml_cuda_op_mul_mat_cublas(

         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
         to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-    }
-    else {
+    } else {
         cuda_pool_alloc<float> src0_ddq_as_f32;
+        cuda_pool_alloc<float> src1_ddq_as_f32;

         if (src0->type != GGML_TYPE_F32) {
             const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);

@@ -7624,7 +7628,15 @@ static void ggml_cuda_op_mul_mat_cublas(
             src0_ddq_as_f32.alloc(row_diff*ne00);
             to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
         }
+        if (src1->type != GGML_TYPE_F32) {
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
+            GGML_ASSERT(to_fp32_cuda != nullptr);
+            src1_ddq_as_f32.alloc(src1_ncols*ne10);
+            to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
+        }
+
         const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
+        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();

         const float alpha = 1.0f;
         const float beta = 0.0f;

@@ -7633,9 +7645,9 @@ static void ggml_cuda_op_mul_mat_cublas(
         CUBLAS_CHECK(
             cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                     row_diff, src1_ncols, ne10,
                     &alpha, src0_ddf_i,  ne00,
-                            src1_ddf_i,  ne10,
+                            src1_ddf1_i, ne10,
                     &beta,  dst_dd_i,    ldc));
     }

     (void) dst;

@@ -8035,6 +8047,7 @@ static void ggml_cuda_op_mul_mat(

     GGML_ASSERT(dst->backend  != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));

     GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);

@@ -8481,9 +8494,9 @@ static __global__ void k_compute_batched_ptrs(
     int64_t i03 = i13 / r3;
     int64_t i02 = i12 / r2;

     ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
     ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2 + i13*nbd3;
 }

 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ -8492,28 +8505,10 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const

     GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);

-    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
-
-    const int64_t ne1 = ggml_nelements(src1);
-    const int64_t ne  = ggml_nelements(dst);
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int64_t ne_dst = ggml_nelements(dst);

     ggml_cuda_set_device(g_main_device);
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

@@ -8522,7 +8517,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const

     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];
-    half * src0_as_f16 = (half *) src0_ddq;
+    half * src0_f16 = (half *) src0_ddq;

     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

@@ -8531,11 +8526,15 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

     // convert src1 to fp16
-    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-    GGML_ASSERT(to_fp16_cuda != nullptr);
-
-    cuda_pool_alloc<half> src1_as_f16(ne1);
-    to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);
+    cuda_pool_alloc<half> src1_f16_alloc;
+    if (src1->type != GGML_TYPE_F16) {
+        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+        const int64_t ne_src1 = ggml_nelements(src1);
+        src1_f16_alloc.alloc(ne_src1);
+        GGML_ASSERT(to_fp16_cuda != nullptr);
+        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+    }
+    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();

     cuda_pool_alloc<half> dst_f16;
     char * dst_t;

@@ -8557,7 +8556,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
     const void * beta  = &beta_f16;

     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        dst_t = (char *) dst_f16.alloc(ne);
+        dst_t = (char *) dst_f16.alloc(ne_dst);

         nbd2 /= sizeof(float) / sizeof(half);
         nbd3 /= sizeof(float) / sizeof(half);

@@ -8604,9 +8603,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half),   // strideA
-                       (const char *) src1_as_f16.get(), CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
-                beta,  (      char *)       dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float),  // strideC
+                alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00,  // strideA
+                       (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10,  // strideB
+                beta,  (      char *)    dst_t, cu_data_type,    ne01,   nb2/nb0,  // strideC
                 ne12*ne13,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));

@@ -8619,12 +8618,13 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const

         dim3 block_dims(ne13, ne12);
         k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_as_f16, src1_as_f16.get(), dst_t,
+                src0_f16, src1_f16, dst_t,
                 ptrs_src.get(), ptrs_dst.get(),
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
-                nb12, nb13,
+                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
+                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
                 nbd2, nbd3,
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());

@@ -8632,8 +8632,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
         CUBLAS_CHECK(
         cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
-                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
+                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
                 beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
                 ne23,
                 cu_compute_type,

@@ -8643,7 +8643,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const

     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
+        to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
     }
 }

@@ -8662,11 +8662,25 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }

+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
+    bool use_mul_mat_q = ggml_is_quantized(src0->type);
 #ifdef CUDA_USE_TENSOR_CORES
-    const bool use_tensor_cores = true;
+    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
+#endif // CUDA_USE_TENSOR_CORES
+
 #else
-    const bool use_tensor_cores = false;
 #endif
|
const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
|
||||||
|
bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
|
||||||
|
#ifdef CUDA_USE_TENSOR_CORES
|
||||||
|
// when tensor cores are available, use them for large batch size
|
||||||
|
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
|
||||||
|
use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
|
||||||
|
#endif // CUDA_USE_TENSOR_CORES
|
||||||
|
|
||||||
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||||
|
|
||||||
// debug helpers
|
// debug helpers
|
||||||
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
|
||||||
|
@ -8676,19 +8690,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||||
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
||||||
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
||||||
|
|
||||||
if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||||
// KQ single-batch
|
// KQ single-batch
|
||||||
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
||||||
} else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
} else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||||
// KQV single-batch
|
// KQV single-batch
|
||||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||||
} else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
||||||
// KQ + KQV multi-batch
|
// KQ + KQV multi-batch
|
||||||
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
||||||
} else if (src0->type == GGML_TYPE_F32) {
|
} else if (src0->type == GGML_TYPE_F32) {
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
||||||
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
||||||
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
|
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
|
||||||
#ifdef GGML_CUDA_FORCE_DMMV
|
#ifdef GGML_CUDA_FORCE_DMMV
|
||||||
const bool use_mul_mat_vec_q = false;
|
const bool use_mul_mat_vec_q = false;
|
||||||
#else
|
#else
|
||||||
|
@ -8702,14 +8716,6 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
|
|
||||||
|
|
||||||
// when tensor cores are available, use them for large batch size
|
|
||||||
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
|
|
||||||
if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
|
|
||||||
use_mul_mat_q = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (use_mul_mat_q) {
|
if (use_mul_mat_q) {
|
||||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
|
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@@ -6,19 +6,19 @@
 extern "C" {
 #endif

-void ggml_cl_init(void);
+GGML_API void ggml_cl_init(void);

-void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

-void * ggml_cl_host_malloc(size_t size);
-void ggml_cl_host_free(void * ptr);
+GGML_API void * ggml_cl_host_malloc(size_t size);
+GGML_API void ggml_cl_host_free(void * ptr);

-void ggml_cl_free_data(const struct ggml_tensor* tensor);
+GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);

-void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);

 #ifdef __cplusplus
 }
ggml-quants.c (118 changed lines)

@@ -410,13 +410,17 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {

 #if !defined(__ARM_FEATURE_DOTPROD)

-inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
 const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
 const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));

 return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
 }

+#else

+#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)

 #endif

 #endif

@@ -2481,8 +2485,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
 const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

 // dot product into int32x4_t
-const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
-const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
+const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
+const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);

 sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
 sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));

@@ -2769,8 +2773,8 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
 const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

 // dot product into int32x4_t
-const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
-const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
+const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
+const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);

 sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
 sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);

@@ -2936,11 +2940,11 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
 const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

 sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
+ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
 sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
+ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 }

 *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);

@@ -3228,11 +3232,11 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
 const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

 sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
+ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
+ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
 sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
+ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
+ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
 }

 *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;

@@ -3483,12 +3487,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
 const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);

 sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
-vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
+ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));

 sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
-vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
+ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 }

 *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);

@@ -3598,8 +3602,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 // We use this macro instead of a function call because for some reason
 // the code runs 2-3% slower, even if the function is declared inline
 #define MULTIPLY_ACCUM_WITH_SCALE(index)\
-isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
-isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];

 #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
 q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\

@@ -3973,10 +3977,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
 q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));

-isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
-isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
-isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
-isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
+isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
+isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
+isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
+isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];

 sum += d * (isum1 + isum2);
 }

@@ -4256,10 +4260,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
 q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));

-isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
-isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
-isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
-isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];

 scale += 4;

@@ -4273,10 +4277,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
 q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));

-isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
-isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
-isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
-isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];

 scale += 4;

@@ -4757,10 +4761,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
 q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));

-isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
-isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
-isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
-isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];

 sum += d * isum;

@@ -5109,14 +5113,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
 q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));

-const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
 sumi1 += vaddvq_s32(p1) * scales[2*j+0];

 q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
 q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
 q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));

-const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);

 sumi2 += vaddvq_s32(p2) * scales[2*j+1];
 }

@@ -5449,13 +5453,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
 q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));

-const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
 const int32_t sumi1 = vaddvq_s32(p1) * scales[0];

 q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
 q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));

-const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
+const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
 const int32_t sumi2 = vaddvq_s32(p2) * scales[1];

 sumf += d * (sumi1 + sumi2);

@@ -5722,8 +5726,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
 q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));

-sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
-sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
+sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
+sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
 }

 sumf += d * sumi - dmin * sumi_mins;

@@ -6112,10 +6116,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
 q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));

-int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
-int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
-int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
-int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
+int32_t sumi1 = sc[0] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
+int32_t sumi2 = sc[1] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
+int32_t sumi3 = sc[2] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
+int32_t sumi4 = sc[3] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));

 sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
 }

@@ -6399,10 +6403,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
 q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));

-isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];

 scale += 4;

@@ -6426,10 +6430,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
 q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));

-isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
 scale += 4;
 }
 //sum += isum * d_all * y[i].d;

@@ -6816,10 +6820,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
 q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);

-isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
+isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];

 sum += isum * d_all * y[i].d;
ggml.c (10 changed lines)

@@ -9687,7 +9687,7 @@ static void ggml_compute_forward_mul_mat(
 const size_t row_size = ggml_row_size(vec_dot_type, ne10);

 assert(params->wsize >= ne11*ne12*ne13*row_size);
-assert(src1->type == GGML_TYPE_F32);
+GGML_ASSERT(src1->type == GGML_TYPE_F32);

 for (int64_t i13 = 0; i13 < ne13; ++i13) {
 for (int64_t i12 = 0; i12 < ne12; ++i12) {

@@ -19638,6 +19638,14 @@ int ggml_cpu_has_avx(void) {
 #endif
 }

+int ggml_cpu_has_avx_vnni(void) {
+#if defined(__AVXVNNI__)
+return 1;
+#else
+return 0;
+#endif
+}
+
 int ggml_cpu_has_avx2(void) {
 #if defined(__AVX2__)
 return 1;
ggml.h (1 changed line)

@@ -2198,6 +2198,7 @@ extern "C" {
 //

 GGML_API int ggml_cpu_has_avx (void);
+GGML_API int ggml_cpu_has_avx_vnni (void);
 GGML_API int ggml_cpu_has_avx2 (void);
 GGML_API int ggml_cpu_has_avx512 (void);
 GGML_API int ggml_cpu_has_avx512_vbmi(void);
@@ -370,7 +370,16 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
 MODEL_TENSOR.FFN_UP,
 ],
 MODEL_ARCH.GPT2: [
-# TODO
+MODEL_TENSOR.TOKEN_EMBD,
+MODEL_TENSOR.POS_EMBD,
+MODEL_TENSOR.OUTPUT_NORM,
+MODEL_TENSOR.OUTPUT,
+MODEL_TENSOR.ATTN_NORM,
+MODEL_TENSOR.ATTN_QKV,
+MODEL_TENSOR.ATTN_OUT,
+MODEL_TENSOR.FFN_NORM,
+MODEL_TENSOR.FFN_DOWN,
+MODEL_TENSOR.FFN_UP,
 ],
 MODEL_ARCH.PHI2: [
 MODEL_TENSOR.TOKEN_EMBD,
@@ -17,6 +17,7 @@ class TensorNameMap:
 "tok_embeddings", # llama-pth
 "embeddings.word_embeddings", # bert
 "language_model.embedding.word_embeddings", # persimmon
+"wte", # gpt2
 "transformer.embd.wte", # phi2
 ),

@@ -34,6 +35,7 @@ class TensorNameMap:
 MODEL_TENSOR.POS_EMBD: (
 "transformer.wpe", # gpt2
 "embeddings.position_embeddings", # bert
+"wpe", # gpt2
 ),

 # Output

@@ -53,7 +55,7 @@ class TensorNameMap:
 "norm", # llama-pth
 "embeddings.LayerNorm", # bert
 "transformer.norm_f", # mpt
-"ln_f", # refact bloom qwen
+"ln_f", # refact bloom qwen gpt2
 "language_model.encoder.final_layernorm", # persimmon
 "lm_head.ln", # phi2
 ),

@@ -78,6 +80,7 @@ class TensorNameMap:
 "encoder.layer.{bid}.attention.output.LayerNorm", # bert
 "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
 "model.layers.{bid}.ln1", # yi
+"h.{bid}.ln_1", # gpt2
 "transformer.h.{bid}.ln", # phi2
 "model.layers.layers.{bid}.norm", # plamo
 ),

@@ -95,6 +98,7 @@ class TensorNameMap:
 "transformer.h.{bid}.self_attention.query_key_value", # falcon
 "h.{bid}.self_attention.query_key_value", # bloom
 "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
+"h.{bid}.attn.c_attn", # gpt2
 "transformer.h.{bid}.mixer.Wqkv", # phi2
 ),

@@ -137,6 +141,7 @@ class TensorNameMap:
 "encoder.layer.{bid}.attention.output.dense", # bert
 "transformer.h.{bid}.attn.out_proj", # gpt-j
 "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
+"h.{bid}.attn.c_proj", # gpt2
 "transformer.h.{bid}.mixer.out_proj", # phi2
 "model.layers.layers.{bid}.self_attn.o_proj", # plamo
 ),

@@ -159,6 +164,7 @@ class TensorNameMap:
 "encoder.layer.{bid}.output.LayerNorm", # bert
 "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
 "model.layers.{bid}.ln2", # yi
+"h.{bid}.ln_2", # gpt2
 ),

 MODEL_TENSOR.FFN_GATE_INP: (

@@ -179,6 +185,7 @@ class TensorNameMap:
 "transformer.h.{bid}.mlp.fc_in", # gpt-j
 "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
 "transformer.h.{bid}.mlp.w1", # qwen
+"h.{bid}.mlp.c_fc", # gpt2
 "transformer.h.{bid}.mlp.fc1", # phi2
 "model.layers.layers.{bid}.mlp.up_proj", # plamo
 ),

@@ -218,6 +225,7 @@ class TensorNameMap:
 "encoder.layer.{bid}.output.dense", # bert
 "transformer.h.{bid}.mlp.fc_out", # gpt-j
 "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
+"h.{bid}.mlp.c_proj", # gpt2
 "transformer.h.{bid}.mlp.fc2", # phi2
 "model.layers.layers.{bid}.mlp.down_proj", # plamo
 ),
llama.cpp (207 changed lines)

@@ -423,6 +423,15 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
 LLM_ARCH_GPT2,
 {
 { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+{ LLM_TENSOR_POS_EMBD, "position_embd" },
+{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+{ LLM_TENSOR_OUTPUT, "output" },
+{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
 },
 },
 {

@@ -1256,6 +1265,10 @@ enum e_model {
 MODEL_40B,
 MODEL_65B,
 MODEL_70B,
+MODEL_SMALL,
+MODEL_MEDIUM,
+MODEL_LARGE,
+MODEL_XL,
 };

 static const size_t kiB = 1024;

@@ -2552,18 +2565,22 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {

 static const char * llama_model_type_name(e_model type) {
 switch (type) {
 case MODEL_1B: return "1B";
 case MODEL_3B: return "3B";
 case MODEL_7B: return "7B";
 case MODEL_8B: return "8B";
 case MODEL_13B: return "13B";
 case MODEL_15B: return "15B";
 case MODEL_30B: return "30B";
 case MODEL_34B: return "34B";
 case MODEL_40B: return "40B";
 case MODEL_65B: return "65B";
 case MODEL_70B: return "70B";
-default: return "?B";
+case MODEL_SMALL: return "0.1B";
+case MODEL_MEDIUM: return "0.4B";
+case MODEL_LARGE: return "0.8B";
+case MODEL_XL: return "1.5B";
+default: return "?B";
 }
 }

@@ -2782,6 +2799,17 @@ static void llm_load_hparams(
 default: model.type = e_model::MODEL_UNKNOWN;
 }
 } break;
+case LLM_ARCH_GPT2:
+{
+ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+switch (hparams.n_layer) {
+case 12: model.type = e_model::MODEL_SMALL; break;
+case 24: model.type = e_model::MODEL_MEDIUM; break;
+case 36: model.type = e_model::MODEL_LARGE; break;
+case 48: model.type = e_model::MODEL_XL; break;
+default: model.type = e_model::MODEL_UNKNOWN;
+}
+} break;

 default: (void)0;
 }

@@ -3710,6 +3738,60 @@ static bool llm_load_tensors(
 layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
 }
 } break;
+case LLM_ARCH_GPT2:
+{
+model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+// output
+{
+ggml_backend_type backend_norm;
+ggml_backend_type backend_output;
+
+if (n_gpu_layers > int(n_layer)) {
+backend_norm = llama_backend_offload;
+backend_output = llama_backend_offload_split;
+} else {
+backend_norm = GGML_BACKEND_CPU;
+backend_output = GGML_BACKEND_CPU;
+}
+
+model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+}
+
+const uint32_t n_ff = hparams.n_ff;
+
+const int i_gpu_start = n_layer - n_gpu_layers;
+
+model.layers.resize(n_layer);
+
+for (uint32_t i = 0; i < n_layer; ++i) {
+const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
+
+layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+
+layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
+
+layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
+}
+} break;
 default:
 throw std::runtime_error("unknown architecture");
 }

@@ -5754,6 +5836,102 @@ struct llm_build_context {

 return gf;
 }

+struct ggml_cgraph * build_gpt2() {
+struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+struct ggml_tensor * cur;
+struct ggml_tensor * pos;
+struct ggml_tensor * inpL;
+
+inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+cb(inpL, "inp_embd", -1);
+
+// inp_pos - contains the positions
+struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+cb(inp_pos, "inp_pos", -1);
+
+// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+cb(KQ_mask, "KQ_mask", -1);
+
+pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
+cb(pos, "pos_embd", -1);
+
+inpL = ggml_add(ctx0, inpL, pos);
+cb(inpL, "inpL", -1);
+
+for (int il = 0; il < n_layer; ++il) {
+cur = llm_build_norm(ctx0, inpL, hparams,
+model.layers[il].attn_norm,
+model.layers[il].attn_norm_b,
+LLM_NORM, cb, il);
+cb(cur, "attn_norm", il);
+
+// self-attention
+{
+cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+cb(cur, "wqkv", il);
+
+cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+cb(cur, "bqkv", il);
+
+struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+cb(Qcur, "Qcur", il);
+cb(Kcur, "Kcur", il);
+cb(Vcur, "Vcur", il);
+
+Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+
+llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+model.layers[il].wo, model.layers[il].bo,
+Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+cb(cur, "kqv_out", il);
+}
+
+// add the input
+struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+cb(ffn_inp, "ffn_inp", il);
+
+// FF
+{
+cur = llm_build_norm(ctx0, ffn_inp, hparams,
+model.layers[il].ffn_norm,
+model.layers[il].ffn_norm_b,
+LLM_NORM, cb, il);
+cb(cur, "ffn_norm", il);
+
+cur = llm_build_ffn(ctx0, cur,
+model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+NULL, NULL,
+model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+NULL,
+LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+cb(cur, "ffn_out", il);
+}
+
+inpL = ggml_add(ctx0, cur, ffn_inp);
+cb(inpL, "l_out", il);
+}
+
+cur = llm_build_norm(ctx0, inpL, hparams,
+model.output_norm,
+model.output_norm_b,
+LLM_NORM, cb, -1);
+cb(cur, "result_norm", -1);
+
+cur = ggml_mul_mat(ctx0, model.output, cur);
+cb(cur, "result_output", -1);
+
+ggml_build_forward_expand(gf, cur);
+
+return gf;
+}
 };

 //

@@ -6269,6 +6447,10 @@ static struct ggml_cgraph * llama_build_graph(
 {
 result = llm.build_plamo();
 } break;
+case LLM_ARCH_GPT2:
+{
+result = llm.build_gpt2();
+} break;
 default:
 GGML_ASSERT(false);
 }

@@ -10598,6 +10780,7 @@ const char * llama_print_system_info(void) {

 s = "";
 s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
 s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
 s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
 s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
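Note (not part of the diff): a rough sketch of how the new GPT-2 support might be exercised once this lands. The model directory, output file name, and prompt are placeholders, and the exact converter options should be checked against the script's --help.

# convert a Hugging Face GPT-2 checkpoint to GGUF (paths are illustrative)
python3 convert-hf-to-gguf.py path/to/gpt2    # writes a .gguf file for the model
# run the converted model with the main example binary built from this tree
./main -m path/to/gpt2/<converted>.gguf -p "The meaning of life is" -n 64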
models/ggml-vocab-gpt2.gguf (new binary file, not shown)
(deleted file)
@@ -1,3 +0,0 @@
--r requirements.txt
-torch==2.1.1
-transformers==4.35.2
@@ -1,5 +1,12 @@
-numpy==1.24.4
-sentencepiece==0.1.98
-transformers>=4.34.0
-gguf>=0.1.0
-protobuf>=4.21.0
+# These requirements include all dependencies for all top-level python scripts
+# for llama.cpp. Avoid adding packages here directly.
+#
+# Package versions must stay compatible across all top-level python scripts.
+#
+
+-r ./requirements/requirements-convert.txt
+
+-r ./requirements/requirements-convert-hf-to-gguf.txt
+-r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
+-r ./requirements/requirements-convert-lora-to-ggml.txt
+-r ./requirements/requirements-convert-persimmon-to-gguf.txt
requirements/requirements-convert-hf-to-gguf.txt (new file)
@@ -0,0 +1,2 @@
+-r ./requirements-convert.txt
+torch~=2.1.1

requirements/requirements-convert-llama-ggml-to-gguf.txt (new file)
@@ -0,0 +1 @@
+-r ./requirements-convert.txt

requirements/requirements-convert-lora-to-ggml.txt (new file)
@@ -0,0 +1,2 @@
+-r ./requirements-convert.txt
+torch~=2.1.1

requirements/requirements-convert-persimmon-to-gguf.txt (new file)
@@ -0,0 +1,2 @@
+-r ./requirements-convert.txt
+torch~=2.1.1

requirements/requirements-convert.txt (new file)
@@ -0,0 +1,5 @@
+numpy~=1.24.4
+sentencepiece~=0.1.98
+transformers>=4.35.2,<5.0.0
+gguf>=0.1.0
+protobuf>=4.21.0,<5.0.0
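Note (not part of the diff): with the split layout above, dependencies can be installed per script or all at once; a sketch, assuming the commands are run from the repository root.

# install only what the HF conversion script needs
pip install -r requirements/requirements-convert-hf-to-gguf.txt
# or install the union of all script dependencies via the top-level file
pip install -r requirements.txt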
174 scripts/check-requirements.sh Executable file
@@ -0,0 +1,174 @@
#!/bin/bash
set -euo pipefail

#
# check-requirements.sh checks all requirements files for each top-level
# convert*.py script.
#
# WARNING: This is quite IO intensive, because a fresh venv is set up for every
# python script. As of 2023-12-22, this writes ~2.7GB of data. An adequately
# sized tmpfs /tmp or ramdisk is recommended if running this frequently.
#
# usage:    check-requirements.sh [<working_dir>]
#           check-requirements.sh nocleanup [<working_dir>]
#
# where:
#           - <working_dir> is a directory that can be used as the base for
#               setting up the venvs. Defaults to `/tmp`.
#           - 'nocleanup' as the first argument will disable automatic cleanup
#               of the files created by this script.
#
# requires:
#           - bash >= 3.2.57
#           - shellcheck
#
# For each script, it creates a fresh venv, `pip install`s the requirements, and
# finally imports the python script to check for `ImportError`.
#

log() {
    local level=$1 msg=$2
    printf >&2 '%s: %s\n' "$level" "$msg"
}

debug() {
    log DEBUG "$@"
}

info() {
    log INFO "$@"
}

fatal() {
    log FATAL "$@"
    exit 1
}

cleanup() {
    if [[ -n ${workdir+x} && -d $workdir && -w $workdir ]]; then
        info "Removing $workdir"
        local count=0
        rm -rfv -- "$workdir" | while read -r; do
            if (( count++ > 750 )); then
                printf .
                count=0
            fi
        done
        printf '\n'
        info "Removed $workdir"
    fi
}

do_cleanup=1
if [[ ${1-} == nocleanup ]]; then
    do_cleanup=0; shift
fi

if (( do_cleanup )); then
    trap exit INT TERM
    trap cleanup EXIT
fi

this=$(realpath -- "$0"); readonly this
cd "$(dirname "$this")/.." # PWD should stay in llama.cpp project directory

shellcheck "$this"

readonly reqs_dir=requirements

if [[ ${1+x} ]]; then
    tmp_dir=$(realpath -- "$1")
    if [[ ! ( -d $tmp_dir && -w $tmp_dir ) ]]; then
        fatal "$tmp_dir is not a writable directory"
    fi
else
    tmp_dir=/tmp
fi

workdir=$(mktemp -d "$tmp_dir/check-requirements.XXXX"); readonly workdir
info "Working directory: $workdir"

check_requirements() {
    local reqs=$1

    info "$reqs: beginning check"
    pip --disable-pip-version-check install -qr "$reqs"
    info "$reqs: OK"
}

check_convert_script() {
    local py=$1             # e.g. ./convert-hf-to-gguf.py
    local pyname=${py##*/}  # e.g. convert-hf-to-gguf.py
    pyname=${pyname%.py}    # e.g. convert-hf-to-gguf

    info "$py: beginning check"

    local reqs="$reqs_dir/requirements-$pyname.txt"
    if [[ ! -r $reqs ]]; then
        fatal "$py missing requirements. Expected: $reqs"
    fi

    local venv="$workdir/$pyname-venv"
    python3 -m venv "$venv"

    (
        # shellcheck source=/dev/null
        source "$venv/bin/activate"

        check_requirements "$reqs"

        python - "$py" "$pyname" <<'EOF'
import sys
from importlib.machinery import SourceFileLoader
py, pyname = sys.argv[1:]
SourceFileLoader(pyname, py).load_module()
EOF
    )

    if (( do_cleanup )); then
        rm -rf -- "$venv"
    fi

    info "$py: imports OK"
}

readonly ignore_eq_eq='check_requirements: ignore "=="'

for req in "$reqs_dir"/*; do
    # Check that all sub-requirements are added to top-level requirements.txt
    if ! grep -qF "$req" requirements.txt; then
        fatal "$req needs to be added to requirements.txt"
    fi

    # Make sure exact release versions aren't being pinned in the requirements
    # Filters out the ignore string
    if grep -vF "$ignore_eq_eq" "$req" | grep -q '=='; then
        tab=$'\t'
        cat >&2 <<EOF
FATAL: Avoid pinning exact package versions. Use '~=' instead.
You can suppress this error by appending the following to the line:
$tab# $ignore_eq_eq
EOF
        exit 1
    fi
done

all_venv="$workdir/all-venv"
python3 -m venv "$all_venv"

(
    # shellcheck source=/dev/null
    source "$all_venv/bin/activate"
    check_requirements requirements.txt
)

if (( do_cleanup )); then
    rm -rf -- "$all_venv"
fi

check_convert_script convert.py
for py in convert-*.py; do
    check_convert_script "$py"
done

info 'Done! No issues found.'
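Going by the usage block at the top of the script, a typical invocation would be from the repository root, optionally keeping the venvs around and placing them somewhere roomier than /tmp (the ~/scratch path below is only an illustrative placeholder):

    # default: fresh venvs under /tmp, removed on exit
    ./scripts/check-requirements.sh

    # keep the venvs for inspection and build them under ~/scratch
    ./scripts/check-requirements.sh nocleanup ~/scratch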
scripts/sync-ggml-am.sh
@@ -27,21 +27,36 @@ echo "Syncing ggml changes since commit $lc"
 cd $SRC_GGML
 
 git log --oneline $lc..HEAD
+git log --oneline $lc..HEAD | grep -v "(llama/[0-9]*)" | cut -d' ' -f1 > $SRC_LLAMA/ggml-commits
 
-git format-patch $lc --stdout -- \
-    include/ggml/ggml*.h \
-    src/ggml*.h \
-    src/ggml*.c \
-    src/ggml*.cpp \
-    src/ggml*.m \
-    src/ggml*.metal \
-    src/ggml*.cu \
-    tests/test-opt.cpp \
-    tests/test-grad0.cpp \
-    tests/test-quantize-fns.cpp \
-    tests/test-quantize-perf.cpp \
-    tests/test-backend-ops.cpp \
-    > $SRC_LLAMA/ggml-src.patch
+if [ ! -s $SRC_LLAMA/ggml-commits ]; then
+    rm -v $SRC_LLAMA/ggml-commits
+    echo "No new commits"
+    exit 0
+fi
+
+if [ -f $SRC_LLAMA/ggml-src.patch ]; then
+    rm -v $SRC_LLAMA/ggml-src.patch
+fi
+
+while read c; do
+    git format-patch -k $c~1..$c --stdout -- \
+        include/ggml/ggml*.h \
+        src/ggml*.h \
+        src/ggml*.c \
+        src/ggml*.cpp \
+        src/ggml*.m \
+        src/ggml*.metal \
+        src/ggml*.cu \
+        tests/test-opt.cpp \
+        tests/test-grad0.cpp \
+        tests/test-quantize-fns.cpp \
+        tests/test-quantize-perf.cpp \
+        tests/test-backend-ops.cpp \
+        >> $SRC_LLAMA/ggml-src.patch
+done < $SRC_LLAMA/ggml-commits
+
+rm -v $SRC_LLAMA/ggml-commits
 
 # delete files if empty
 if [ ! -s $SRC_LLAMA/ggml-src.patch ]; then
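The net effect of this hunk: instead of one patch spanning $lc..HEAD, the script now drops commits already mirrored from llama.cpp (those whose subject carries a "(llama/...)" reference) and emits one patch per remaining commit, appending each to ggml-src.patch. Roughly, a single iteration of the new loop amounts to the following, where <sha> stands in for one hash read from ggml-commits and the path list is abbreviated:

    git format-patch -k <sha>~1..<sha> --stdout -- src/ggml*.c src/ggml*.h >> ggml-src.patch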
scripts/sync-ggml.last
@@ -1 +1 @@
-76e7f47b69e8334384dc718480c496dafbd47999
+df098ea908764cba4a4889a1cbe7b026b2d31a14
tests/CMakeLists.txt
@@ -2,7 +2,7 @@ function(llama_build_executable source)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
     add_executable(${TEST_TARGET} ${source})
     install(TARGETS ${TEST_TARGET} RUNTIME)
-    target_link_libraries(${TEST_TARGET} PRIVATE llama common)
+    target_link_libraries(${TEST_TARGET} PRIVATE common)
 endfunction()
 
 function(llama_test_executable name source)
@@ -14,7 +14,7 @@ function(llama_build_and_test_executable source)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
     add_executable(${TEST_TARGET} ${source})
     install(TARGETS ${TEST_TARGET} RUNTIME)
-    target_link_libraries(${TEST_TARGET} PRIVATE llama common)
+    target_link_libraries(${TEST_TARGET} PRIVATE common)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()
 
@@ -41,6 +41,7 @@ llama_test_executable (test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cp
 llama_test_executable (test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
 llama_test_executable (test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test_executable (test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+llama_test_executable (test-tokenizer-1-gpt2 test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
 # llama_test_executable (test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
 
 llama_build_and_test_executable(test-grammar-parser.cpp)
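With the gpt2 tokenizer test registered above, it runs like any other ctest case. A sketch, assuming an existing CMake build directory named build and that models/ggml-vocab-gpt2.gguf has been generated:

    cmake --build build
    cd build && ctest -R test-tokenizer-1-gpt2 --output-on-failure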
tests/test-backend-ops.cpp
@@ -350,13 +350,18 @@ struct test_case {
         fflush(stdout);
 
         // check if backends support op
+        bool supported = true;
         for (ggml_backend_t backend : {backend1, backend2}) {
             if (!ggml_backend_supports_op(backend, out)) {
-                printf("not supported\n");
-                ggml_free(ctx);
-                return true;
+                printf("not supported [%s] ", ggml_backend_name(backend));
+                supported = false;
             }
         }
+        if (!supported) {
+            printf("\n");
+            ggml_free(ctx);
+            return true;
+        }
 
         // post-graph sentinel
         add_sentinel(ctx);
@@ -1505,8 +1510,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     }
 
     for (ggml_type type_a : all_types) {
-        for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
-            // FIXME: CPU crashes on f16xf16
+        for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
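After this change an unsupported op is reported once per backend that lacks it (the printed name, e.g. [CPU], depends on which backends were compiled in) before the case is skipped, and mul_mat with an F16 second operand is exercised again. A quick way to see the new output, assuming the test binaries land in build/bin, is simply:

    ./build/bin/test-backend-ops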