Merge branch 'master' of https://github.com/ggerganov/llama.cpp
This commit is contained in:
commit
6c46cb1da4
58 changed files with 1982 additions and 885 deletions
|
@ -15,6 +15,7 @@ RUN apt-get update && \
|
|||
apt-get install -y build-essential python3 python3-pip git
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
|
|
@ -24,6 +24,7 @@ ARG ROCM_DOCKER_ARCH=\
|
|||
gfx1102
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
|
|
@ -6,6 +6,7 @@ RUN apt-get update && \
|
|||
apt-get install -y build-essential python3 python3-pip git
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
|
|
@ -24,6 +24,7 @@ ARG ROCM_DOCKER_ARCH=\
|
|||
gfx1102
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
|
22
.devops/nix/apps.nix
Normal file
22
.devops/nix/apps.nix
Normal file
|
@ -0,0 +1,22 @@
|
|||
{
|
||||
perSystem =
|
||||
{ config, lib, ... }:
|
||||
{
|
||||
apps =
|
||||
let
|
||||
inherit (config.packages) default;
|
||||
binaries = [
|
||||
"llama"
|
||||
"llama-embedding"
|
||||
"llama-server"
|
||||
"quantize"
|
||||
"train-text-from-scratch"
|
||||
];
|
||||
mkApp = name: {
|
||||
type = "app";
|
||||
program = "${default}/bin/${name}";
|
||||
};
|
||||
in
|
||||
lib.genAttrs binaries mkApp;
|
||||
};
|
||||
}
|
13
.devops/nix/devshells.nix
Normal file
13
.devops/nix/devshells.nix
Normal file
|
@ -0,0 +1,13 @@
|
|||
{
|
||||
perSystem =
|
||||
{ config, lib, ... }:
|
||||
{
|
||||
devShells =
|
||||
lib.concatMapAttrs
|
||||
(name: package: {
|
||||
${name} = package.passthru.shell;
|
||||
${name + "-extra"} = package.passthru.shell-extra;
|
||||
})
|
||||
config.packages;
|
||||
};
|
||||
}
|
39
.devops/nix/jetson-support.nix
Normal file
39
.devops/nix/jetson-support.nix
Normal file
|
@ -0,0 +1,39 @@
|
|||
{ inputs, ... }:
|
||||
{
|
||||
perSystem =
|
||||
{
|
||||
config,
|
||||
system,
|
||||
lib,
|
||||
pkgsCuda,
|
||||
...
|
||||
}:
|
||||
{
|
||||
legacyPackages =
|
||||
let
|
||||
caps.llamaPackagesXavier = "7.2";
|
||||
caps.llamaPackagesOrin = "8.7";
|
||||
caps.llamaPackagesTX2 = "6.2";
|
||||
caps.llamaPackagesNano = "5.3";
|
||||
|
||||
pkgsFor =
|
||||
cap:
|
||||
import inputs.nixpkgs {
|
||||
inherit system;
|
||||
config = {
|
||||
cudaSupport = true;
|
||||
cudaCapabilities = [ cap ];
|
||||
cudaEnableForwardCompat = false;
|
||||
inherit (pkgsCuda.config) allowUnfreePredicate;
|
||||
};
|
||||
};
|
||||
in
|
||||
builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
|
||||
|
||||
packages = lib.optionalAttrs (system == "aarch64-linux") {
|
||||
jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
|
||||
jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
|
||||
jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
|
||||
};
|
||||
};
|
||||
}
|
35
.devops/nix/nixpkgs-instances.nix
Normal file
35
.devops/nix/nixpkgs-instances.nix
Normal file
|
@ -0,0 +1,35 @@
|
|||
{ inputs, ... }:
|
||||
{
|
||||
# The _module.args definitions are passed on to modules as arguments. E.g.
|
||||
# the module `{ pkgs ... }: { /* config */ }` implicitly uses
|
||||
# `_module.args.pkgs` (defined in this case by flake-parts).
|
||||
perSystem =
|
||||
{ system, ... }:
|
||||
{
|
||||
_module.args = {
|
||||
pkgsCuda = import inputs.nixpkgs {
|
||||
inherit system;
|
||||
# Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
|
||||
# and ucx are built with CUDA support)
|
||||
config.cudaSupport = true;
|
||||
config.allowUnfreePredicate =
|
||||
p:
|
||||
builtins.all
|
||||
(
|
||||
license:
|
||||
license.free
|
||||
|| builtins.elem license.shortName [
|
||||
"CUDA EULA"
|
||||
"cuDNN EULA"
|
||||
]
|
||||
)
|
||||
(p.meta.licenses or [ p.meta.license ]);
|
||||
};
|
||||
# Ensure dependencies use ROCm consistently
|
||||
pkgsRocm = import inputs.nixpkgs {
|
||||
inherit system;
|
||||
config.rocmSupport = true;
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
265
.devops/nix/package.nix
Normal file
265
.devops/nix/package.nix
Normal file
|
@ -0,0 +1,265 @@
|
|||
{
|
||||
lib,
|
||||
config,
|
||||
stdenv,
|
||||
mkShell,
|
||||
cmake,
|
||||
ninja,
|
||||
pkg-config,
|
||||
git,
|
||||
python3,
|
||||
mpi,
|
||||
openblas, # TODO: Use the generic `blas` so users could switch betwen alternative implementations
|
||||
cudaPackages,
|
||||
darwin,
|
||||
rocmPackages,
|
||||
clblast,
|
||||
useBlas ? builtins.all (x: !x) [
|
||||
useCuda
|
||||
useMetalKit
|
||||
useOpenCL
|
||||
useRocm
|
||||
],
|
||||
useCuda ? config.cudaSupport,
|
||||
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
|
||||
useMpi ? false, # Increases the runtime closure size by ~700M
|
||||
useOpenCL ? false,
|
||||
useRocm ? config.rocmSupport,
|
||||
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
|
||||
}@inputs:
|
||||
|
||||
let
|
||||
inherit (lib)
|
||||
cmakeBool
|
||||
cmakeFeature
|
||||
optionals
|
||||
strings
|
||||
versionOlder
|
||||
;
|
||||
|
||||
# It's necessary to consistently use backendStdenv when building with CUDA support,
|
||||
# otherwise we get libstdc++ errors downstream.
|
||||
stdenv = throw "Use effectiveStdenv instead";
|
||||
effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
|
||||
|
||||
suffices =
|
||||
lib.optionals useBlas [ "BLAS" ]
|
||||
++ lib.optionals useCuda [ "CUDA" ]
|
||||
++ lib.optionals useMetalKit [ "MetalKit" ]
|
||||
++ lib.optionals useMpi [ "MPI" ]
|
||||
++ lib.optionals useOpenCL [ "OpenCL" ]
|
||||
++ lib.optionals useRocm [ "ROCm" ];
|
||||
|
||||
pnameSuffix =
|
||||
strings.optionalString (suffices != [ ])
|
||||
"-${strings.concatMapStringsSep "-" strings.toLower suffices}";
|
||||
descriptionSuffix =
|
||||
strings.optionalString (suffices != [ ])
|
||||
", accelerated with ${strings.concatStringsSep ", " suffices}";
|
||||
|
||||
# TODO: package the Python in this repository in a Nix-like way.
|
||||
# It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
|
||||
# is PEP 517-compatible, and ensure the correct .dist-info is generated.
|
||||
# https://peps.python.org/pep-0517/
|
||||
llama-python = python3.withPackages (
|
||||
ps: [
|
||||
ps.numpy
|
||||
ps.sentencepiece
|
||||
]
|
||||
);
|
||||
|
||||
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
|
||||
llama-python-extra = python3.withPackages (
|
||||
ps: [
|
||||
ps.numpy
|
||||
ps.sentencepiece
|
||||
ps.torchWithoutCuda
|
||||
ps.transformers
|
||||
]
|
||||
);
|
||||
|
||||
# apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
|
||||
# separately
|
||||
darwinBuildInputs =
|
||||
with darwin.apple_sdk.frameworks;
|
||||
[
|
||||
Accelerate
|
||||
CoreVideo
|
||||
CoreGraphics
|
||||
]
|
||||
++ optionals useMetalKit [ MetalKit ];
|
||||
|
||||
cudaBuildInputs = with cudaPackages; [
|
||||
cuda_cccl.dev # <nv/target>
|
||||
|
||||
# A temporary hack for reducing the closure size, remove once cudaPackages
|
||||
# have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
|
||||
cuda_cudart.dev
|
||||
cuda_cudart.lib
|
||||
cuda_cudart.static
|
||||
libcublas.dev
|
||||
libcublas.lib
|
||||
libcublas.static
|
||||
];
|
||||
|
||||
rocmBuildInputs = with rocmPackages; [
|
||||
clr
|
||||
hipblas
|
||||
rocblas
|
||||
];
|
||||
in
|
||||
|
||||
effectiveStdenv.mkDerivation (
|
||||
finalAttrs: {
|
||||
pname = "llama-cpp${pnameSuffix}";
|
||||
version = llamaVersion;
|
||||
|
||||
src = lib.cleanSourceWith {
|
||||
filter =
|
||||
name: type:
|
||||
!(builtins.any (_: _) [
|
||||
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
|
||||
(name == "README.md") # Ignore *.md changes whe computing outPaths
|
||||
(lib.hasPrefix "." name) # Skip hidden files and directories
|
||||
]);
|
||||
src = lib.cleanSource ../../.;
|
||||
};
|
||||
|
||||
postPatch = ''
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||
|
||||
# TODO: Package up each Python script or service appropriately.
|
||||
# If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
|
||||
# we could make those *.py into setuptools' entrypoints
|
||||
substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
|
||||
'';
|
||||
|
||||
nativeBuildInputs =
|
||||
[
|
||||
cmake
|
||||
ninja
|
||||
pkg-config
|
||||
git
|
||||
]
|
||||
++ optionals useCuda [
|
||||
cudaPackages.cuda_nvcc
|
||||
|
||||
# TODO: Replace with autoAddDriverRunpath
|
||||
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
|
||||
cudaPackages.autoAddOpenGLRunpathHook
|
||||
];
|
||||
|
||||
buildInputs =
|
||||
optionals effectiveStdenv.isDarwin darwinBuildInputs
|
||||
++ optionals useCuda cudaBuildInputs
|
||||
++ optionals useMpi [ mpi ]
|
||||
++ optionals useOpenCL [ clblast ]
|
||||
++ optionals useRocm rocmBuildInputs;
|
||||
|
||||
cmakeFlags =
|
||||
[
|
||||
(cmakeBool "LLAMA_NATIVE" true)
|
||||
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
||||
(cmakeBool "BUILD_SHARED_LIBS" true)
|
||||
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
||||
(cmakeBool "LLAMA_BLAS" useBlas)
|
||||
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
|
||||
(cmakeBool "LLAMA_CUBLAS" useCuda)
|
||||
(cmakeBool "LLAMA_HIPBLAS" useRocm)
|
||||
(cmakeBool "LLAMA_METAL" useMetalKit)
|
||||
(cmakeBool "LLAMA_MPI" useMpi)
|
||||
]
|
||||
++ optionals useCuda [
|
||||
(
|
||||
with cudaPackages.flags;
|
||||
cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
|
||||
builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
|
||||
)
|
||||
)
|
||||
]
|
||||
++ optionals useRocm [
|
||||
(cmakeFeature "CMAKE_C_COMPILER" "hipcc")
|
||||
(cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
|
||||
|
||||
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
|
||||
# in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
|
||||
# and select the line that matches the current nixpkgs version of rocBLAS.
|
||||
# Should likely use `rocmPackages.clr.gpuTargets`.
|
||||
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
|
||||
]
|
||||
++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
|
||||
++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
|
||||
|
||||
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
||||
# if they haven't been added yet.
|
||||
postInstall = ''
|
||||
mv $out/bin/main $out/bin/llama
|
||||
mv $out/bin/server $out/bin/llama-server
|
||||
mkdir -p $out/include
|
||||
cp $src/llama.h $out/include/
|
||||
'';
|
||||
|
||||
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
|
||||
passthru = {
|
||||
inherit
|
||||
useBlas
|
||||
useCuda
|
||||
useMetalKit
|
||||
useMpi
|
||||
useOpenCL
|
||||
useRocm
|
||||
;
|
||||
|
||||
shell = mkShell {
|
||||
name = "shell-${finalAttrs.finalPackage.name}";
|
||||
description = "contains numpy and sentencepiece";
|
||||
buildInputs = [ llama-python ];
|
||||
inputsFrom = [ finalAttrs.finalPackage ];
|
||||
};
|
||||
|
||||
shell-extra = mkShell {
|
||||
name = "shell-extra-${finalAttrs.finalPackage.name}";
|
||||
description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
|
||||
buildInputs = [ llama-python-extra ];
|
||||
inputsFrom = [ finalAttrs.finalPackage ];
|
||||
};
|
||||
};
|
||||
|
||||
meta = {
|
||||
# Configurations we don't want even the CI to evaluate. Results in the
|
||||
# "unsupported platform" messages. This is mostly a no-op, because
|
||||
# cudaPackages would've refused to evaluate anyway.
|
||||
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
|
||||
|
||||
# Configurations that are known to result in build failures. Can be
|
||||
# overridden by importing Nixpkgs with `allowBroken = true`.
|
||||
broken = (useMetalKit && !effectiveStdenv.isDarwin);
|
||||
|
||||
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
||||
homepage = "https://github.com/ggerganov/llama.cpp/";
|
||||
license = lib.licenses.mit;
|
||||
|
||||
# Accommodates `nix run` and `lib.getExe`
|
||||
mainProgram = "llama";
|
||||
|
||||
# These people might respond, on the best effort basis, if you ping them
|
||||
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
|
||||
# Consider adding yourself to this list if you want to ensure this flake
|
||||
# stays maintained and you're willing to invest your time. Do not add
|
||||
# other people without their consent. Consider removing people after
|
||||
# they've been unreachable for long periods of time.
|
||||
|
||||
# Note that lib.maintainers is defined in Nixpkgs, but you may just add
|
||||
# an attrset following the same format as in
|
||||
# https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
|
||||
maintainers = with lib.maintainers; [
|
||||
philiptaron
|
||||
SomeoneSerge
|
||||
];
|
||||
|
||||
# Extend `badPlatforms` instead
|
||||
platforms = lib.platforms.all;
|
||||
};
|
||||
}
|
||||
)
|
12
.devops/nix/scope.nix
Normal file
12
.devops/nix/scope.nix
Normal file
|
@ -0,0 +1,12 @@
|
|||
{
|
||||
lib,
|
||||
newScope,
|
||||
llamaVersion ? "0.0.0",
|
||||
}:
|
||||
|
||||
lib.makeScope newScope (
|
||||
self: {
|
||||
inherit llamaVersion;
|
||||
llama-cpp = self.callPackage ./package.nix { };
|
||||
}
|
||||
)
|
1
.github/workflows/build.yml
vendored
1
.github/workflows/build.yml
vendored
|
@ -515,7 +515,6 @@ jobs:
|
|||
- name: Build Xcode project
|
||||
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
|
||||
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
# steps:
|
||||
|
|
112
.github/workflows/nix-ci.yml
vendored
Normal file
112
.github/workflows/nix-ci.yml
vendored
Normal file
|
@ -0,0 +1,112 @@
|
|||
name: Nix CI
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
|
||||
|
||||
jobs:
|
||||
nix-eval:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ ubuntu-latest, macos-latest ]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: List all flake outputs
|
||||
run: nix flake show --all-systems
|
||||
- name: Show all output paths
|
||||
run: >
|
||||
nix run github:nix-community/nix-eval-jobs
|
||||
-- --gc-roots-dir gcroot
|
||||
--flake
|
||||
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
||||
nix-build:
|
||||
if: ${{ vars.CACHIX_NAME != '' }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ ubuntu-latest, macos-latest ]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: Set-up cachix to push the results to
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: ${{ vars.CACHIX_NAME }}
|
||||
- name: Build
|
||||
run: >
|
||||
nix run github:Mic92/nix-fast-build
|
||||
-- --skip-cached --no-nom
|
||||
--flake
|
||||
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
|
||||
nix-build-aarch64:
|
||||
if: ${{ vars.CACHIX_NAME != '' }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install QEMU
|
||||
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
|
||||
run: |
|
||||
sudo apt-get install -y qemu-user-static qemu-system-aarch64
|
||||
sudo usermod -a -G kvm $USER
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@v9
|
||||
with:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
extra-conf: |
|
||||
extra-platforms = aarch64-linux
|
||||
extra-system-features = nixos-test kvm
|
||||
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
|
||||
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@v2
|
||||
with:
|
||||
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
|
||||
- name: Set-up cachix to push the results to
|
||||
uses: cachix/cachix-action@v13
|
||||
with:
|
||||
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
|
||||
name: ${{ vars.CACHIX_NAME }}
|
||||
- name: Show all output paths
|
||||
run: >
|
||||
nix run github:nix-community/nix-eval-jobs
|
||||
-- --gc-roots-dir gcroot
|
||||
--flake
|
||||
".#packages.aarch64-linux"
|
||||
- name: Build
|
||||
run: >
|
||||
nix run github:Mic92/nix-fast-build
|
||||
-- --skip-cached --no-nom
|
||||
--systems aarch64-linux
|
||||
--flake
|
||||
".#checks.aarch64-linux"
|
22
.github/workflows/nix-flake-update.yml
vendored
Normal file
22
.github/workflows/nix-flake-update.yml
vendored
Normal file
|
@ -0,0 +1,22 @@
|
|||
name: update-flake-lock
|
||||
on:
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
- cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
|
||||
|
||||
jobs:
|
||||
lockfile:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
- name: Install Nix
|
||||
uses: DeterminateSystems/nix-installer-action@main
|
||||
- name: Update flake.lock
|
||||
uses: DeterminateSystems/update-flake-lock@main
|
||||
with:
|
||||
pr-title: "nix: update flake.lock"
|
||||
pr-labels: |
|
||||
nix
|
||||
pr-reviewers: philiptaron,SomeoneSerge
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
36
.github/workflows/nix-publish-flake.yml
vendored
Normal file
36
.github/workflows/nix-publish-flake.yml
vendored
Normal file
|
@ -0,0 +1,36 @@
|
|||
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
|
||||
name: "Publish a flake to flakestry & flakehub"
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "*"
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
tag:
|
||||
description: "The existing tag to publish"
|
||||
type: "string"
|
||||
required: true
|
||||
jobs:
|
||||
flakestry-publish:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
id-token: "write"
|
||||
contents: "read"
|
||||
steps:
|
||||
- uses: flakestry/flakestry-publish@main
|
||||
with:
|
||||
version: "${{ inputs.tag || github.ref_name }}"
|
||||
flakehub-publish:
|
||||
runs-on: "ubuntu-latest"
|
||||
permissions:
|
||||
id-token: "write"
|
||||
contents: "read"
|
||||
steps:
|
||||
- uses: "actions/checkout@v4"
|
||||
with:
|
||||
ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
|
||||
- uses: "DeterminateSystems/nix-installer-action@main"
|
||||
- uses: "DeterminateSystems/flakehub-push@main"
|
||||
with:
|
||||
visibility: "public"
|
||||
tag: "${{ inputs.tag }}"
|
29
.github/workflows/python-check-requirements.yml
vendored
Normal file
29
.github/workflows/python-check-requirements.yml
vendored
Normal file
|
@ -0,0 +1,29 @@
|
|||
name: Python check requirements.txt
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'scripts/check-requirements.sh'
|
||||
- 'convert*.py'
|
||||
- 'requirements.txt'
|
||||
- 'requirements/*.txt'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'scripts/check-requirements.sh'
|
||||
- 'convert*.py'
|
||||
- 'requirements.txt'
|
||||
- 'requirements/*.txt'
|
||||
|
||||
jobs:
|
||||
python-check-requirements:
|
||||
runs-on: ubuntu-latest
|
||||
name: check-requirements
|
||||
steps:
|
||||
- name: Check out source repository
|
||||
uses: actions/checkout@v3
|
||||
- name: Set up Python environment
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.11"
|
||||
- name: Run check-requirements.sh script
|
||||
run: bash scripts/check-requirements.sh nocleanup
|
21
README.md
21
README.md
|
@ -103,6 +103,7 @@ as the main playground for developing new features for the [ggml](https://github
|
|||
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
|
||||
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
|
||||
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
|
||||
- [x] [GPT-2](https://huggingface.co/gpt2)
|
||||
|
||||
**Multimodal models:**
|
||||
|
||||
|
@ -384,17 +385,31 @@ Building the program with BLAS support may lead to some performance improvements
|
|||
|
||||
Check [BLIS.md](docs/BLIS.md) for more information.
|
||||
|
||||
- #### Intel MKL
|
||||
- #### Intel oneMKL
|
||||
- Using manual oneAPI installation:
|
||||
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-runtime docker image, only required for manual installation
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
|
||||
- Using oneAPI docker image:
|
||||
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-runtime](https://hub.docker.com/r/intel/oneapi-runtime)
|
||||
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni.
|
||||
|
||||
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
|
||||
|
||||
- #### cuBLAS
|
||||
|
||||
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
|
||||
|
|
|
@ -65,4 +65,4 @@ endif()
|
|||
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
||||
target_link_libraries(${TARGET} PRIVATE llama build_info)
|
||||
target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
|
||||
|
|
|
@ -1394,6 +1394,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
|||
fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
|
||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||
|
|
|
@ -182,6 +182,8 @@ class Model:
|
|||
return QwenModel
|
||||
if model_architecture == "MixtralForCausalLM":
|
||||
return MixtralModel
|
||||
if model_architecture == "GPT2LMHeadModel":
|
||||
return GPT2Model
|
||||
if model_architecture == "PhiForCausalLM":
|
||||
return Phi2Model
|
||||
if model_architecture == "PlamoForCausalLM":
|
||||
|
@ -225,6 +227,8 @@ class Model:
|
|||
return gguf.MODEL_ARCH.QWEN
|
||||
if arch == "MixtralForCausalLM":
|
||||
return gguf.MODEL_ARCH.LLAMA
|
||||
if arch == "GPT2LMHeadModel":
|
||||
return gguf.MODEL_ARCH.GPT2
|
||||
if arch == "PhiForCausalLM":
|
||||
return gguf.MODEL_ARCH.PHI2
|
||||
if arch == "PlamoForCausalLM":
|
||||
|
@ -238,7 +242,7 @@ class Model:
|
|||
tokens: list[bytearray] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
from transformers import AutoTokenizer # type: ignore[attr-defined]
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
@ -852,7 +856,7 @@ class StableLMModel(Model):
|
|||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
self.gguf_writer.add_name(dir_model.name)
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
|
@ -898,7 +902,7 @@ class QwenModel(Model):
|
|||
tokens: list[bytearray] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
from transformers import AutoTokenizer # type: ignore[attr-defined]
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
|
||||
vocab_size = hparams["vocab_size"]
|
||||
assert max(tokenizer.get_vocab().values()) < vocab_size
|
||||
|
@ -993,6 +997,68 @@ class QwenModel(Model):
|
|||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
class GPT2Model(Model):
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_block_count(self.hparams["n_layer"])
|
||||
self.gguf_writer.add_context_length(self.hparams["n_ctx"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
||||
self.gguf_writer.add_head_count(self.hparams["n_head"])
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def write_tensors(self):
|
||||
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||
|
||||
for name, data_torch in self.get_tensors():
|
||||
# we don't need these
|
||||
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")):
|
||||
continue
|
||||
|
||||
if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
|
||||
data_torch = data_torch.transpose(1, 0)
|
||||
|
||||
old_dtype = data_torch.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
|
||||
data = data_torch.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if self.ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
# note: GPT2 output is tied to (same as) wte in original model
|
||||
if new_name == "token_embd.weight":
|
||||
print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
|
||||
|
||||
class Phi2Model(Model):
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["n_layer"]
|
||||
|
@ -1119,6 +1185,7 @@ def parse_args() -> argparse.Namespace:
|
|||
return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> None:
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
|
@ -1173,3 +1240,7 @@ with torch.inference_mode():
|
|||
model_instance.write()
|
||||
|
||||
print(f"Model successfully exported to '{fname_out}'")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
|
@ -47,6 +47,7 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
|
|||
fout.seek((fout.tell() + 31) & -32)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) < 2:
|
||||
print(f"Usage: python {sys.argv[0]} <path> [arch]")
|
||||
print(
|
||||
|
|
1
convert-persimmon-to-gguf.py
Normal file → Executable file
1
convert-persimmon-to-gguf.py
Normal file → Executable file
|
@ -1,3 +1,4 @@
|
|||
#!/usr/bin/env python3
|
||||
import torch
|
||||
import os
|
||||
from pprint import pprint
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
import Foundation
|
||||
|
||||
// To use this in your own project, add llama.cpp as a swift package dependency
|
||||
// and uncomment this import line.
|
||||
// import llama
|
||||
|
||||
enum LlamaError: Error {
|
||||
|
|
|
@ -4,6 +4,7 @@ import Foundation
|
|||
class LlamaState: ObservableObject {
|
||||
@Published var messageLog = ""
|
||||
@Published var cacheCleared = false
|
||||
let NS_PER_S = 1_000_000_000.0
|
||||
|
||||
private var llamaContext: LlamaContext?
|
||||
private var defaultModelUrl: URL? {
|
||||
|
@ -20,12 +21,12 @@ class LlamaState: ObservableObject {
|
|||
}
|
||||
|
||||
func loadModel(modelUrl: URL?) throws {
|
||||
messageLog += "Loading model...\n"
|
||||
if let modelUrl {
|
||||
messageLog += "Loading model...\n"
|
||||
llamaContext = try LlamaContext.create_context(path: modelUrl.path())
|
||||
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
|
||||
} else {
|
||||
messageLog += "Could not locate model\n"
|
||||
messageLog += "Load a model from the list below\n"
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -34,15 +35,29 @@ class LlamaState: ObservableObject {
|
|||
return
|
||||
}
|
||||
|
||||
let t_start = DispatchTime.now().uptimeNanoseconds
|
||||
await llamaContext.completion_init(text: text)
|
||||
let t_heat_end = DispatchTime.now().uptimeNanoseconds
|
||||
let t_heat = Double(t_heat_end - t_start) / NS_PER_S
|
||||
|
||||
messageLog += "\(text)"
|
||||
|
||||
while await llamaContext.n_cur <= llamaContext.n_len {
|
||||
while await llamaContext.n_cur < llamaContext.n_len {
|
||||
let result = await llamaContext.completion_loop()
|
||||
messageLog += "\(result)"
|
||||
}
|
||||
|
||||
let t_end = DispatchTime.now().uptimeNanoseconds
|
||||
let t_generation = Double(t_end - t_heat_end) / NS_PER_S
|
||||
let tokens_per_second = Double(await llamaContext.n_len) / t_generation
|
||||
|
||||
await llamaContext.clear()
|
||||
messageLog += "\n\ndone\n"
|
||||
messageLog += """
|
||||
\n
|
||||
Done
|
||||
Heat up took \(t_heat)s
|
||||
Generated \(tokens_per_second) t/s\n
|
||||
"""
|
||||
}
|
||||
|
||||
func bench() async {
|
||||
|
@ -56,10 +71,10 @@ class LlamaState: ObservableObject {
|
|||
messageLog += await llamaContext.model_info() + "\n"
|
||||
|
||||
let t_start = DispatchTime.now().uptimeNanoseconds
|
||||
await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
|
||||
let _ = await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
|
||||
let t_end = DispatchTime.now().uptimeNanoseconds
|
||||
|
||||
let t_heat = Double(t_end - t_start) / 1_000_000_000.0
|
||||
let t_heat = Double(t_end - t_start) / NS_PER_S
|
||||
messageLog += "Heat up time: \(t_heat) seconds, please wait...\n"
|
||||
|
||||
// if more than 5 seconds, then we're probably running on a slow device
|
||||
|
|
|
@ -42,46 +42,27 @@ struct ContentView: View {
|
|||
Button("Send") {
|
||||
sendText()
|
||||
}
|
||||
.padding(8)
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
|
||||
Button("Bench") {
|
||||
bench()
|
||||
}
|
||||
.padding(8)
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
|
||||
Button("Clear") {
|
||||
clear()
|
||||
}
|
||||
.padding(8)
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
|
||||
Button("Copy") {
|
||||
UIPasteboard.general.string = llamaState.messageLog
|
||||
}
|
||||
.padding(8)
|
||||
.background(Color.blue)
|
||||
.foregroundColor(.white)
|
||||
.cornerRadius(8)
|
||||
}
|
||||
}.buttonStyle(.bordered)
|
||||
|
||||
VStack {
|
||||
VStack(alignment: .leading) {
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",
|
||||
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
.padding(.top, 4)
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
|
@ -89,7 +70,6 @@ struct ContentView: View {
|
|||
modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
|
@ -97,8 +77,6 @@ struct ContentView: View {
|
|||
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
|
||||
filename: "tinyllama-1.1b-f16.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
|
@ -106,7 +84,6 @@ struct ContentView: View {
|
|||
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
|
||||
filename: "phi-2-q4_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
|
@ -114,8 +91,6 @@ struct ContentView: View {
|
|||
modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
|
||||
filename: "phi-2-q8_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
|
||||
DownloadButton(
|
||||
llamaState: llamaState,
|
||||
|
@ -123,15 +98,15 @@ struct ContentView: View {
|
|||
modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
|
||||
filename: "mistral-7b-v0.1.Q4_0.gguf"
|
||||
)
|
||||
.font(.system(size: 12))
|
||||
|
||||
Button("Clear downloaded models") {
|
||||
ContentView.cleanupModelCaches()
|
||||
llamaState.cacheCleared = true
|
||||
}
|
||||
.padding(8)
|
||||
.font(.system(size: 12))
|
||||
}
|
||||
.padding(.top, 4)
|
||||
.font(.system(size: 12))
|
||||
.frame(maxWidth: .infinity, alignment: .leading)
|
||||
}
|
||||
.padding()
|
||||
}
|
||||
|
|
|
@ -93,7 +93,7 @@ struct DownloadButton: View {
|
|||
print("Error: \(err.localizedDescription)")
|
||||
}
|
||||
}) {
|
||||
Text("\(modelName) (Downloaded)")
|
||||
Text("Load \(modelName)")
|
||||
}
|
||||
} else {
|
||||
Text("Unknown status")
|
||||
|
|
|
@ -25,6 +25,7 @@ endif()
|
|||
if (NOT MSVC)
|
||||
target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
|
||||
endif()
|
||||
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(llava BUILD_INFO)
|
||||
endif()
|
||||
|
@ -32,5 +33,5 @@ endif()
|
|||
set(TARGET llava-cli)
|
||||
add_executable(llava-cli llava-cli.cpp)
|
||||
install(TARGETS llava-cli RUNTIME)
|
||||
target_link_libraries(llava-cli PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(llava PRIVATE cxx_std_11)
|
||||
|
|
|
@ -16,12 +16,19 @@
|
|||
#include "clip.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
|
||||
#define CLIP_DEBUG
|
||||
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
va_list ap2;
|
||||
|
@ -139,6 +146,27 @@ static std::string get_ftype(int ftype) {
|
|||
}
|
||||
}
|
||||
|
||||
//
|
||||
// image data
|
||||
//
|
||||
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8 {
|
||||
int nx;
|
||||
int ny;
|
||||
|
||||
std::vector<uint8_t> buf;
|
||||
};
|
||||
|
||||
// RGB float32 image (NHWC)
|
||||
// Memory layout: RGBRGBRGB...
|
||||
struct clip_image_f32 {
|
||||
int nx;
|
||||
int ny;
|
||||
|
||||
std::vector<float> buf;
|
||||
};
|
||||
|
||||
//
|
||||
// clip layers
|
||||
//
|
||||
|
@ -196,39 +224,31 @@ struct clip_vision_model {
|
|||
struct ggml_tensor * mm_2_b;
|
||||
};
|
||||
|
||||
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
|
||||
struct clip_buffer {
|
||||
uint8_t * data = NULL;
|
||||
size_t size = 0;
|
||||
|
||||
void resize(size_t size) {
|
||||
delete[] data;
|
||||
data = new uint8_t[size];
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
~clip_buffer() { delete[] data; }
|
||||
};
|
||||
|
||||
struct clip_ctx {
|
||||
bool has_text_encoder = false;
|
||||
bool has_vision_encoder = false;
|
||||
bool has_llava_projector = false;
|
||||
|
||||
struct clip_vision_model vision_model;
|
||||
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
bool use_gelu = false;
|
||||
int32_t ftype = 1;
|
||||
struct ggml_context * ctx;
|
||||
|
||||
struct gguf_context * ctx_gguf;
|
||||
struct ggml_context * ctx_data;
|
||||
|
||||
std::vector<uint8_t> buf_compute_meta;
|
||||
|
||||
// memory buffers to evaluate the model
|
||||
clip_buffer buf_compute;
|
||||
clip_buffer buf_alloc;
|
||||
ggml_allocr * alloc = NULL;
|
||||
ggml_backend_buffer_t params_buffer = NULL;
|
||||
ggml_backend_buffer_t compute_buffer = NULL;
|
||||
ggml_backend_t backend = NULL;
|
||||
ggml_allocr * compute_alloc = NULL;
|
||||
};
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
return nullptr;
|
||||
|
@ -253,24 +273,20 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
GGML_ASSERT(batch_size == 1);
|
||||
}
|
||||
|
||||
const auto & buf_compute = ctx->buf_compute;
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute.size,
|
||||
/*.mem_buffer =*/ buf_compute.data,
|
||||
/*.no_alloc =*/ false,
|
||||
/*.mem_size =*/ ctx->buf_compute_meta.size(),
|
||||
/*.mem_buffer =*/ ctx->buf_compute_meta.data(),
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
params.no_alloc = true;
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
|
||||
ggml_allocr_alloc(ctx->alloc, inp_raw);
|
||||
ggml_allocr_alloc(ctx->compute_alloc, inp_raw);
|
||||
|
||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
||||
float * data = (float *)ggml_get_data(inp_raw);
|
||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||
float * data = (float *)malloc(ggml_nbytes(inp_raw));
|
||||
|
||||
for (size_t i = 0; i < imgs->size; i++) {
|
||||
const int nx = imgs->data[i].nx;
|
||||
|
@ -283,12 +299,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
for (int k = 0; k < 3; k++) {
|
||||
for (int y = 0; y < ny; y++) {
|
||||
for (int x = 0; x < nx; x++) {
|
||||
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].data[3 * (y * nx + x) + k];
|
||||
data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
|
||||
free(data);
|
||||
}
|
||||
|
||||
struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
|
@ -298,36 +316,39 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
|
||||
// concat class_embeddings and patch_embeddings
|
||||
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||
ggml_allocr_alloc(ctx->alloc, embeddings);
|
||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
||||
ggml_set_zero(embeddings);
|
||||
ggml_allocr_alloc(ctx->compute_alloc, embeddings);
|
||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||
void* zero_mem = malloc(ggml_nbytes(embeddings));
|
||||
memset(zero_mem, 0, ggml_nbytes(embeddings));
|
||||
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
|
||||
free(zero_mem);
|
||||
}
|
||||
|
||||
struct ggml_tensor * temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size);
|
||||
ggml_allocr_alloc(ctx->alloc, temp);
|
||||
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
||||
|
||||
embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1],
|
||||
embeddings->nb[2], embeddings->nb[3], 0);
|
||||
embeddings =
|
||||
ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
embeddings = ggml_acc(ctx0, embeddings, inp,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
|
||||
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
||||
ggml_allocr_alloc(ctx->alloc, positions);
|
||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
||||
ggml_allocr_alloc(ctx->compute_alloc, positions);
|
||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
||||
for (int i = 0; i < num_positions; i++) {
|
||||
ggml_set_i32_1d(positions, i, i);
|
||||
positions_data[i] = i;
|
||||
}
|
||||
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||
free(positions_data);
|
||||
}
|
||||
|
||||
embeddings =
|
||||
ggml_add(ctx0, embeddings, ggml_repeat(ctx0, ggml_get_rows(ctx0, model.position_embeddings, positions), embeddings));
|
||||
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
|
||||
|
||||
// pre-layernorm
|
||||
{
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.pre_ln_w, embeddings), embeddings),
|
||||
ggml_repeat(ctx0, model.pre_ln_b, embeddings));
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
|
||||
}
|
||||
|
||||
// loop over layers
|
||||
|
@ -340,15 +361,15 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
{
|
||||
cur = ggml_norm(ctx0, cur, eps);
|
||||
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_w, cur), cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
|
||||
model.layers[il].ln_1_b);
|
||||
}
|
||||
|
||||
// self-attention
|
||||
{
|
||||
|
||||
struct ggml_tensor * Q =
|
||||
ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), ggml_mul_mat(ctx0, model.layers[il].q_w, cur));
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
|
||||
|
||||
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
|
||||
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
|
||||
|
@ -356,14 +377,14 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);
|
||||
|
||||
struct ggml_tensor * K =
|
||||
ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), ggml_mul_mat(ctx0, model.layers[il].k_w, cur));
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
|
||||
|
||||
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
|
||||
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
||||
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
|
||||
|
||||
struct ggml_tensor * V =
|
||||
ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), ggml_mul_mat(ctx0, model.layers[il].v_w, cur));
|
||||
ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
|
||||
|
||||
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
|
||||
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
||||
|
@ -379,7 +400,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
}
|
||||
|
||||
// attention output
|
||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].o_b, cur), ggml_mul_mat(ctx0, model.layers[il].o_w, cur));
|
||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
|
||||
|
||||
// re-add the layer input, e.g., residual
|
||||
cur = ggml_add(ctx0, cur, embeddings);
|
||||
|
@ -390,12 +411,11 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
{
|
||||
cur = ggml_norm(ctx0, cur, eps);
|
||||
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_w, cur), cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
|
||||
}
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
|
||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), cur);
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
|
||||
|
||||
if (ctx->use_gelu) {
|
||||
cur = ggml_gelu_inplace(ctx0, cur);
|
||||
|
@ -404,7 +424,7 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
}
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
|
||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), cur);
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
|
||||
|
||||
// residual 2
|
||||
cur = ggml_add(ctx0, embeddings, cur);
|
||||
|
@ -417,23 +437,26 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
||||
|
||||
struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
|
||||
ggml_allocr_alloc(ctx->alloc, patches);
|
||||
if (!ggml_allocr_is_measure(ctx->alloc)) {
|
||||
for (int i = 0; i < num_patches; ++i) {
|
||||
ggml_set_i32_1d(patches, i, i+1);
|
||||
ggml_allocr_alloc(ctx->compute_alloc, patches);
|
||||
if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
|
||||
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||
for (int i = 0; i < num_patches; i++) {
|
||||
patches_data[i] = i + 1;
|
||||
}
|
||||
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
||||
free(patches_data);
|
||||
}
|
||||
|
||||
embeddings = ggml_get_rows(ctx0, embeddings, patches);
|
||||
|
||||
// mm projection 0
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_0_b, embeddings), embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_2_b, embeddings), embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
|
@ -446,7 +469,6 @@ static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_ima
|
|||
|
||||
// read and create ggml_context containing the tensors and their data
|
||||
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
|
||||
struct ggml_context * meta = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
|
@ -479,7 +501,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
printf("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
// kv
|
||||
if (verbosity >= 3) {
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
|
@ -493,27 +515,38 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
}
|
||||
|
||||
// data
|
||||
size_t ctx_size = 0;
|
||||
size_t buffer_size = 0;
|
||||
{
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||
|
||||
struct ggml_tensor * cur = ggml_get_tensor(meta, name);
|
||||
ctx_size += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
|
||||
size_t tensor_size = ggml_nbytes(cur);
|
||||
size_t padded_size = ggml_nbytes_pad(cur);
|
||||
ctx_size += padded_size;
|
||||
buffer_size += tensor_size;
|
||||
if (verbosity >= 3) {
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i,
|
||||
ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset);
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i,
|
||||
ggml_n_dims(cur), cur->name, tensor_size, offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buffer_size += n_tensors * 128 /* CLIP PADDING */;
|
||||
|
||||
clip_ctx * new_clip = new clip_ctx;
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
new_clip->backend = ggml_backend_cuda_init(0);
|
||||
printf("%s: CLIP using CUDA backend\n", __func__);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
new_clip->backend = ggml_backend_metal_init();
|
||||
printf("%s: CLIP using Metal backend\n", __func__);
|
||||
#endif
|
||||
|
||||
if (!new_clip->backend) {
|
||||
new_clip->backend = ggml_backend_cpu_init();
|
||||
printf("%s: CLIP using CPU backend\n", __func__);
|
||||
}
|
||||
|
||||
// model size and capabilities
|
||||
{
|
||||
|
@ -539,21 +572,24 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||
printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||
printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||
printf("%s: model size: %.2f MB\n", __func__, (ctx_size / 1024.0 / 1024.0));
|
||||
printf("%s: model size: %.2f MB\n", __func__, buffer_size / 1024.0 / 1024.0);
|
||||
printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
|
||||
printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, buffer_size / (1024.0 * 1024.0), n_tensors);
|
||||
|
||||
// load tensors
|
||||
{
|
||||
std::vector<uint8_t> read_buf;
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx_size,
|
||||
/*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ false,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
new_clip->ctx = ggml_init(params);
|
||||
if (!new_clip->ctx) {
|
||||
new_clip->ctx_data = ggml_init(params);
|
||||
if (!new_clip->ctx_data) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
clip_free(new_clip);
|
||||
return nullptr;
|
||||
|
@ -566,13 +602,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
// add tensors to context
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor * t = ggml_get_tensor(meta, name);
|
||||
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t);
|
||||
struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
|
||||
ggml_set_name(cur, name);
|
||||
}
|
||||
|
||||
// alloc memory and offload data
|
||||
new_clip->params_buffer = ggml_backend_alloc_buffer(new_clip->backend, buffer_size);
|
||||
ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer);
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
|
||||
ggml_allocr_alloc(alloc, cur);
|
||||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||
fin.seekg(offset, std::ios::beg);
|
||||
if (!fin) {
|
||||
|
@ -580,10 +624,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
clip_free(new_clip);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char *>(cur->data), ggml_nbytes(t));
|
||||
int num_bytes = ggml_nbytes(cur);
|
||||
if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
|
||||
// for the CPU and Metal backend, we can read directly into the tensor
|
||||
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
|
||||
} else {
|
||||
// read into a temporary buffer first, then copy to device memory
|
||||
read_buf.resize(num_bytes);
|
||||
fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
|
||||
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
|
||||
}
|
||||
ggml_allocr_free(alloc);
|
||||
fin.close();
|
||||
}
|
||||
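The load-tensors hunks above split weight loading into a metadata pass and a data pass; condensed into one sketch (using only calls that appear in this diff, with the actual file reads and error handling omitted), the new flow looks roughly like this:

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Sketch: mirror the gguf tensors into a no_alloc context, then place them in a
// single backend buffer. Parameter names and the helper itself are illustrative.
static ggml_context * alloc_weights(gguf_context * ctx, ggml_context * meta,
                                    ggml_backend_t backend, size_t buffer_size,
                                    ggml_backend_buffer_t * out_buffer) {
    const int n_tensors = gguf_get_n_tensors(ctx);

    struct ggml_init_params params = {
        /*.mem_size   =*/ (n_tensors + 1) * ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // metadata only; the data lives in the backend buffer
    };
    ggml_context * ctx_data = ggml_init(params);

    // pass 1: create the tensor metadata from the gguf description
    for (int i = 0; i < n_tensors; ++i) {
        const char * name = gguf_get_tensor_name(ctx, i);
        struct ggml_tensor * cur = ggml_dup_tensor(ctx_data, ggml_get_tensor(meta, name));
        ggml_set_name(cur, name);
    }

    // pass 2: allocate one backend buffer for all weights and place the tensors in it
    *out_buffer = ggml_backend_alloc_buffer(backend, buffer_size);
    ggml_allocr * alloc = ggml_allocr_new_from_buffer(*out_buffer);
    for (int i = 0; i < n_tensors; ++i) {
        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, gguf_get_tensor_name(ctx, i));
        ggml_allocr_alloc(alloc, cur);
        // the data is then read from the file: directly into cur->data for host
        // buffers, or via a temporary buffer plus ggml_backend_tensor_set() otherwise
    }
    ggml_allocr_free(alloc);

    return ctx_data;
}
```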
|
||||
|
@ -619,35 +671,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
printf("v_n_layer %d\n", hparams.n_layer);
|
||||
}
|
||||
|
||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx, TN_PATCH_EMBD);
|
||||
vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD);
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v"));
|
||||
vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias"));
|
||||
vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
|
||||
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
|
||||
vision_model.layers.resize(hparams.n_layer);
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
auto & layer = vision_model.layers[il];
|
||||
layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "weight"));
|
||||
layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "weight"));
|
||||
layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "weight"));
|
||||
layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "weight"));
|
||||
layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "weight"));
|
||||
layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "weight"));
|
||||
layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "weight"));
|
||||
layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "weight"));
|
||||
layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "bias"));
|
||||
layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "bias"));
|
||||
layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "bias"));
|
||||
layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "bias"));
|
||||
layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "bias"));
|
||||
layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "bias"));
|
||||
layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "bias"));
|
||||
layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "bias"));
|
||||
layer.k_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "weight"));
|
||||
layer.q_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "weight"));
|
||||
layer.v_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "weight"));
|
||||
layer.o_w = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
|
||||
layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "weight"));
|
||||
layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "weight"));
|
||||
layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "weight"));
|
||||
layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "weight"));
|
||||
layer.k_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_K, "v", il, "bias"));
|
||||
layer.q_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q, "v", il, "bias"));
|
||||
layer.v_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_V, "v", il, "bias"));
|
||||
layer.o_b = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
|
||||
layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1, "v", il, "bias"));
|
||||
layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2, "v", il, "bias"));
|
||||
layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN, "v", il, "bias"));
|
||||
layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP, "v", il, "bias"));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -657,43 +709,43 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
|
||||
// measure mem requirement and allocate
|
||||
{
|
||||
static const size_t tensor_alignment = 32;
|
||||
new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
|
||||
new_clip->alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||
new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
|
||||
new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
|
||||
clip_image_f32_batch batch;
|
||||
batch.size = 1;
|
||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
|
||||
size_t alloc_size = ggml_allocr_alloc_graph(new_clip->alloc, gf) + tensor_alignment;
|
||||
ggml_allocr_free(new_clip->alloc);
|
||||
new_clip->buf_alloc.resize(alloc_size);
|
||||
new_clip->alloc = ggml_allocr_new(new_clip->buf_alloc.data, new_clip->buf_alloc.size, tensor_alignment);
|
||||
size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(new_clip->compute_alloc, gf);
|
||||
ggml_allocr_free(new_clip->compute_alloc);
|
||||
new_clip->compute_buffer = ggml_backend_alloc_buffer(new_clip->backend, compute_memory_buffer_size);
|
||||
new_clip->compute_alloc = ggml_allocr_new_from_buffer(new_clip->compute_buffer);
|
||||
|
||||
printf("%s: total allocated memory: %.2f MB\n", __func__, (new_clip->buf_compute.size + alloc_size)/1024.0/1024.0);
|
||||
printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||
}
|
||||
|
||||
return new_clip;
|
||||
}
|
||||
|
||||
clip_image_u8 * make_clip_image_u8() {
|
||||
auto img = new clip_image_u8();
|
||||
return img;
|
||||
struct clip_image_u8 * clip_image_u8_init() {
|
||||
return new clip_image_u8();
|
||||
}
|
||||
clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }
|
||||
|
||||
void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; }
|
||||
void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; }
|
||||
struct clip_image_f32 * clip_image_f32_init() {
|
||||
return new clip_image_f32();
|
||||
}
|
||||
|
||||
void clip_image_u8_free (struct clip_image_u8 * img) { delete img; }
|
||||
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
|
||||
|
||||
static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
|
||||
img->nx = nx;
|
||||
img->ny = ny;
|
||||
img->size = nx * ny * 3;
|
||||
img->data = new uint8_t[img->size]();
|
||||
memcpy(img->data, data, img->size);
|
||||
img->buf.resize(3 * nx * ny);
|
||||
memcpy(img->buf.data(), data, img->buf.size());
|
||||
}
|
||||
|
||||
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
||||
int nx, ny, nc;
|
||||
auto data = stbi_load(fname, &nx, &ny, &nc, 3);
|
||||
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
|
||||
return false;
|
||||
|
@ -705,7 +757,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
|||
|
||||
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
|
||||
int nx, ny, nc;
|
||||
auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
|
||||
return false;
|
||||
|
@ -717,7 +769,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
|||
|
||||
// normalize: x = (x - mean) / std
|
||||
// TODO: implement bicubic interpolation instead of linear.
|
||||
bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
|
@ -726,18 +778,17 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
|||
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
|
||||
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
|
||||
|
||||
clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily
|
||||
clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
|
||||
if (pad2square && img->nx != img->ny) {
|
||||
int longer_side = std::max(img->nx, img->ny);
|
||||
temp->nx = longer_side;
|
||||
temp->ny = longer_side;
|
||||
temp->size = 3 * longer_side * longer_side;
|
||||
temp->data = new uint8_t[temp->size]();
|
||||
uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
|
||||
temp->buf.resize(3 * longer_side * longer_side);
|
||||
const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
|
||||
|
||||
// fill with background color
|
||||
for (size_t i = 0; i < temp->size; i++) {
|
||||
temp->data[i] = bc[i % 3];
|
||||
for (size_t i = 0; i < temp->buf.size(); i++) {
|
||||
temp->buf[i] = bc[i % 3];
|
||||
}
|
||||
|
||||
// copy from the input image
|
||||
|
@ -745,17 +796,16 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
|||
for (int x = 0; x < img->nx; x++) {
|
||||
const int i = 3 * (y * img->nx + x);
|
||||
const int j = 3 * (y * temp->nx + x);
|
||||
temp->data[j] = img->data[i];
|
||||
temp->data[j+1] = img->data[i+1];
|
||||
temp->data[j+2] = img->data[i+2];
|
||||
temp->buf[j] = img->buf[i];
|
||||
temp->buf[j+1] = img->buf[i+1];
|
||||
temp->buf[j+2] = img->buf[i+2];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
temp->nx = img->nx;
|
||||
temp->ny = img->ny;
|
||||
temp->size = img->size;
|
||||
temp->data = new uint8_t[temp->size]();
|
||||
memcpy(&temp->data[0], &img->data[0], temp->size); // copy
|
||||
temp->buf.resize(img->buf.size());
|
||||
memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
|
||||
}
|
||||
|
||||
const int nx = temp->nx;
|
||||
|
@ -766,8 +816,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
|||
|
||||
res->nx = nx2;
|
||||
res->ny = ny2;
|
||||
res->size = 3 * nx2 * ny2;
|
||||
res->data = new float[res->size]();
|
||||
res->buf.resize(3 * nx2 * ny2);
|
||||
|
||||
const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;
|
||||
|
||||
|
@ -798,10 +847,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
|||
const int j10 = 3 * (y1 * nx + x0) + c;
|
||||
const int j11 = 3 * (y1 * nx + x1) + c;
|
||||
|
||||
const float v00 = temp->data[j00];
|
||||
const float v01 = temp->data[j01];
|
||||
const float v10 = temp->data[j10];
|
||||
const float v11 = temp->data[j11];
|
||||
const float v00 = temp->buf[j00];
|
||||
const float v01 = temp->buf[j01];
|
||||
const float v10 = temp->buf[j10];
|
||||
const float v11 = temp->buf[j11];
|
||||
|
||||
const float v0 = v00 * (1.0f - dx) + v01 * dx;
|
||||
const float v1 = v10 * (1.0f - dx) + v11 * dx;
|
||||
|
@ -812,7 +861,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
|||
|
||||
const int i = 3 * (y * nx3 + x) + c;
|
||||
|
||||
res->data[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
|
||||
res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -822,12 +871,13 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
|||
}
|
||||
|
||||
void clip_free(clip_ctx * ctx) {
|
||||
ggml_free(ctx->ctx);
|
||||
ggml_free(ctx->ctx_data);
|
||||
gguf_free(ctx->ctx_gguf);
|
||||
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
|
@ -839,8 +889,7 @@ bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32
|
|||
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
|
||||
}
|
||||
|
||||
bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
||||
|
||||
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
|
@ -852,29 +901,29 @@ bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const cl
|
|||
}
|
||||
|
||||
// reset alloc buffer to clean the memory from previous invocations
|
||||
ggml_allocr_reset(ctx->alloc);
|
||||
ggml_allocr_reset(ctx->compute_alloc);
|
||||
|
||||
// build the inference graph
|
||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
||||
ggml_allocr_alloc_graph(ctx->alloc, gf);
|
||||
ggml_allocr_alloc_graph(ctx->compute_alloc, gf);
|
||||
|
||||
struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
plan.work_data = (uint8_t *)malloc(plan.work_size);
|
||||
if (ggml_backend_is_cpu(ctx->backend)) {
|
||||
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
||||
}
|
||||
|
||||
ggml_graph_compute(gf, &plan);
|
||||
#ifdef GGML_USE_METAL
|
||||
if (ggml_backend_is_metal(ctx->backend)) {
|
||||
ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
|
||||
}
|
||||
#endif
|
||||
|
||||
ggml_backend_graph_compute(ctx->backend, gf);
|
||||
|
||||
// the last node is the embedding tensor
|
||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
||||
|
||||
// copy the embeddings to the location passed by the user
|
||||
memcpy(vec, ggml_get_data_f32(embeddings), ggml_nbytes(embeddings));
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
free(plan.work_data);
|
||||
}
|
||||
|
||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -903,11 +952,12 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||
return false;
|
||||
};
|
||||
|
||||
auto ctx_clip = clip_model_load(fname_inp, 2);
|
||||
const auto & ctx_src = ctx_clip->ctx_gguf;
|
||||
const auto & ctx_data = ctx_clip->ctx;
|
||||
auto * ctx_clip = clip_model_load(fname_inp, 2);
|
||||
|
||||
auto ctx_out = gguf_init_empty();
|
||||
const auto & ctx_src = ctx_clip->ctx_gguf;
|
||||
const auto & ctx_data = ctx_clip->ctx_data;
|
||||
|
||||
auto * ctx_out = gguf_init_empty();
|
||||
gguf_set_kv(ctx_out, ctx_src);
|
||||
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
|
||||
gguf_set_val_u32(ctx_out, "general.file_type", itype);
|
||||
|
|
|
@ -35,31 +35,14 @@ struct clip_vision_hparams {
|
|||
float eps;
|
||||
};
|
||||
|
||||
/** load mmproj model */
|
||||
CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
|
||||
/** free mmproj model */
|
||||
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
|
||||
|
||||
CLIP_API void clip_free(struct clip_ctx * ctx);
|
||||
|
||||
size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
||||
int clip_n_patches(const struct clip_ctx * ctx);
|
||||
int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
|
||||
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8 {
|
||||
int nx;
|
||||
int ny;
|
||||
uint8_t * data = NULL;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
// RGB float32 image (NHWC)
|
||||
// Memory layout: RGBRGBRGB...
|
||||
struct clip_image_f32 {
|
||||
int nx;
|
||||
int ny;
|
||||
float * data = NULL;
|
||||
size_t size;
|
||||
};
|
||||
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
|
||||
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||
|
||||
struct clip_image_u8_batch {
|
||||
struct clip_image_u8 * data;
|
||||
|
@ -71,21 +54,22 @@ struct clip_image_f32_batch {
|
|||
size_t size;
|
||||
};
|
||||
|
||||
struct clip_image_u8 * make_clip_image_u8();
|
||||
struct clip_image_f32 * make_clip_image_f32();
|
||||
CLIP_API void clip_image_u8_free(clip_image_u8 * img);
|
||||
CLIP_API void clip_image_f32_free(clip_image_f32 * img);
|
||||
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
|
||||
CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
||||
|
||||
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
|
||||
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
|
||||
|
||||
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
||||
|
||||
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
||||
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
||||
|
||||
bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
|
||||
bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square);
|
||||
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
||||
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
|
||||
|
||||
bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
|
||||
float * vec);
|
||||
|
||||
bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
|
||||
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
||||
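Taken together, the renamed `*_init`/`*_free` pairs give the C API a symmetric lifecycle; a rough usage sketch, assuming a valid `clip_ctx` has already been loaded (the helper function itself is illustrative, not part of the header):

```cpp
#include "clip.h"

#include <vector>

// Hypothetical caller: load an image file, preprocess it, and encode it with CLIP.
static bool encode_one_image(struct clip_ctx * ctx, const char * image_path,
                             int n_threads, std::vector<float> & embd) {
    struct clip_image_u8  * img = clip_image_u8_init();
    struct clip_image_f32 * res = clip_image_f32_init();

    // pad2square = true matches what llava.cpp passes in this diff
    bool ok = clip_image_load_from_file(image_path, img) &&
              clip_image_preprocess(ctx, img, res, /*pad2square =*/ true);

    if (ok) {
        embd.resize(clip_embd_nbytes(ctx) / sizeof(float));
        ok = clip_image_encode(ctx, n_threads, res, embd.data());
    }

    clip_image_u8_free(img);
    clip_image_f32_free(res);
    return ok;
}
```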
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -39,73 +39,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
|
|||
return true;
|
||||
}
|
||||
|
||||
// TODO: use common/sampling.h
|
||||
static llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
|
||||
auto & sparams = params.sparams;
|
||||
|
||||
// out of user input, sample next token
|
||||
const float temp = sparams.temp;
|
||||
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
|
||||
const float top_p = sparams.top_p;
|
||||
const float tfs_z = sparams.tfs_z;
|
||||
const float typical_p = sparams.typical_p;
|
||||
// const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
|
||||
// const float repeat_penalty = sparams.repeat_penalty;
|
||||
// const float alpha_presence = sparams.presence_penalty;
|
||||
// const float alpha_frequency = sparams.frequency_penalty;
|
||||
const int mirostat = sparams.mirostat;
|
||||
const float mirostat_tau = sparams.mirostat_tau;
|
||||
const float mirostat_eta = sparams.mirostat_eta;
|
||||
// const bool penalize_nl = sparams.penalize_nl;
|
||||
|
||||
llama_token id = 0;
|
||||
{
|
||||
auto logits = llama_get_logits(ctx_llama);
|
||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
id = llama_sample_token_greedy(ctx_llama, &candidates_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temp(ctx_llama, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temp(ctx_llama, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
|
||||
llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
|
||||
llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
|
||||
llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
|
||||
llama_sample_temp(ctx_llama, &candidates_p, temp);
|
||||
id = llama_sample_token(ctx_llama, &candidates_p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
static const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
|
||||
int id = sample_id(ctx_llama, params);
|
||||
static const char * sample(struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_llama,
|
||||
int * n_past) {
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
|
||||
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
|
||||
static std::string ret;
|
||||
if (id == llama_token_eos(llama_get_model(ctx_llama))) {
|
||||
ret = "</s>";
|
||||
|
@ -174,8 +112,8 @@ struct llava_context {
|
|||
};
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
fprintf(stderr, "\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
fprintf(stderr, " note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
|
||||
|
@ -185,7 +123,7 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
|
|||
auto prompt = params->prompt;
|
||||
if (prompt_contains_image(prompt)) {
|
||||
if (!params->image.empty()) {
|
||||
printf("using base64 encoded image instead of command line image path\n");
|
||||
fprintf(stderr, "using base64 encoded image instead of command line image path\n");
|
||||
}
|
||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
|
||||
if (!embed) {
|
||||
|
@ -217,16 +155,19 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||
|
||||
// generate the response
|
||||
|
||||
printf("\n");
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
|
||||
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
|
||||
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
|
||||
printf("%s", tmp);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
llama_sampling_free(ctx_sampling);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
#include "base64.hpp"
|
||||
|
||||
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
|
||||
clip_image_f32 * img_res = make_clip_image_f32();
|
||||
clip_image_f32 * img_res = clip_image_f32_init();
|
||||
if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
|
||||
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
|
||||
clip_image_f32_free(img_res);
|
||||
|
@ -86,7 +86,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
|||
}
|
||||
|
||||
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
|
||||
clip_image_u8 * img = make_clip_image_u8();
|
||||
clip_image_u8 * img = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
|
||||
clip_image_u8_free(img);
|
||||
fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
|
||||
|
|
|
@ -7,28 +7,13 @@ find_package(Llama 0.0.1 REQUIRED)
|
|||
# Bake common functionality in with target. Because applications
|
||||
# using the relocatable Llama package should be outside of the
|
||||
# source tree, main-cmake-pkg pretends the dependencies are built-in.
|
||||
|
||||
set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
|
||||
add_library(common OBJECT
|
||||
${_common_path}/common.h
|
||||
${_common_path}/common.cpp
|
||||
${_common_path}/console.h
|
||||
${_common_path}/console.cpp
|
||||
${_common_path}/grammar-parser.h
|
||||
${_common_path}/grammar-parser.cpp
|
||||
${_common_path}/sampling.h
|
||||
${_common_path}/sampling.cpp
|
||||
add_library(common OBJECT)
|
||||
file(GLOB _common_files
|
||||
"${_common_path}/*.h"
|
||||
"${_common_path}/*.cpp"
|
||||
)
|
||||
|
||||
# WARNING: because build-info.h is auto-generated, it will only
|
||||
# be available after the user has built the llama.cpp sources.
|
||||
#
|
||||
configure_file(${_common_path}/../build-info.h
|
||||
${CMAKE_CURRENT_BINARY_DIR}/build-info.h
|
||||
COPYONLY)
|
||||
|
||||
target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
|
||||
${CMAKE_CURRENT_BINARY_DIR})
|
||||
target_sources(common PRIVATE ${_common_files})
|
||||
|
||||
# If the common project was part of "main-cmake-pkg" the transient
|
||||
# defines would automatically be attached. Because the common func-
|
||||
|
|
|
@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
|
|||
target_compile_definitions(${TARGET} PRIVATE
|
||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||
)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
if (WIN32)
|
||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||
endif()
|
||||
|
|
|
@ -166,7 +166,7 @@ node index.js
|
|||
|
||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
||||
|
||||
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image id 12 in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
||||
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
||||
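For reference, a minimal sketch of a request body that uses this option, built with the same nlohmann::json library the server already vendors as `json.hpp`; the prompt, sampling values, and the `<BASE64_STRING>` placeholder are illustrative only:

```cpp
#include "json.hpp" // nlohmann::json, vendored by the server example

using json = nlohmann::json;

// Hypothetical helper: builds a /completion request that references one image.
// "<BASE64_STRING>" stands in for the real base64-encoded image bytes.
static json make_llava_completion_request() {
    return json{
        {"prompt",      "USER:[img-12]Describe the image in detail.\nASSISTANT:"},
        {"n_predict",   128},
        {"temperature", 0.1},
        {"image_data",  json::array({
            json{{"data", "<BASE64_STRING>"}, {"id", 12}}
        })},
    };
}
```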
|
||||
*Result JSON:*
|
||||
|
||||
|
@ -224,6 +224,8 @@ node index.js
|
|||
|
||||
`content`: Set the text to process.
|
||||
|
||||
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
||||
|
||||
- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
|
||||
|
||||
*Options:*
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <chrono>
|
||||
#include <condition_variable>
|
||||
|
||||
#ifndef SERVER_VERBOSE
|
||||
#define SERVER_VERBOSE 1
|
||||
|
@ -81,7 +82,7 @@ static inline bool is_base64(uint8_t c)
|
|||
return (isalnum(c) || (c == '+') || (c == '/'));
|
||||
}
|
||||
|
||||
static std::vector<uint8_t> base64_decode(std::string const &encoded_string)
|
||||
static std::vector<uint8_t> base64_decode(const std::string & encoded_string)
|
||||
{
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
|
@ -211,7 +212,7 @@ struct slot_image
|
|||
float * image_embedding = nullptr;
|
||||
int32_t image_tokens = 0;
|
||||
|
||||
clip_image_u8 img_data;
|
||||
clip_image_u8 * img_data;
|
||||
|
||||
std::string prefix_prompt; // before of this image
|
||||
};
|
||||
|
@ -436,12 +437,13 @@ struct llama_client_slot
|
|||
for (slot_image & img : images)
|
||||
{
|
||||
free(img.image_embedding);
|
||||
delete[] img.img_data.data;
|
||||
if (img.img_data) {
|
||||
clip_image_u8_free(img.img_data);
|
||||
}
|
||||
img.prefix_prompt = "";
|
||||
}
|
||||
|
||||
images.clear();
|
||||
// llama_set_rng_seed(ctx, params.seed); in batched the seed matter???????
|
||||
}
|
||||
|
||||
bool has_budget(gpt_params &global_params) {
|
||||
|
@ -542,7 +544,9 @@ struct llama_server_context
|
|||
std::vector<task_result> queue_results;
|
||||
std::vector<task_multi> queue_multitasks;
|
||||
std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
|
||||
std::condition_variable condition_tasks;
|
||||
std::mutex mutex_results;
|
||||
std::condition_variable condition_results;
|
||||
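The new mutex/condition-variable pairs replace the sleep-and-poll loops further down in this file; a self-contained sketch of the producer/consumer pattern they enable (the struct is a simplified stand-in for the server's `task_result`, and both helpers are illustrative):

```cpp
#include <condition_variable>
#include <mutex>
#include <string>
#include <vector>

// Simplified stand-in for the server's task_result struct.
struct task_result {
    int id;
    std::string result_json;
};

std::mutex               mutex_results;
std::condition_variable  condition_results;
std::vector<task_result> queue_results;

// Producer side: publish a result and wake every waiting consumer,
// mirroring the queue_results.push_back(...) + notify_all() calls in this diff.
void push_result(task_result res) {
    std::unique_lock<std::mutex> lock(mutex_results);
    queue_results.push_back(std::move(res));
    condition_results.notify_all();
}

// Consumer side: sleep until a result exists instead of polling every few microseconds.
task_result pop_result() {
    std::unique_lock<std::mutex> lock(mutex_results);
    condition_results.wait(lock, [&] { return !queue_results.empty(); });
    task_result res = queue_results.front();
    queue_results.erase(queue_results.begin());
    return res;
}
```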
|
||||
~llama_server_context()
|
||||
{
|
||||
|
@ -849,24 +853,17 @@ struct llama_server_context
|
|||
{
|
||||
for (const auto &img : *images_data)
|
||||
{
|
||||
std::string data_b64 = img["data"].get<std::string>();
|
||||
const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
|
||||
|
||||
slot_image img_sl;
|
||||
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
|
||||
int width, height, channels;
|
||||
std::vector<uint8_t> image_buffer = base64_decode(data_b64);
|
||||
data_b64.clear();
|
||||
auto data = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &width, &height, &channels, 3);
|
||||
if (!data) {
|
||||
img_sl.img_data = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
|
||||
{
|
||||
LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
|
||||
return false;
|
||||
}
|
||||
LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", slot->id, img_sl.id, width, height);
|
||||
img_sl.img_data.nx = width;
|
||||
img_sl.img_data.ny = height;
|
||||
img_sl.img_data.size = width * height * 3;
|
||||
img_sl.img_data.data = new uint8_t[width * height * 3]();
|
||||
memcpy(img_sl.img_data.data, data, width * height * 3);
|
||||
stbi_image_free(data);
|
||||
LOG_TEE("slot %i - loaded image\n", slot->id);
|
||||
img_sl.request_encode_image = true;
|
||||
slot->images.push_back(img_sl);
|
||||
}
|
||||
|
@ -921,6 +918,7 @@ struct llama_server_context
|
|||
llama_sampling_free(slot->ctx_sampling);
|
||||
}
|
||||
slot->ctx_sampling = llama_sampling_init(slot->sparams);
|
||||
llama_set_rng_seed(ctx, slot->params.seed);
|
||||
slot->command = LOAD_PROMPT;
|
||||
|
||||
all_slots_are_idle = false;
|
||||
|
@ -1140,8 +1138,8 @@ struct llama_server_context
|
|||
{
|
||||
continue;
|
||||
}
|
||||
clip_image_f32 img_res;
|
||||
if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, /*pad2square =*/ true))
|
||||
clip_image_f32 * img_res = clip_image_f32_init();
|
||||
if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
|
||||
{
|
||||
LOG_TEE("Error processing the given image");
|
||||
clip_free(clp_ctx);
|
||||
|
@ -1156,11 +1154,12 @@ struct llama_server_context
|
|||
return false;
|
||||
}
|
||||
LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id);
|
||||
if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, img.image_embedding))
|
||||
if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding))
|
||||
{
|
||||
LOG_TEE("Unable to encode image\n");
|
||||
return false;
|
||||
}
|
||||
clip_image_f32_free(img_res);
|
||||
img.request_encode_image = false;
|
||||
}
|
||||
|
||||
|
@ -1169,7 +1168,7 @@ struct llama_server_context
|
|||
|
||||
void send_error(task_server& task, std::string error)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = task.id;
|
||||
res.multitask_id = task.multitask_id;
|
||||
|
@ -1177,6 +1176,7 @@ struct llama_server_context
|
|||
res.error = true;
|
||||
res.result_json = { { "content", error } };
|
||||
queue_results.push_back(res);
|
||||
condition_results.notify_all();
|
||||
}
|
||||
|
||||
void add_multi_task(int id, std::vector<int>& sub_ids)
|
||||
|
@ -1186,6 +1186,7 @@ struct llama_server_context
|
|||
multi.id = id;
|
||||
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
||||
queue_multitasks.push_back(multi);
|
||||
condition_tasks.notify_one();
|
||||
}
|
||||
|
||||
void update_multi_task(int multitask_id, int subtask_id, task_result& result)
|
||||
|
@ -1197,6 +1198,7 @@ struct llama_server_context
|
|||
{
|
||||
multitask.subtasks_remaining.erase(subtask_id);
|
||||
multitask.results.push_back(result);
|
||||
condition_tasks.notify_one();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1215,7 +1217,7 @@ struct llama_server_context
|
|||
{"n_ctx", slot.n_ctx},
|
||||
{"model", params.model_alias},
|
||||
{"seed", slot.params.seed},
|
||||
{"temp", slot.sparams.temp},
|
||||
{"temperature", slot.sparams.temp},
|
||||
{"top_k", slot.sparams.top_k},
|
||||
{"top_p", slot.sparams.top_p},
|
||||
{"min_p", slot.sparams.min_p},
|
||||
|
@ -1244,7 +1246,7 @@ struct llama_server_context
|
|||
|
||||
void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
res.multitask_id = slot.multitask_id;
|
||||
|
@ -1280,11 +1282,12 @@ struct llama_server_context
|
|||
}
|
||||
|
||||
queue_results.push_back(res);
|
||||
condition_results.notify_all();
|
||||
}
|
||||
|
||||
void send_final_response(llama_client_slot &slot)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
res.multitask_id = slot.multitask_id;
|
||||
|
@ -1340,11 +1343,12 @@ struct llama_server_context
|
|||
}
|
||||
|
||||
queue_results.push_back(res);
|
||||
condition_results.notify_all();
|
||||
}
|
||||
|
||||
void send_embedding(llama_client_slot &slot)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
task_result res;
|
||||
res.id = slot.task_id;
|
||||
res.multitask_id = slot.multitask_id;
|
||||
|
@ -1372,6 +1376,7 @@ struct llama_server_context
|
|||
};
|
||||
}
|
||||
queue_results.push_back(res);
|
||||
condition_results.notify_all();
|
||||
}
|
||||
|
||||
int request_completion(json data, bool infill, bool embedding, int multitask_id)
|
||||
|
@ -1395,6 +1400,7 @@ struct llama_server_context
|
|||
|
||||
// otherwise, it's a single-prompt task, we actually queue it
|
||||
queue_tasks.push_back(task);
|
||||
condition_tasks.notify_one();
|
||||
return task.id;
|
||||
}
|
||||
|
||||
|
@ -1402,13 +1408,10 @@ struct llama_server_context
|
|||
{
|
||||
while (true)
|
||||
{
|
||||
std::this_thread::sleep_for(std::chrono::microseconds(5));
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
|
||||
if (queue_results.empty())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
condition_results.wait(lock, [&]{
|
||||
return !queue_results.empty();
|
||||
});
|
||||
|
||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
||||
{
|
||||
|
@ -1504,12 +1507,13 @@ struct llama_server_context
|
|||
|
||||
void request_cancel(int task_id)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
task_server task;
|
||||
task.id = id_gen++;
|
||||
task.type = CANCEL_TASK;
|
||||
task.target_id = task_id;
|
||||
queue_tasks.push_back(task);
|
||||
condition_tasks.notify_one();
|
||||
}
|
||||
|
||||
int split_multiprompt_task(task_server& multiprompt_task)
|
||||
|
@ -1535,7 +1539,7 @@ struct llama_server_context
|
|||
|
||||
void process_tasks()
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
while (!queue_tasks.empty())
|
||||
{
|
||||
task_server task = queue_tasks.front();
|
||||
|
@ -1607,6 +1611,7 @@ struct llama_server_context
|
|||
|
||||
std::lock_guard<std::mutex> lock(mutex_results);
|
||||
queue_results.push_back(aggregate_result);
|
||||
condition_results.notify_all();
|
||||
|
||||
queue_iterator = queue_multitasks.erase(queue_iterator);
|
||||
}
|
||||
|
@ -1637,8 +1642,10 @@ struct llama_server_context
|
|||
LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
|
||||
kv_cache_clear();
|
||||
}
|
||||
// avoid 100% usage of cpu all time
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(5));
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
condition_tasks.wait(lock, [&]{
|
||||
return !queue_tasks.empty();
|
||||
});
|
||||
}
|
||||
|
||||
for (llama_client_slot &slot : slots)
|
||||
|
@ -2437,26 +2444,33 @@ json oaicompat_completion_params_parse(
|
|||
llama_params["__oaicompat"] = true;
|
||||
|
||||
// Map OpenAI parameters to llama.cpp parameters
|
||||
//
|
||||
// For parameters that are defined by the OpenAI documentation (e.g.
|
||||
// temperature), we explicitly specify OpenAI's intended default; we
|
||||
// need to do that because sometimes OpenAI disagrees with llama.cpp
|
||||
//
|
||||
// https://platform.openai.com/docs/api-reference/chat/create
|
||||
llama_sampling_params default_sparams;
|
||||
llama_params["model"] = json_value(body, "model", std::string("uknown"));
|
||||
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
|
||||
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
||||
llama_params["temperature"] = json_value(body, "temperature", 0.8);
|
||||
llama_params["top_k"] = json_value(body, "top_k", 40);
|
||||
llama_params["top_p"] = json_value(body, "top_p", 0.95);
|
||||
llama_params["temperature"] = json_value(body, "temperature", 0.0);
|
||||
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
|
||||
llama_params["top_p"] = json_value(body, "top_p", 1.0);
|
||||
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
|
||||
llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
|
||||
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
|
||||
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
|
||||
llama_params["seed"] = json_value(body, "seed", 0);
|
||||
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
|
||||
llama_params["stream"] = json_value(body, "stream", false);
|
||||
llama_params["mirostat"] = json_value(body, "mirostat", false);
|
||||
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", 0.0);
|
||||
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", 0.0);
|
||||
llama_params["penalize_nl"] = json_value(body, "penalize_nl", false);
|
||||
llama_params["typical_p"] = json_value(body, "typical_p", 0.0);
|
||||
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", 0);
|
||||
llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
|
||||
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
|
||||
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
|
||||
llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
|
||||
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
|
||||
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
|
||||
llama_params["tfs_z"] = json_value(body, "tfs_z", 0.0);
|
||||
llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
|
||||
|
||||
if (body.count("grammar") != 0) {
|
||||
llama_params["grammar"] = json_value(body, "grammar", json::object());
|
||||
|
@ -3070,7 +3084,17 @@ int main(int argc, char **argv)
|
|||
{
|
||||
prompt = "";
|
||||
}
|
||||
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
|
||||
|
||||
json image_data;
|
||||
if (body.count("image_data") != 0) {
|
||||
image_data = body["image_data"];
|
||||
}
|
||||
else
|
||||
{
|
||||
image_data = "";
|
||||
}
|
||||
|
||||
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0}, {"image_data", image_data} }, false, true, -1);
|
||||
task_result result = llama.next_result(task_id);
|
||||
return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
|
||||
});
|
||||
|
|
55
flake.lock
generated
|
@ -1,30 +1,30 @@
|
|||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"flake-parts": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1694529238,
|
||||
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
|
||||
"lastModified": 1701473968,
|
||||
"narHash": "sha256-YcVE5emp1qQ8ieHUnxt1wCZCC3ZfAS+SRRWZ2TMda7E=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1698318101,
|
||||
"narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=",
|
||||
"lastModified": 1703637592,
|
||||
"narHash": "sha256-8MXjxU0RfFfzl57Zy3OfXCITS0qWDNLzlBAdwxGZwfY=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "63678e9f3d3afecfeafa0acead6239cdb447574c",
|
||||
"rev": "cfc3698c31b1fb9cdcf10f36c9643460264d0ca8",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
@ -34,26 +34,29 @@
|
|||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"dir": "lib",
|
||||
"lastModified": 1701253981,
|
||||
"narHash": "sha256-ztaDIyZ7HrTAfEEUt9AtTDNoCYxUdSd6NrRHaYOIxtk=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "e92039b55bcd58469325ded85d4f58dd5a4eaf58",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"dir": "lib",
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-parts": "flake-parts",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
|
|
263
flake.nix
|
@ -1,139 +1,144 @@
|
|||
{
|
||||
description = "Port of Facebook's LLaMA model in C/C++";
|
||||
|
||||
inputs = {
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
|
||||
flake-utils.url = "github:numtide/flake-utils";
|
||||
flake-parts.url = "github:hercules-ci/flake-parts";
|
||||
};
|
||||
outputs = { self, nixpkgs, flake-utils }:
|
||||
flake-utils.lib.eachDefaultSystem (system:
|
||||
|
||||
# Optional binary cache
|
||||
nixConfig = {
|
||||
extra-substituters = [
|
||||
# Populated by the CI in ggerganov/llama.cpp
|
||||
"https://llama-cpp.cachix.org"
|
||||
|
||||
# A development cache for nixpkgs imported with `config.cudaSupport = true`.
|
||||
# Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
|
||||
# This lets one skip building e.g. the CUDA-enabled openmpi.
|
||||
# TODO: Replace once nix-community obtains an official one.
|
||||
"https://cuda-maintainers.cachix.org"
|
||||
];
|
||||
|
||||
# Verify these are the same keys as published on
|
||||
# - https://app.cachix.org/cache/llama-cpp
|
||||
# - https://app.cachix.org/cache/cuda-maintainers
|
||||
extra-trusted-public-keys = [
|
||||
"llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
|
||||
"cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
|
||||
];
|
||||
};
|
||||
|
||||
|
||||
# For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
|
||||
#
|
||||
# ```bash
|
||||
# ❯ nix repl
|
||||
# nix-repl> :lf github:ggerganov/llama.cpp
|
||||
# Added 13 variables.
|
||||
# nix-repl> outputs.apps.x86_64-linux.quantize
|
||||
# { program = "/nix/store/00000000000000000000000000000000-llama.cpp/bin/quantize"; type = "app"; }
|
||||
# ```
|
||||
outputs =
|
||||
{ self, flake-parts, ... }@inputs:
|
||||
let
|
||||
name = "llama.cpp";
|
||||
src = ./.;
|
||||
meta.mainProgram = "llama";
|
||||
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
|
||||
buildInputs = with pkgs; [ openmpi ];
|
||||
osSpecific = with pkgs; buildInputs ++ (
|
||||
if isAarch64 && isDarwin then
|
||||
with pkgs.darwin.apple_sdk_11_0.frameworks; [
|
||||
Accelerate
|
||||
MetalKit
|
||||
]
|
||||
else if isAarch32 && isDarwin then
|
||||
with pkgs.darwin.apple_sdk.frameworks; [
|
||||
Accelerate
|
||||
CoreGraphics
|
||||
CoreVideo
|
||||
]
|
||||
else if isDarwin then
|
||||
with pkgs.darwin.apple_sdk.frameworks; [
|
||||
Accelerate
|
||||
CoreGraphics
|
||||
CoreVideo
|
||||
]
|
||||
else
|
||||
with pkgs; [ openblas ]
|
||||
);
|
||||
pkgs = import nixpkgs { inherit system; };
|
||||
nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ];
|
||||
cudatoolkit_joined = with pkgs; symlinkJoin {
|
||||
# HACK(Green-Sky): nix currently has issues with cmake findcudatoolkit
|
||||
# see https://github.com/NixOS/nixpkgs/issues/224291
|
||||
# copied from jaxlib
|
||||
name = "${cudaPackages.cudatoolkit.name}-merged";
|
||||
paths = [
|
||||
cudaPackages.cudatoolkit.lib
|
||||
cudaPackages.cudatoolkit.out
|
||||
] ++ lib.optionals (lib.versionOlder cudaPackages.cudatoolkit.version "11") [
|
||||
# for some reason some of the required libs are in the targets/x86_64-linux
|
||||
# directory; not sure why but this works around it
|
||||
"${cudaPackages.cudatoolkit}/targets/${system}"
|
||||
];
|
||||
};
|
||||
llama-python =
|
||||
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
|
||||
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
|
||||
llama-python-extra =
|
||||
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]);
|
||||
postPatch = ''
|
||||
substituteInPlace ./ggml-metal.m \
|
||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||
substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
|
||||
'';
|
||||
postInstall = ''
|
||||
mv $out/bin/main $out/bin/llama
|
||||
mv $out/bin/server $out/bin/llama-server
|
||||
mkdir -p $out/include
|
||||
cp ${src}/llama.h $out/include/
|
||||
'';
|
||||
cmakeFlags = [ "-DLLAMA_NATIVE=OFF" "-DLLAMA_BUILD_SERVER=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
|
||||
# We could include the git revisions in the package names but those would
|
||||
# needlessly trigger rebuilds:
|
||||
# llamaVersion = self.dirtyShortRev or self.shortRev;
|
||||
|
||||
# Nix already uses cryptographic hashes for versioning, so we'll just fix
|
||||
# the fake semver for now:
|
||||
llamaVersion = "0.0.0";
|
||||
in
|
||||
flake-parts.lib.mkFlake { inherit inputs; }
|
||||
|
||||
{
|
||||
packages.default = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = osSpecific;
|
||||
cmakeFlags = cmakeFlags
|
||||
++ (if isAarch64 && isDarwin then [
|
||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||
"-DLLAMA_METAL=ON"
|
||||
] else [
|
||||
"-DLLAMA_BLAS=ON"
|
||||
"-DLLAMA_BLAS_VENDOR=OpenBLAS"
|
||||
]);
|
||||
};
|
||||
packages.opencl = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = with pkgs; buildInputs ++ [ clblast ];
|
||||
cmakeFlags = cmakeFlags ++ [
|
||||
"-DLLAMA_CLBLAST=ON"
|
||||
|
||||
imports = [
|
||||
.devops/nix/nixpkgs-instances.nix
|
||||
.devops/nix/apps.nix
|
||||
.devops/nix/devshells.nix
|
||||
.devops/nix/jetson-support.nix
|
||||
];
|
||||
};
|
||||
packages.cuda = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = with pkgs; buildInputs ++ [ cudatoolkit_joined ];
|
||||
cmakeFlags = cmakeFlags ++ [
|
||||
"-DLLAMA_CUBLAS=ON"
|
||||
];
|
||||
};
|
||||
packages.rocm = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ];
|
||||
cmakeFlags = cmakeFlags ++ [
|
||||
"-DLLAMA_HIPBLAS=1"
|
||||
"-DCMAKE_C_COMPILER=hipcc"
|
||||
"-DCMAKE_CXX_COMPILER=hipcc"
|
||||
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
|
||||
# in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
|
||||
# and select the line that matches the current nixpkgs version of rocBLAS.
|
||||
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
|
||||
];
|
||||
};
|
||||
apps.llama-server = {
|
||||
type = "app";
|
||||
program = "${self.packages.${system}.default}/bin/llama-server";
|
||||
};
|
||||
apps.llama-embedding = {
|
||||
type = "app";
|
||||
program = "${self.packages.${system}.default}/bin/embedding";
|
||||
};
|
||||
apps.llama = {
|
||||
type = "app";
|
||||
program = "${self.packages.${system}.default}/bin/llama";
|
||||
};
|
||||
apps.quantize = {
|
||||
type = "app";
|
||||
program = "${self.packages.${system}.default}/bin/quantize";
|
||||
};
|
||||
apps.train-text-from-scratch = {
|
||||
type = "app";
|
||||
program = "${self.packages.${system}.default}/bin/train-text-from-scratch";
|
||||
};
|
||||
apps.default = self.apps.${system}.llama;
|
||||
devShells.default = pkgs.mkShell {
|
||||
buildInputs = [ llama-python ];
|
||||
packages = nativeBuildInputs ++ osSpecific;
|
||||
};
|
||||
devShells.extra = pkgs.mkShell {
|
||||
buildInputs = [ llama-python-extra ];
|
||||
packages = nativeBuildInputs ++ osSpecific;
|
||||
};
|
||||
|
||||
# An overlay can be used to have a more granular control over llama-cpp's
|
||||
# dependencies and configuration, than that offered by the `.override`
|
||||
# mechanism. Cf. https://nixos.org/manual/nixpkgs/stable/#chap-overlays.
|
||||
#
|
||||
# E.g. in a flake:
|
||||
# ```
|
||||
# { nixpkgs, llama-cpp, ... }:
|
||||
# let pkgs = import nixpkgs {
|
||||
# overlays = [ (llama-cpp.overlays.default) ];
|
||||
# system = "aarch64-linux";
|
||||
# config.allowUnfree = true;
|
||||
# config.cudaSupport = true;
|
||||
# config.cudaCapabilities = [ "7.2" ];
|
||||
# config.cudaEnableForwardCompat = false;
|
||||
# }; in {
|
||||
# packages.aarch64-linux.llamaJetsonXavier = pkgs.llamaPackages.llama-cpp;
|
||||
# }
|
||||
# ```
|
||||
#
|
||||
# Cf. https://nixos.org/manual/nix/unstable/command-ref/new-cli/nix3-flake.html?highlight=flake#flake-format
|
||||
flake.overlays.default =
|
||||
(final: prev: {
|
||||
llamaPackages = final.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||
inherit (final.llamaPackages) llama-cpp;
|
||||
});
|
||||
|
||||
systems = [
|
||||
"aarch64-darwin"
|
||||
"aarch64-linux"
|
||||
"x86_64-darwin" # x86_64-darwin isn't tested (and likely isn't relevant)
|
||||
"x86_64-linux"
|
||||
];
|
||||
|
||||
      perSystem =
        {
          config,
          lib,
          system,
          pkgs,
          pkgsCuda,
          pkgsRocm,
          ...
        }:
        {
          # Unlike `.#packages`, legacyPackages may contain values of
          # arbitrary types (including nested attrsets) and may even throw
          # exceptions. This attribute isn't recursed into by `nix flake
          # show` either.
          #
          # You can add arbitrary scripts to `.devops/nix/scope.nix` and
          # access them as `nix build .#llamaPackages.${scriptName}` using
          # the same path you would with an overlay.
          legacyPackages = {
            llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
            llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
            llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
          };

          # We don't use the overlay here so as to avoid making too many instances of nixpkgs,
          # cf. https://zimbatm.com/notes/1000-instances-of-nixpkgs
          packages =
            {
              default = config.legacyPackages.llamaPackages.llama-cpp;
            }
            // lib.optionalAttrs pkgs.stdenv.isLinux {
              opencl = config.packages.default.override { useOpenCL = true; };
              cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;

              mpi-cpu = config.packages.default.override { useMpi = true; };
              mpi-cuda = config.packages.default.override { useMpi = true; };
            }
            // lib.optionalAttrs (system == "x86_64-linux") {
              rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;
            };

          # Packages exposed in `.#checks` will be built by the CI and by
          # `nix flake check`. Currently we expose all packages, but we could
          # make more granular choices
          checks = config.packages;
        };
    };
}
@@ -614,10 +614,14 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
}

static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    switch (op->op) {
        case GGML_OP_MUL_MAT:
            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
        default:
            return true;
    }

    GGML_UNUSED(backend);
    GGML_UNUSED(op);
}

static struct ggml_backend_i cpu_backend_i = {
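For context, a minimal sketch of how a caller could use the public ggml_backend_supports_op() entry point (which forwards to per-backend callbacks such as the one above) to pick a backend before scheduling. The helper name and the fallback policy are illustrative assumptions, not part of this change:

#include "ggml-backend.h"

// Hypothetical helper: prefer the GPU backend, but fall back to CPU when the
// backend reports the op as unsupported (e.g. MUL_MAT with an incompatible src1 type).
static ggml_backend_t pick_backend_for_op(ggml_backend_t gpu, ggml_backend_t cpu,
                                          const struct ggml_tensor * op) {
    if (gpu != NULL && ggml_backend_supports_op(gpu, op)) {
        return gpu;
    }
    return cpu;
}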
114 ggml-cuda.cu

@@ -119,7 +119,9 @@
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA 700
#define CC_OFFSET_AMD 1000000
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)

#define GGML_CUDA_MAX_NODES 8192

@@ -133,7 +135,6 @@
// TODO: improve this to be correct for more hardware
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
// probably other such cases, and not sure what happens on AMD hardware
#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES
#endif
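The TODO above notes that CUDA_USE_TENSOR_CORES is a compile-time guess that misclassifies parts like the GTX 1660 (Turing without tensor cores). A purely illustrative runtime variant based on the capability constants above would share the same blind spot, which is why the define is the accepted trade-off; the function name below is an assumption, not code from this commit:

// Sketch only: guess tensor-core availability from the compute capability.
// CC 7.5 covers both RTX 20xx (tensor cores) and GTX 16xx (none), so this
// heuristic cannot distinguish them either.
static bool guess_has_tensor_cores(int cc) {
    if (cc >= CC_OFFSET_AMD) {
        return false; // AMD devices are handled via the RDNA constants instead
    }
    return cc >= CC_VOLTA;
}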
@@ -6662,7 +6663,7 @@ static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
// pool with virtual memory
static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
    scoped_spin_lock lock(g_cuda_pool_lock);

@@ -7485,6 +7486,8 @@ static void ggml_cuda_op_dequantize_mul_mat_vec(
    const int64_t ne00 = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
#ifdef GGML_CUDA_F16
    cuda_pool_alloc<half> src1_dfloat_a;
@@ -7577,6 +7580,7 @@ static void ggml_cuda_op_mul_mat_cublas(
    const int compute_capability = g_device_caps[id].cc;

    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
        //printf("this branch\n");
        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
        cuda_pool_alloc<half> src0_as_f16;
        if (src0->type != GGML_TYPE_F16) {

@@ -7614,9 +7618,9 @@ static void ggml_cuda_op_mul_mat_cublas(

            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
            to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
        }
        else {
    } else {
        cuda_pool_alloc<float> src0_ddq_as_f32;
        cuda_pool_alloc<float> src1_ddq_as_f32;

        if (src0->type != GGML_TYPE_F32) {
            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);

@@ -7624,7 +7628,15 @@ static void ggml_cuda_op_mul_mat_cublas(
            src0_ddq_as_f32.alloc(row_diff*ne00);
            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
        }
        if (src1->type != GGML_TYPE_F32) {
            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
            GGML_ASSERT(to_fp32_cuda != nullptr);
            src1_ddq_as_f32.alloc(src1_ncols*ne10);
            to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
        }

        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
        const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();

        const float alpha = 1.0f;
        const float beta = 0.0f;

@@ -7634,7 +7646,7 @@ static void ggml_cuda_op_mul_mat_cublas(
        cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                row_diff, src1_ncols, ne10,
                &alpha, src0_ddf_i, ne00,
                        src1_ddf_i,  ne10,
                        src1_ddf1_i, ne10,
                &beta,  dst_dd_i,   ldc));
    }
@ -8035,6 +8047,7 @@ static void ggml_cuda_op_mul_mat(
|
|||
|
||||
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
|
||||
|
||||
GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
|
||||
|
||||
|
@ -8482,7 +8495,7 @@ static __global__ void k_compute_batched_ptrs(
|
|||
int64_t i02 = i12 / r2;
|
||||
|
||||
ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
|
||||
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
|
||||
ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
|
||||
ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
|
||||
}
|
||||
|
||||
|
@ -8492,28 +8505,10 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
|
||||
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const int64_t nb01 = src0->nb[1];
|
||||
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
|
||||
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t ne13 = src1->ne[3];
|
||||
|
||||
const int64_t nb11 = src1->nb[1];
|
||||
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
|
||||
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
|
||||
|
||||
const int64_t ne1 = ggml_nelements(src1);
|
||||
const int64_t ne = ggml_nelements(dst);
|
||||
const int64_t ne_dst = ggml_nelements(dst);
|
||||
|
||||
ggml_cuda_set_device(g_main_device);
|
||||
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
|
@ -8522,7 +8517,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
||||
void * src0_ddq = src0_extra->data_device[g_main_device];
|
||||
half * src0_as_f16 = (half *) src0_ddq;
|
||||
half * src0_f16 = (half *) src0_ddq;
|
||||
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
||||
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
|
||||
|
@ -8531,11 +8526,15 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
||||
|
||||
// convert src1 to fp16
|
||||
cuda_pool_alloc<half> src1_f16_alloc;
|
||||
if (src1->type != GGML_TYPE_F16) {
|
||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
||||
const int64_t ne_src1 = ggml_nelements(src1);
|
||||
src1_f16_alloc.alloc(ne_src1);
|
||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||
|
||||
cuda_pool_alloc<half> src1_as_f16(ne1);
|
||||
to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);
|
||||
to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
|
||||
}
|
||||
half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
|
||||
|
||||
cuda_pool_alloc<half> dst_f16;
|
||||
char * dst_t;
|
||||
|
@ -8557,7 +8556,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
const void * beta = &beta_f16;
|
||||
|
||||
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
dst_t = (char *) dst_f16.alloc(ne);
|
||||
dst_t = (char *) dst_f16.alloc(ne_dst);
|
||||
|
||||
nbd2 /= sizeof(float) / sizeof(half);
|
||||
nbd3 /= sizeof(float) / sizeof(half);
|
||||
|
@ -8604,9 +8603,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
CUBLAS_CHECK(
|
||||
cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
|
||||
(const char *) src1_as_f16.get(), CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
|
||||
beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
|
||||
alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
|
||||
(const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
|
||||
beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
|
||||
ne12*ne13,
|
||||
cu_compute_type,
|
||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||
|
@ -8619,12 +8618,13 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
|
||||
dim3 block_dims(ne13, ne12);
|
||||
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
||||
src0_as_f16, src1_as_f16.get(), dst_t,
|
||||
src0_f16, src1_f16, dst_t,
|
||||
ptrs_src.get(), ptrs_dst.get(),
|
||||
ne12, ne13,
|
||||
ne23,
|
||||
nb02, nb03,
|
||||
nb12, nb13,
|
||||
src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
|
||||
src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
|
||||
nbd2, nbd3,
|
||||
r2, r3);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
@ -8632,8 +8632,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
CUBLAS_CHECK(
|
||||
cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
|
||||
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
|
||||
alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
|
||||
(const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
|
||||
beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
|
||||
ne23,
|
||||
cu_compute_type,
|
||||
|
@ -8643,7 +8643,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||
|
||||
if (dst->op_params[0] == GGML_PREC_DEFAULT) {
|
||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||
to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
|
||||
to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -8662,11 +8662,25 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
        }
    }

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
    bool use_mul_mat_q = ggml_is_quantized(src0->type);
#ifdef CUDA_USE_TENSOR_CORES
    const bool use_tensor_cores = true;
    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
#endif // CUDA_USE_TENSOR_CORES

#else
    const bool use_tensor_cores = false;
#endif

    const bool fp16_performance_good = min_compute_capability >= CC_VOLTA;
    bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
#ifdef CUDA_USE_TENSOR_CORES
    // when tensor cores are available, use them for large batch size
    // ref: https://github.com/ggerganov/llama.cpp/pull/3776
    use_mul_mat_q = use_mul_mat_q && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
#endif // CUDA_USE_TENSOR_CORES

#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
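Condensed into one place, the NVIDIA (non-HIP) branch above chooses between the quantized mul_mat_q kernels and cuBLAS roughly as sketched below. This is a paraphrase for illustration, reusing MIN_CC_DP4A, CC_VOLTA and MMQ_MAX_BATCH_SIZE from the surrounding file, not a drop-in replacement:

// mul_mat_q requires the __dp4a intrinsic (CC >= 6.1) and a quantized src0;
// with tensor cores enabled, large batches on Volta+ go to cuBLAS instead (see PR #3776).
static bool should_use_mul_mat_q(int min_cc, const ggml_tensor * src0, const ggml_tensor * src1) {
    bool use_mmq = min_cc >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
#ifdef CUDA_USE_TENSOR_CORES
    const bool fp16_performance_good = min_cc >= CC_VOLTA;
    use_mmq = use_mmq && !(fp16_performance_good && src1->ne[1] > MMQ_MAX_BATCH_SIZE);
#endif
    return use_mmq;
}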
    // debug helpers
    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);

@@ -8676,19 +8690,19 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
    if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        // KQ single-batch
        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
    } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
        // KQV single-batch
        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
    } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
        // KQ + KQV multi-batch
        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
    } else if (src0->type == GGML_TYPE_F32) {
        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
#ifdef GGML_CUDA_FORCE_DMMV
            const bool use_mul_mat_vec_q = false;
#else

@@ -8702,14 +8716,6 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
        }
    } else {
        bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);

        // when tensor cores are available, use them for large batch size
        // ref: https://github.com/ggerganov/llama.cpp/pull/3776
        if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
            use_mul_mat_q = false;
        }

        if (use_mul_mat_q) {
            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
        } else {
|
@ -6,19 +6,19 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_cl_init(void);
|
||||
GGML_API void ggml_cl_init(void);
|
||||
|
||||
void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
|
||||
void * ggml_cl_host_malloc(size_t size);
|
||||
void ggml_cl_host_free(void * ptr);
|
||||
GGML_API void * ggml_cl_host_malloc(size_t size);
|
||||
GGML_API void ggml_cl_host_free(void * ptr);
|
||||
|
||||
void ggml_cl_free_data(const struct ggml_tensor* tensor);
|
||||
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
|
||||
|
||||
void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
118 ggml-quants.c

@@ -410,13 +410,17 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {

#if !defined(__ARM_FEATURE_DOTPROD)

inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));

    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
}

#else

#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)

#endif

#endif
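A small usage sketch of the wrapper introduced above: the same call compiles whether the target has the ARMv8.2 dot-product extension (where ggml_vdotq_s32 maps to the hardware vdotq_s32) or not (where it falls back to the vmull_s8/vpaddlq_s16 emulation). The function below is illustrative and assumes an AArch64 build with NEON:

#include <arm_neon.h>

// Dot product of 16 signed bytes, accumulated in 4 int32 lanes and then reduced.
static int32_t dot_i8x16(const int8_t * a, const int8_t * b) {
    const int8x16_t va  = vld1q_s8(a);
    const int8x16_t vb  = vld1q_s8(b);
    const int32x4_t acc = ggml_vdotq_s32(vdupq_n_s32(0), va, vb);
    return vaddvq_s32(acc); // horizontal add of the four partial sums (AArch64)
}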
|
@ -2481,8 +2485,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
|
|||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
// dot product into int32x4_t
|
||||
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
||||
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
||||
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
||||
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
|
@ -2769,8 +2773,8 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
|
|||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
// dot product into int32x4_t
|
||||
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
||||
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
||||
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
||||
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
||||
|
@ -2936,11 +2940,11 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
|
|||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
|
@ -3228,11 +3232,11 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
|
|||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
|
||||
|
@ -3483,12 +3487,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
|
|||
const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
||||
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
||||
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
||||
ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
|
@ -3598,8 +3602,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
// We use this macro instead of a function call because for some reason
|
||||
// the code runs 2-3% slower, even if the function is declared inline
|
||||
#define MULTIPLY_ACCUM_WITH_SCALE(index)\
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
|
||||
|
||||
#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
|
||||
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
|
||||
|
@ -3973,10 +3977,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
|
||||
q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
|
||||
|
||||
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
|
||||
isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
|
||||
isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
|
||||
isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
|
||||
isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
|
||||
sum += d * (isum1 + isum2);
|
||||
}
|
||||
|
@ -4256,10 +4260,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
|
||||
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
|
@ -4273,10 +4277,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
|
||||
q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
|
@ -4757,10 +4761,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
|
||||
q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
|
||||
|
||||
sum += d * isum;
|
||||
|
||||
|
@ -5109,14 +5113,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
||||
|
||||
const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
sumi1 += vaddvq_s32(p1) * scales[2*j+0];
|
||||
|
||||
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
|
||||
|
||||
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
|
||||
sumi2 += vaddvq_s32(p2) * scales[2*j+1];
|
||||
}
|
||||
|
@ -5449,13 +5453,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
|
||||
|
||||
const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
|
||||
const int32_t sumi1 = vaddvq_s32(p1) * scales[0];
|
||||
|
||||
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
|
||||
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
|
||||
|
||||
const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
|
||||
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
|
||||
const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
|
||||
|
||||
sumf += d * (sumi1 + sumi2);
|
||||
|
@ -5722,8 +5726,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
|
||||
q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
|
||||
|
||||
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
|
||||
sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
|
||||
sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
|
||||
sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
|
||||
}
|
||||
|
||||
sumf += d * sumi - dmin * sumi_mins;
|
||||
|
@ -6112,10 +6116,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
|
||||
q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
|
||||
|
||||
int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
|
||||
int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
|
||||
int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
|
||||
int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
|
||||
int32_t sumi1 = sc[0] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
|
||||
int32_t sumi2 = sc[1] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
|
||||
int32_t sumi3 = sc[2] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
|
||||
int32_t sumi4 = sc[3] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
|
||||
|
||||
sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
|
||||
}
|
||||
|
@ -6399,10 +6403,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
|
||||
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
|
||||
scale += 4;
|
||||
|
||||
|
@ -6426,10 +6430,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
|
||||
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
scale += 4;
|
||||
}
|
||||
//sum += isum * d_all * y[i].d;
|
||||
|
@ -6816,10 +6820,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|||
q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
|
||||
q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
|
||||
|
||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||
|
||||
sum += isum * d_all * y[i].d;
|
||||
|
||||
|
|
10 ggml.c

@@ -9687,7 +9687,7 @@ static void ggml_compute_forward_mul_mat(
    const size_t row_size = ggml_row_size(vec_dot_type, ne10);

    assert(params->wsize >= ne11*ne12*ne13*row_size);
    assert(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    for (int64_t i13 = 0; i13 < ne13; ++i13) {
        for (int64_t i12 = 0; i12 < ne12; ++i12) {

@@ -19638,6 +19638,14 @@ int ggml_cpu_has_avx(void) {
#endif
}

int ggml_cpu_has_avx_vnni(void) {
#if defined(__AVXVNNI__)
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_avx2(void) {
#if defined(__AVX2__)
    return 1;

1 ggml.h

@@ -2198,6 +2198,7 @@ extern "C" {
    //

    GGML_API int ggml_cpu_has_avx        (void);
    GGML_API int ggml_cpu_has_avx_vnni   (void);
    GGML_API int ggml_cpu_has_avx2       (void);
    GGML_API int ggml_cpu_has_avx512     (void);
    GGML_API int ggml_cpu_has_avx512_vbmi(void);
|
||||
|
|
|
@ -370,7 +370,16 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.GPT2: [
|
||||
# TODO
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.POS_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.PHI2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
|
|
|
@ -17,6 +17,7 @@ class TensorNameMap:
|
|||
"tok_embeddings", # llama-pth
|
||||
"embeddings.word_embeddings", # bert
|
||||
"language_model.embedding.word_embeddings", # persimmon
|
||||
"wte", # gpt2
|
||||
"transformer.embd.wte", # phi2
|
||||
),
|
||||
|
||||
|
@ -34,6 +35,7 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.POS_EMBD: (
|
||||
"transformer.wpe", # gpt2
|
||||
"embeddings.position_embeddings", # bert
|
||||
"wpe", # gpt2
|
||||
),
|
||||
|
||||
# Output
|
||||
|
@ -53,7 +55,7 @@ class TensorNameMap:
|
|||
"norm", # llama-pth
|
||||
"embeddings.LayerNorm", # bert
|
||||
"transformer.norm_f", # mpt
|
||||
"ln_f", # refact bloom qwen
|
||||
"ln_f", # refact bloom qwen gpt2
|
||||
"language_model.encoder.final_layernorm", # persimmon
|
||||
"lm_head.ln", # phi2
|
||||
),
|
||||
|
@ -78,6 +80,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
||||
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln1", # yi
|
||||
"h.{bid}.ln_1", # gpt2
|
||||
"transformer.h.{bid}.ln", # phi2
|
||||
"model.layers.layers.{bid}.norm", # plamo
|
||||
),
|
||||
|
@ -95,6 +98,7 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
||||
"h.{bid}.self_attention.query_key_value", # bloom
|
||||
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
|
||||
"h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
||||
),
|
||||
|
||||
|
@ -137,6 +141,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.attention.output.dense", # bert
|
||||
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
|
||||
"h.{bid}.attn.c_proj", # gpt2
|
||||
"transformer.h.{bid}.mixer.out_proj", # phi2
|
||||
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
||||
),
|
||||
|
@ -159,6 +164,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln2", # yi
|
||||
"h.{bid}.ln_2", # gpt2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
|
@ -179,6 +185,7 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.mlp.fc_in", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
|
||||
"transformer.h.{bid}.mlp.w1", # qwen
|
||||
"h.{bid}.mlp.c_fc", # gpt2
|
||||
"transformer.h.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||
),
|
||||
|
@ -218,6 +225,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.output.dense", # bert
|
||||
"transformer.h.{bid}.mlp.fc_out", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
|
||||
"h.{bid}.mlp.c_proj", # gpt2
|
||||
"transformer.h.{bid}.mlp.fc2", # phi2
|
||||
"model.layers.layers.{bid}.mlp.down_proj", # plamo
|
||||
),
|
||||
|
|
183
llama.cpp
183
llama.cpp
|
@ -423,6 +423,15 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|||
LLM_ARCH_GPT2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -1256,6 +1265,10 @@ enum e_model {
|
|||
MODEL_40B,
|
||||
MODEL_65B,
|
||||
MODEL_70B,
|
||||
MODEL_SMALL,
|
||||
MODEL_MEDIUM,
|
||||
MODEL_LARGE,
|
||||
MODEL_XL,
|
||||
};
|
||||
|
||||
static const size_t kiB = 1024;
|
||||
|
@ -2563,6 +2576,10 @@ static const char * llama_model_type_name(e_model type) {
|
|||
case MODEL_40B: return "40B";
|
||||
case MODEL_65B: return "65B";
|
||||
case MODEL_70B: return "70B";
|
||||
case MODEL_SMALL: return "0.1B";
|
||||
case MODEL_MEDIUM: return "0.4B";
|
||||
case MODEL_LARGE: return "0.8B";
|
||||
case MODEL_XL: return "1.5B";
|
||||
default: return "?B";
|
||||
}
|
||||
}
|
||||
|
@ -2782,6 +2799,17 @@ static void llm_load_hparams(
|
|||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 12: model.type = e_model::MODEL_SMALL; break;
|
||||
case 24: model.type = e_model::MODEL_MEDIUM; break;
|
||||
case 36: model.type = e_model::MODEL_LARGE; break;
|
||||
case 48: model.type = e_model::MODEL_XL; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
|
||||
default: (void)0;
|
||||
}
|
||||
|
@ -3710,6 +3738,60 @@ static bool llm_load_tensors(
|
|||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
|
||||
|
||||
// output
|
||||
{
|
||||
ggml_backend_type backend_norm;
|
||||
ggml_backend_type backend_output;
|
||||
|
||||
if (n_gpu_layers > int(n_layer)) {
|
||||
backend_norm = llama_backend_offload;
|
||||
backend_output = llama_backend_offload_split;
|
||||
} else {
|
||||
backend_norm = GGML_BACKEND_CPU;
|
||||
backend_output = GGML_BACKEND_CPU;
|
||||
}
|
||||
|
||||
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
||||
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
||||
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
|
||||
|
||||
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
|
@ -5754,6 +5836,102 @@ struct llm_build_context {
|
|||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_gpt2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * pos;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
||||
cb(pos, "pos_embd", -1);
|
||||
|
||||
inpL = ggml_add(ctx0, inpL, pos);
|
||||
cb(inpL, "inpL", -1);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm,
|
||||
model.layers[il].attn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
// add the input
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// FF
|
||||
{
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm,
|
||||
model.layers[il].ffn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
inpL = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(inpL, "l_out", il);
|
||||
}
|
||||
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.output_norm,
|
||||
model.output_norm_b,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
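The self-attention block above slices a single fused QKV projection into Q, K and V using 2-D views at byte offsets 0, sizeof(float)*n_embd and sizeof(float)*(n_embd + n_embd_gqa). A standalone sketch of that view arithmetic follows; the function name and parameter list are illustrative assumptions, not part of this change:

// Split a fused [n_embd + 2*n_embd_gqa, n_tokens] QKV tensor into contiguous Q/K/V views.
static void split_qkv(struct ggml_context * ctx, struct ggml_tensor * qkv,
                      int64_t n_embd, int64_t n_embd_gqa, int64_t n_tokens,
                      struct ggml_tensor ** Q, struct ggml_tensor ** K, struct ggml_tensor ** V) {
    *Q = ggml_cont(ctx, ggml_view_2d(ctx, qkv, n_embd,     n_tokens, qkv->nb[1], 0));
    *K = ggml_cont(ctx, ggml_view_2d(ctx, qkv, n_embd_gqa, n_tokens, qkv->nb[1], sizeof(float)*n_embd));
    *V = ggml_cont(ctx, ggml_view_2d(ctx, qkv, n_embd_gqa, n_tokens, qkv->nb[1], sizeof(float)*(n_embd + n_embd_gqa)));
}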
|
||||
};
|
||||
|
||||
//
|
||||
|
@ -6269,6 +6447,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
{
|
||||
result = llm.build_plamo();
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
result = llm.build_gpt2();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
@ -10598,6 +10780,7 @@ const char * llama_print_system_info(void) {
|
|||
|
||||
s = "";
|
||||
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
||||
s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
|
||||
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
||||
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
||||
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
||||
|
|
BIN models/ggml-vocab-gpt2.gguf Normal file
Binary file not shown.
|
@ -1,3 +0,0 @@
|
|||
-r requirements.txt
|
||||
torch==2.1.1
|
||||
transformers==4.35.2
|
|
@ -1,5 +1,12 @@
|
|||
numpy==1.24.4
|
||||
sentencepiece==0.1.98
|
||||
transformers>=4.34.0
|
||||
gguf>=0.1.0
|
||||
protobuf>=4.21.0
|
||||
# These requirements include all dependencies for all top-level python scripts
|
||||
# for llama.cpp. Avoid adding packages here directly.
|
||||
#
|
||||
# Package versions must stay compatible across all top-level python scripts.
|
||||
#
|
||||
|
||||
-r ./requirements/requirements-convert.txt
|
||||
|
||||
-r ./requirements/requirements-convert-hf-to-gguf.txt
|
||||
-r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
|
||||
-r ./requirements/requirements-convert-lora-to-ggml.txt
|
||||
-r ./requirements/requirements-convert-persimmon-to-gguf.txt
|
||||
|
|
2
requirements/requirements-convert-hf-to-gguf.txt
Normal file
2
requirements/requirements-convert-hf-to-gguf.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
-r ./requirements-convert.txt
|
||||
torch~=2.1.1
|
1
requirements/requirements-convert-llama-ggml-to-gguf.txt
Normal file
1
requirements/requirements-convert-llama-ggml-to-gguf.txt
Normal file
|
@ -0,0 +1 @@
|
|||
-r ./requirements-convert.txt
|
2
requirements/requirements-convert-lora-to-ggml.txt
Normal file
2
requirements/requirements-convert-lora-to-ggml.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
-r ./requirements-convert.txt
|
||||
torch~=2.1.1
|
2
requirements/requirements-convert-persimmon-to-gguf.txt
Normal file
2
requirements/requirements-convert-persimmon-to-gguf.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
-r ./requirements-convert.txt
|
||||
torch~=2.1.1
|
5
requirements/requirements-convert.txt
Normal file
5
requirements/requirements-convert.txt
Normal file
|
@ -0,0 +1,5 @@
|
|||
numpy~=1.24.4
|
||||
sentencepiece~=0.1.98
|
||||
transformers>=4.35.2,<5.0.0
|
||||
gguf>=0.1.0
|
||||
protobuf>=4.21.0,<5.0.0
|
174
scripts/check-requirements.sh
Executable file
174
scripts/check-requirements.sh
Executable file
|
@ -0,0 +1,174 @@
|
|||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
#
|
||||
# check-requirements.sh checks all requirements files for each top-level
|
||||
# convert*.py script.
|
||||
#
|
||||
# WARNING: This is quite IO intensive, because a fresh venv is set up for every
|
||||
# python script. As of 2023-12-22, this writes ~2.7GB of data. An adequately
|
||||
# sized tmpfs /tmp or ramdisk is recommended if running this frequently.
|
||||
#
|
||||
# usage: check-requirements.sh [<working_dir>]
|
||||
# check-requirements.sh nocleanup [<working_dir>]
|
||||
#
|
||||
# where:
|
||||
# - <working_dir> is a directory that can be used as the base for
|
||||
# setting up the venvs. Defaults to `/tmp`.
|
||||
# - 'nocleanup' as the first argument will disable automatic cleanup
|
||||
# of the files created by this script.
|
||||
#
|
||||
# requires:
|
||||
# - bash >= 3.2.57
|
||||
# - shellcheck
|
||||
#
|
||||
# For each script, it creates a fresh venv, `pip install`s the requirements, and
|
||||
# finally imports the python script to check for `ImportError`.
|
||||
#
|
||||
|
||||
log() {
|
||||
local level=$1 msg=$2
|
||||
printf >&2 '%s: %s\n' "$level" "$msg"
|
||||
}
|
||||
|
||||
debug() {
|
||||
log DEBUG "$@"
|
||||
}
|
||||
|
||||
info() {
|
||||
log INFO "$@"
|
||||
}
|
||||
|
||||
fatal() {
|
||||
log FATAL "$@"
|
||||
exit 1
|
||||
}
|
||||
|
||||
cleanup() {
|
||||
if [[ -n ${workdir+x} && -d $workdir && -w $workdir ]]; then
|
||||
info "Removing $workdir"
|
||||
local count=0
|
||||
rm -rfv -- "$workdir" | while read -r; do
|
||||
if (( count++ > 750 )); then
|
||||
printf .
|
||||
count=0
|
||||
fi
|
||||
done
|
||||
printf '\n'
|
||||
info "Removed $workdir"
|
||||
fi
|
||||
}
|
||||
|
||||
do_cleanup=1
|
||||
if [[ ${1-} == nocleanup ]]; then
|
||||
do_cleanup=0; shift
|
||||
fi
|
||||
|
||||
if (( do_cleanup )); then
|
||||
trap exit INT TERM
|
||||
trap cleanup EXIT
|
||||
fi
|
||||
|
||||
this=$(realpath -- "$0"); readonly this
|
||||
cd "$(dirname "$this")/.." # PWD should stay in llama.cpp project directory
|
||||
|
||||
shellcheck "$this"
|
||||
|
||||
readonly reqs_dir=requirements
|
||||
|
||||
if [[ ${1+x} ]]; then
|
||||
tmp_dir=$(realpath -- "$1")
|
||||
if [[ ! ( -d $tmp_dir && -w $tmp_dir ) ]]; then
|
||||
fatal "$tmp_dir is not a writable directory"
|
||||
fi
|
||||
else
|
||||
tmp_dir=/tmp
|
||||
fi
|
||||
|
||||
workdir=$(mktemp -d "$tmp_dir/check-requirements.XXXX"); readonly workdir
|
||||
info "Working directory: $workdir"
|
||||
|
||||
check_requirements() {
|
||||
local reqs=$1
|
||||
|
||||
info "$reqs: beginning check"
|
||||
pip --disable-pip-version-check install -qr "$reqs"
|
||||
info "$reqs: OK"
|
||||
}
|
||||
|
||||
check_convert_script() {
|
||||
local py=$1 # e.g. ./convert-hf-to-gguf.py
|
||||
local pyname=${py##*/} # e.g. convert-hf-to-gguf.py
|
||||
pyname=${pyname%.py} # e.g. convert-hf-to-gguf
|
||||
|
||||
info "$py: beginning check"
|
||||
|
||||
local reqs="$reqs_dir/requirements-$pyname.txt"
|
||||
if [[ ! -r $reqs ]]; then
|
||||
fatal "$py missing requirements. Expected: $reqs"
|
||||
fi
|
||||
|
||||
local venv="$workdir/$pyname-venv"
|
||||
python3 -m venv "$venv"
|
||||
|
||||
(
|
||||
# shellcheck source=/dev/null
|
||||
source "$venv/bin/activate"
|
||||
|
||||
check_requirements "$reqs"
|
||||
|
||||
python - "$py" "$pyname" <<'EOF'
|
||||
import sys
|
||||
from importlib.machinery import SourceFileLoader
|
||||
py, pyname = sys.argv[1:]
|
||||
SourceFileLoader(pyname, py).load_module()
|
||||
EOF
|
||||
)
|
||||
|
||||
if (( do_cleanup )); then
|
||||
rm -rf -- "$venv"
|
||||
fi
|
||||
|
||||
info "$py: imports OK"
|
||||
}
|
||||
|
||||
readonly ignore_eq_eq='check_requirements: ignore "=="'
|
||||
|
||||
for req in "$reqs_dir"/*; do
|
||||
# Check that all sub-requirements are added to top-level requirements.txt
|
||||
if ! grep -qF "$req" requirements.txt; then
|
||||
fatal "$req needs to be added to requirements.txt"
|
||||
fi
|
||||
|
||||
# Make sure exact release versions aren't being pinned in the requirements
|
||||
# Filters out the ignore string
|
||||
if grep -vF "$ignore_eq_eq" "$req" | grep -q '=='; then
|
||||
tab=$'\t'
|
||||
cat >&2 <<EOF
|
||||
FATAL: Avoid pinning exact package versions. Use '~=' instead.
|
||||
You can suppress this error by appending the following to the line:
|
||||
$tab# $ignore_eq_eq
|
||||
EOF
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
all_venv="$workdir/all-venv"
|
||||
python3 -m venv "$all_venv"
|
||||
|
||||
(
|
||||
# shellcheck source=/dev/null
|
||||
source "$all_venv/bin/activate"
|
||||
check_requirements requirements.txt
|
||||
)
|
||||
|
||||
if (( do_cleanup )); then
|
||||
rm -rf -- "$all_venv"
|
||||
fi
|
||||
|
||||
check_convert_script convert.py
|
||||
for py in convert-*.py; do
|
||||
check_convert_script "$py"
|
||||
done
|
||||
|
||||
info 'Done! No issues found.'
|
|
@ -27,8 +27,20 @@ echo "Syncing ggml changes since commit $lc"
cd $SRC_GGML

git log --oneline $lc..HEAD
git log --oneline $lc..HEAD | grep -v "(llama/[0-9]*)" | cut -d' ' -f1 > $SRC_LLAMA/ggml-commits

git format-patch $lc --stdout -- \
if [ ! -s $SRC_LLAMA/ggml-commits ]; then
    rm -v $SRC_LLAMA/ggml-commits
    echo "No new commits"
    exit 0
fi

if [ -f $SRC_LLAMA/ggml-src.patch ]; then
    rm -v $SRC_LLAMA/ggml-src.patch
fi

while read c; do
    git format-patch -k $c~1..$c --stdout -- \
        include/ggml/ggml*.h \
        src/ggml*.h \
        src/ggml*.c \

@ -41,7 +53,10 @@ git format-patch $lc --stdout -- \
        tests/test-quantize-fns.cpp \
        tests/test-quantize-perf.cpp \
        tests/test-backend-ops.cpp \
        > $SRC_LLAMA/ggml-src.patch
        >> $SRC_LLAMA/ggml-src.patch
done < $SRC_LLAMA/ggml-commits

rm -v $SRC_LLAMA/ggml-commits

# delete files if empty
if [ ! -s $SRC_LLAMA/ggml-src.patch ]; then
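The hunks above replace the single git format-patch $lc invocation with a per-commit loop: commits whose subject already carries a (llama/NNNN) marker are filtered out, and each remaining commit is exported on its own and appended to ggml-src.patch. A self-contained sketch of that per-commit pattern, with a placeholder commit hash and output path:

    # emit a patch for exactly one commit $c, restricted to the listed paths,
    # and append it; -k keeps the original subject instead of adding a [PATCH] prefix
    c=0123abcd   # placeholder commit hash
    git format-patch -k "$c~1..$c" --stdout -- src/ggml*.c src/ggml*.h >> ggml-src.patch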
@ -1 +1 @@
76e7f47b69e8334384dc718480c496dafbd47999
df098ea908764cba4a4889a1cbe7b026b2d31a14
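This file appears to track the last ggml commit already synced into llama.cpp; the hunk simply bumps it to the new upstream head. The $lc variable used by the sync script above is presumably read from it, roughly like this (exact path is an assumption):

    # read the last synced ggml commit from the llama.cpp checkout
    lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last)
    echo "Syncing ggml changes since commit $lc"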
@ -2,7 +2,7 @@ function(llama_build_executable source)
    get_filename_component(TEST_TARGET ${source} NAME_WE)
    add_executable(${TEST_TARGET} ${source})
    install(TARGETS ${TEST_TARGET} RUNTIME)
    target_link_libraries(${TEST_TARGET} PRIVATE llama common)
    target_link_libraries(${TEST_TARGET} PRIVATE common)
endfunction()

function(llama_test_executable name source)

@ -14,7 +14,7 @@ function(llama_build_and_test_executable source)
    get_filename_component(TEST_TARGET ${source} NAME_WE)
    add_executable(${TEST_TARGET} ${source})
    install(TARGETS ${TEST_TARGET} RUNTIME)
    target_link_libraries(${TEST_TARGET} PRIVATE llama common)
    target_link_libraries(${TEST_TARGET} PRIVATE common)
    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()

@ -41,6 +41,7 @@ llama_test_executable (test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cp
llama_test_executable (test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
llama_test_executable (test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test_executable (test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
llama_test_executable (test-tokenizer-1-gpt2 test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
# llama_test_executable (test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG

llama_build_and_test_executable(test-grammar-parser.cpp)
@ -350,13 +350,18 @@ struct test_case {
        fflush(stdout);

        // check if backends support op
        bool supported = true;
        for (ggml_backend_t backend : {backend1, backend2}) {
            if (!ggml_backend_supports_op(backend, out)) {
                printf("not supported\n");
                printf("not supported [%s] ", ggml_backend_name(backend));
                supported = false;
            }
        }
        if (!supported) {
            printf("\n");
            ggml_free(ctx);
            return true;
        }

        // post-graph sentinel
        add_sentinel(ctx);

@ -1505,8 +1510,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }

    for (ggml_type type_a : all_types) {
        for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
            // FIXME: CPU crashes on f16xf16
        for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1}));
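To exercise the newly enabled F16 type_b mul_mat cases, one plausible route is to run the test through ctest, assuming test-backend-ops is registered via add_test by the CMake helpers shown earlier:

    # configure and build, then run only the backend-ops test with verbose failures
    cmake -B build && cmake --build build --config Release
    ctest --test-dir build -R test-backend-ops --output-on-failure

Note that ctest --test-dir requires CMake 3.20 or newer; with an older CMake, run ctest -R test-backend-ops from inside the build directory instead.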