Merge branch 'master' into sycl_readme_update

Author: OuadiElfarouki
Date:   2024-03-25 10:36:24 +00:00
Commit: d6dcd1738e
68 changed files with 4471 additions and 2635 deletions


@ -15,6 +15,10 @@ on:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
GGML_NLOOP: 3
@ -41,7 +45,7 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON ..
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@ -97,7 +101,7 @@ jobs:
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON ..
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@ -135,6 +139,9 @@ jobs:
ubuntu-focal-make:
runs-on: ubuntu-20.04
env:
LLAMA_NODE_AVAILABLE: true
LLAMA_PYTHON_AVAILABLE: true
steps:
- name: Clone
@ -147,6 +154,14 @@ jobs:
sudo apt-get update
sudo apt-get install build-essential gcc-8
- uses: actions/setup-node@v4
with:
node-version: "20"
- uses: actions/setup-python@v4
with:
python-version: "3.11"
- name: Build
id: make_build
env:
@ -210,6 +225,17 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
- name: Test llama2c conversion
id: llama2c_test
run: |
cd build
echo "Fetch tokenizer"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
# ubuntu-latest-cmake-sanitizer:
# runs-on: ubuntu-latest
#
@ -774,6 +800,7 @@ jobs:
windows-latest-cmake-sycl:
runs-on: windows-latest
defaults:
run:
shell: bash
@ -782,7 +809,6 @@ jobs:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
steps:
- name: Clone
id: checkout
@ -797,6 +823,32 @@ jobs:
id: cmake_build
run: examples/sycl/win-build-sycl.bat
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Pack artifacts
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip .\build\bin\*
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
with:
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
ios-xcode-build:
runs-on: macos-latest


@ -19,5 +19,5 @@ jobs:
close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
operations-per-run: 1000
operations-per-run: 10000
repo-token: ${{ secrets.GITHUB_TOKEN }}


@ -5,6 +5,10 @@ env:
GGML_NLOOP: 3
GGML_N_THREADS: 1
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
run:
runs-on: ubuntu-20.04


@ -15,6 +15,10 @@ on:
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
push_to_registry:
name: Push Docker image to Docker Hub


@ -14,6 +14,10 @@ on:
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
editorconfig:
runs-on: ubuntu-latest


@ -17,6 +17,10 @@ on:
types: [opened, synchronize, reopened]
paths: ['**/*.nix', 'flake.lock']
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
nix-build-aarch64:
runs-on: ubuntu-latest


@ -8,6 +8,10 @@ on:
pull_request:
types: [opened, synchronize, reopened]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
nix-eval:
strategy:


@ -16,6 +16,10 @@ on:
- 'requirements.txt'
- 'requirements/*.txt'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
python-check-requirements:
runs-on: ubuntu-latest


@ -2,6 +2,10 @@ name: flake8 Lint
on: [push, pull_request]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
flake8-lint:
runs-on: ubuntu-latest


@ -18,6 +18,10 @@ on:
schedule:
- cron: '0 0 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
server:
runs-on: ubuntu-latest
@ -31,7 +35,6 @@ jobs:
include:
- build_type: Release
sanitizer: ""
disabled_on_pr: true
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
container:


@ -6,6 +6,10 @@ on:
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build:
strategy:

.gitignore (5 changes)

@ -50,6 +50,7 @@ models-mnt
/embedding
/gguf
/gguf-llama-simple
/gguf-split
/gritlm
/imatrix
/infill
@ -58,6 +59,9 @@ models-mnt
/llava-cli
/lookahead
/lookup
/lookup-create
/lookup-merge
/lookup-stats
/main
/metal
/passkey
@ -73,6 +77,7 @@ models-mnt
/batched-bench
/export-lora
/finetune
/retrieval
/speculative
/parallel
/train-text-from-scratch


@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
@ -387,6 +388,9 @@ if (LLAMA_CUBLAS)
endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
if (LLAMA_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (LLAMA_STATIC)
if (WIN32)
@ -531,6 +535,10 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
if (LLAMA_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})


@ -1,8 +1,8 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
TEST_TARGETS = \
@ -452,9 +452,9 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
else
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
#ifdef LLAMA_CUDA_CUBLAS
# MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
#endif # LLAMA_CUDA_CUBLAS
ifdef LLAMA_CUDA_NO_PEER_COPY
MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
ifdef LLAMA_CUDA_CCBIN
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif
@ -535,6 +535,9 @@ endif # LLAMA_HIP_UMA
ifdef LLAMA_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
ifdef LLAMA_CUDA_NO_PEER_COPY
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
OBJS += ggml-cuda.o
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
@ -673,6 +676,9 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t
train.o: common/train.cpp common/train.h
$(CXX) $(CXXFLAGS) -c $< -o $@
ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
$(CXX) $(CXXFLAGS) -c $< -o $@
libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
@ -680,7 +686,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
clean:
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
find examples pocs -type f -name "*.o" -delete
#
@ -798,6 +804,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@ -810,9 +820,15 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)
passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)


@ -37,6 +37,7 @@ When targeting **Intel CPUs**, it is recommended to use llama.cpp for [x86_64]
## News
- 2024.3
- A blog post has been published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
- A new baseline is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
- Support for multiple cards via **--split-mode**: [none|layer]; [row] is not supported yet and is still under development.
- Support for assigning the main GPU with **--main-gpu**, replacing $GGML_SYCL_DEVICE.


@ -17,10 +17,12 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
- Support for loading sharded models using the `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187
----
@ -165,6 +167,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
- [RecurseChat](https://recurse.chat/) (proprietary)
- [semperai/amica](https://github.com/semperai/amica)
- [withcatai/catai](https://github.com/withcatai/catai)
- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)


@ -65,6 +65,8 @@ add_library(${TARGET} STATIC
json.hpp
train.h
train.cpp
ngram-cache.h
ngram-cache.cpp
)
if (BUILD_SHARED_LIBS)


@ -39,6 +39,9 @@
#endif
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
#include <thread>
#include <future>
#endif
#if defined(_MSC_VER)
@ -61,7 +64,7 @@
#else
#include <sys/syslimits.h>
#endif
#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
#define LLAMA_CURL_MAX_HEADER_LENGTH 256
#endif // LLAMA_USE_CURL
@ -154,8 +157,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return result;
}
static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int & i, bool & invalid_param) {
std::string arg = argv[i];
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
llama_sampling_params& sparams = params.sparams;
if (arg == "-s" || arg == "--seed") {
@ -648,14 +650,6 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
params.model = argv[i];
return true;
}
if (arg == "-mu" || arg == "--model-url") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.model_url = argv[i];
return true;
}
if (arg == "-md" || arg == "--model-draft") {
if (++i >= argc) {
invalid_param = true;
@ -672,6 +666,30 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
params.model_alias = argv[i];
return true;
}
if (arg == "-mu" || arg == "--model-url") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.model_url = argv[i];
return true;
}
if (arg == "-hfr" || arg == "--hf-repo") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.hf_repo = argv[i];
return true;
}
if (arg == "-hff" || arg == "--hf-file") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.hf_file = argv[i];
return true;
}
if (arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
@ -948,6 +966,22 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
}
return true;
}
if (arg == "-lcs" || arg == "--lookup-cache-static") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.lookup_cache_static = argv[i];
return true;
}
if (arg == "-lcd" || arg == "--lookup-cache-dynamic") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.lookup_cache_dynamic = argv[i];
return true;
}
if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
if (++i >= argc) {
invalid_param = true;
@ -1201,13 +1235,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (!gpt_params_find_arg(argc, argv, params, i, invalid_param)) {
if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
}
if (params.prompt_cache_all &&
(params.interactive || params.interactive_first ||
params.instruct)) {
@ -1215,6 +1251,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
// short-hand to avoid specifying --hf-file -> default it to --model
if (!params.hf_repo.empty() && params.hf_file.empty()) {
params.hf_file = params.model;
}
if (params.escape) {
process_escapes(params.prompt);
process_escapes(params.input_prefix);
@ -1404,12 +1445,20 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
printf(" model download url (default: %s)\n", params.model_url.c_str());
printf(" -md FNAME, --model-draft FNAME\n");
printf(" draft model for speculative decoding\n");
printf(" draft model for speculative decoding (default: unused)\n");
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
printf(" model download url (default: unused)\n");
printf(" -hfr REPO, --hf-repo REPO\n");
printf(" Hugging Face model repository (default: unused)\n");
printf(" -hff FILE, --hf-file FILE\n");
printf(" Hugging Face model file (default: unused)\n");
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
@ -1656,25 +1705,13 @@ void llama_batch_add(
#ifdef LLAMA_USE_CURL
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
struct llama_model_params params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
fprintf(stderr, "%s: invalid model_url\n", __func__);
return NULL;
}
// Initialize libcurl globally
auto curl = curl_easy_init();
if (!curl) {
fprintf(stderr, "%s: error initializing libcurl\n", __func__);
return NULL;
}
static bool llama_download_file(CURL * curl, const char * url, const char * path) {
bool force_download = false;
// Set the URL, allow to follow http redirection
curl_easy_setopt(curl, CURLOPT_URL, model_url);
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
#if defined(_WIN32)
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
// operating system. Currently implemented under MS-Windows.
@ -1683,16 +1720,16 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
// Check if the file already exists locally
struct stat model_file_info;
auto file_exists = (stat(path_model, &model_file_info) == 0);
auto file_exists = (stat(path, &model_file_info) == 0);
// If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model);
char etag_path[PATH_MAX] = {0};
snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model);
char last_modified_path[PATH_MAX] = {0};
snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);
if (file_exists) {
auto * f_etag = fopen(etag_path, "r");
@ -1700,7 +1737,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (!fgets(etag, sizeof(etag), f_etag)) {
fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
} else {
fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag);
fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
}
fclose(f_etag);
}
@ -1710,7 +1747,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
} else {
fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path,
fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
last_modified);
}
fclose(f_last_modified);
@ -1728,6 +1765,11 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
// Convert header field name to lowercase
for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
buffer[i] = tolower(buffer[i]);
}
const char * etag_prefix = "etag: ";
if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
@ -1750,7 +1792,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (res != CURLE_OK) {
curl_easy_cleanup(curl);
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
return NULL;
return false;
}
long http_code = 0;
@ -1758,30 +1800,34 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (http_code != 200) {
// HEAD not supported, we don't know if the file has changed
// force trigger downloading
file_exists = false;
force_download = true;
fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
}
}
// If the ETag or the Last-Modified headers are different: trigger a new download
if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) {
char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model);
bool should_download = !file_exists
|| force_download
|| (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
|| (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
if (should_download) {
char path_temporary[PATH_MAX] = {0};
snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
if (file_exists) {
fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model);
if (remove(path_model) != 0) {
fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
if (remove(path) != 0) {
curl_easy_cleanup(curl);
fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model);
return NULL;
fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
return false;
}
}
// Set the output file
auto * outfile = fopen(path_model_temporary, "wb");
auto * outfile = fopen(path_temporary, "wb");
if (!outfile) {
curl_easy_cleanup(curl);
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model);
return NULL;
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
return false;
}
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
@ -1795,15 +1841,30 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
// display download progress
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
// helper function to hide password in URL
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
std::size_t protocol_pos = url.find("://");
if (protocol_pos == std::string::npos) {
return url; // Malformed URL
}
std::size_t at_pos = url.find('@', protocol_pos + 3);
if (at_pos == std::string::npos) {
return url; // No password in URL
}
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
};
// start the download
fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
model_url, path_model, headers.etag, headers.last_modified);
fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
auto res = curl_easy_perform(curl);
if (res != CURLE_OK) {
fclose(outfile);
curl_easy_cleanup(curl);
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
return NULL;
return false;
}
long http_code = 0;
@ -1812,7 +1873,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
fclose(outfile);
curl_easy_cleanup(curl);
fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
return NULL;
return false;
}
// Clean up
@ -1824,7 +1885,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (etag_file) {
fputs(headers.etag, etag_file);
fclose(etag_file);
fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag);
fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
}
}
@ -1834,42 +1895,177 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
if (last_modified_file) {
fputs(headers.last_modified, last_modified_file);
fclose(last_modified_file);
fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path,
fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
headers.last_modified);
}
}
if (rename(path_model_temporary, path_model) != 0) {
if (rename(path_temporary, path) != 0) {
curl_easy_cleanup(curl);
fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model);
fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
return false;
}
}
return true;
}
struct llama_model * llama_load_model_from_url(
const char * model_url,
const char * path_model,
const struct llama_model_params & params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
fprintf(stderr, "%s: invalid model_url\n", __func__);
return NULL;
}
// Initialize libcurl
auto * curl = curl_easy_init();
if (!curl) {
fprintf(stderr, "%s: error initializing libcurl\n", __func__);
return NULL;
}
if (!curl) {
fprintf(stderr, "%s: error initializing libcurl\n", __func__);
return NULL;
}
if (!llama_download_file(curl, model_url, path_model)) {
return NULL;
}
// check for additional GGUFs split to download
int n_split = 0;
{
struct gguf_init_params gguf_params = {
/*.no_alloc = */ true,
/*.ctx = */ NULL,
};
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
if (!ctx_gguf) {
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
curl_easy_cleanup(curl);
return NULL;
}
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
if (key_n_split >= 0) {
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
}
gguf_free(ctx_gguf);
}
curl_easy_cleanup(curl);
if (n_split > 1) {
char split_prefix[PATH_MAX] = {0};
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
// Verify the first split file format
// and extract split URL and PATH prefixes
{
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
fprintf(stderr, "\n%s: unexpected model file name: %s"
" n_split=%d\n", __func__, path_model, n_split);
return NULL;
}
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
fprintf(stderr, "\n%s: unexpected model url: %s"
" n_split=%d\n", __func__, model_url, n_split);
return NULL;
}
}
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
for (int idx = 1; idx < n_split; idx++) {
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
auto * curl = curl_easy_init();
bool res = llama_download_file(curl, split_url, split_path);
curl_easy_cleanup(curl);
return res;
}, idx));
}
// Wait for all downloads to complete
for (auto & f : futures_download) {
if (!f.get()) {
return NULL;
}
}
}
return llama_load_model_from_file(path_model, params);
}
struct llama_model * llama_load_model_from_hf(
const char * repo,
const char * model,
const char * path_model,
const struct llama_model_params & params) {
// construct hugging face model url:
//
// --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
// https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
//
// --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
// https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
//
std::string model_url = "https://huggingface.co/";
model_url += repo;
model_url += "/resolve/main/";
model_url += model;
return llama_load_model_from_url(model_url.c_str(), path_model, params);
}
#else
struct llama_model * llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/,
struct llama_model_params /*params*/) {
struct llama_model * llama_load_model_from_url(
const char * /*model_url*/,
const char * /*path_model*/,
const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
return nullptr;
}
struct llama_model * llama_load_model_from_hf(
const char * /*repo*/,
const char * /*model*/,
const char * /*path_model*/,
const struct llama_model_params & /*params*/) {
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return nullptr;
}
#endif // LLAMA_USE_CURL
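For illustration, here is a hypothetical caller-side sketch of the new Hugging Face helper added above; it is not part of this change. The repo and file names are the ones used in the comment above, the local path is a placeholder, llama_model_default_params and llama_free_model are assumed to come from llama.h, and a build with LLAMA_CURL=ON is assumed. The same behaviour is exposed on the command line through the new -hfr/-hff flags.

// Hypothetical usage sketch only -- not part of this change.
#include "common.h"
#include "llama.h"
#include <cstdio>

int main() {
    struct llama_model_params mparams = llama_model_default_params();

    // Roughly equivalent to: -hfr ggml-org/models -hff tinyllama-1.1b/ggml-model-f16.gguf -m tinyllama.gguf
    struct llama_model * model = llama_load_model_from_hf(
        "ggml-org/models",                    // repo
        "tinyllama-1.1b/ggml-model-f16.gguf", // file inside the repo
        "tinyllama.gguf",                     // local path for the downloaded model
        mparams);

    if (model == NULL) {
        fprintf(stderr, "download or load failed\n");
        return 1;
    }

    llama_free_model(model);
    return 0;
}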
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
auto mparams = llama_model_params_from_gpt_params(params);
llama_model * model = nullptr;
if (!params.model_url.empty()) {
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
} else if (!params.model_url.empty()) {
model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return std::make_tuple(nullptr, nullptr);


@ -89,9 +89,11 @@ struct gpt_params {
struct llama_sampling_params sparams;
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_url = ""; // model url to download
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string model_url = ""; // model url to download
std::string hf_repo = ""; // HF repo
std::string hf_file = ""; // HF file
std::string prompt = "";
std::string prompt_file = ""; // store the external prompt file name
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
@ -99,6 +101,8 @@ struct gpt_params {
std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
std::string logits_file = ""; // file for saving *all* logits
std::vector<llama_model_kv_override> kv_overrides;
@ -139,7 +143,7 @@ struct gpt_params {
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = false; // insert new sequences for decoding on-the-fly
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
@ -167,6 +171,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
std::string get_system_info(const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
@ -192,8 +198,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
struct llama_model_params params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
// Batch utils
@ -302,3 +308,10 @@ struct llama_control_vector_load_info {
// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
//
// Split utils
//
static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
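For illustration, a hypothetical sketch of reading the split.count key defined above from a GGUF header, mirroring the check performed in llama_load_model_from_url; it is not part of this change. The file name is a placeholder and the gguf_* calls are assumed to come from ggml.h.

// Hypothetical usage sketch only -- not part of this change.
#include "common.h"
#include "ggml.h"
#include <cstdio>

int main() {
    struct gguf_init_params gguf_params = {
        /*.no_alloc = */ true,
        /*.ctx      = */ NULL,
    };
    // Placeholder path to the first split of a sharded model.
    struct gguf_context * ctx_gguf = gguf_init_from_file("model-first-split.gguf", gguf_params);
    if (!ctx_gguf) {
        fprintf(stderr, "failed to load GGUF header\n");
        return 1;
    }

    int n_split = 0;
    const int key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
    if (key_n_split >= 0) {
        n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
    }
    printf("split.count = %d\n", n_split);

    gguf_free(ctx_gguf);
    return 0;
}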


@ -9,7 +9,7 @@
#include <unordered_set>
#include <vector>
using json = nlohmann::json;
using json = nlohmann::ordered_json;
const std::string SPACE_RULE = "\" \"?";
@ -124,7 +124,7 @@ static std::string replacePattern(const std::string & input, const std::regex &
}
static std::string format_literal(const std::string & literal) {
std::string escaped = replacePattern(json(literal).dump(), GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) {
std::string escaped = replacePattern(literal, GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) {
char c = match.str()[0];
return GRAMMAR_LITERAL_ESCAPES.at(c);
});
@ -137,7 +137,7 @@ private:
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
std::map<std::string, std::string> _rules;
std::unordered_map<std::string, nlohmann::json> _refs;
std::unordered_map<std::string, json> _refs;
std::unordered_set<std::string> _refs_being_resolved;
std::vector<std::string> _errors;
std::vector<std::string> _warnings;
@ -413,7 +413,7 @@ private:
std::string prop_rule_name = visit(prop_schema, name + (name.empty() ? "" : "-") + prop_name);
prop_kv_rule_names[prop_name] = _add_rule(
name + (name.empty() ? "" : "-") + prop_name + "-kv",
format_literal(prop_name) + " space \":\" space " + prop_rule_name
format_literal(json(prop_name).dump()) + " space \":\" space " + prop_rule_name
);
if (required.find(prop_name) != required.end()) {
required_props.push_back(prop_name);
@ -495,7 +495,7 @@ public:
_rules["space"] = SPACE_RULE;
}
void resolve_refs(nlohmann::json & schema, const std::string & url) {
void resolve_refs(json & schema, const std::string & url) {
/*
* Resolves all $ref fields in the given schema, fetching any remote schemas,
* replacing each $ref with absolute reference URL and populates _refs with the
@ -557,11 +557,7 @@ public:
}
std::string _generate_constant_rule(const json & value) {
if (!value.is_string()) {
_errors.push_back("Only std::string constants are supported, got " + value.dump());
return "";
}
return format_literal(value.get<std::string>());
return format_literal(value.dump());
}
std::string visit(const json & schema, const std::string & name) {


@ -1,4 +1,4 @@
#pragma once
#include "json.hpp"
std::string json_schema_to_grammar(const nlohmann::json& schema);
std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
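For illustration, a hypothetical sketch of calling the updated interface; the schema content is an arbitrary example, and only the nlohmann::ordered_json signature above is taken from this change.

// Hypothetical usage sketch only -- not part of this change.
#include "json-schema-to-grammar.h"
#include "json.hpp"
#include <iostream>
#include <string>

int main() {
    // ordered_json preserves the key order of the input schema.
    nlohmann::ordered_json schema = nlohmann::ordered_json::parse(R"({
        "type": "object",
        "properties": {
            "name":  { "type": "string" },
            "count": { "type": "integer" }
        },
        "required": ["name"]
    })");

    std::cout << json_schema_to_grammar(schema) << std::endl;
    return 0;
}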


@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
//
#ifndef _MSC_VER
#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
#define LOG_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD
//
#ifndef _MSC_VER
#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
#define LOG_TEE_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
@ -566,6 +566,7 @@ inline void log_print_usage()
printf(" --log-new Create a separate new log file on start. "
"Each log file will have unique name: \"<name>.<ID>.log\"\n");
printf(" --log-append Don't truncate the old log file.\n");
printf("\n");
}
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)

common/ngram-cache.cpp (new file, 282 lines)

@ -0,0 +1,282 @@
#include "ngram-cache.h"
#include "common.h"
#include "log.h"
#include <cstdint>
#include <fstream>
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) {
const int64_t t_start_ms = ggml_time_ms();
const int64_t inp_size = inp.size();
const int64_t n_todo = inp_size * (ngram_max - ngram_min + 1);
int64_t n_done = 0;
for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
const int64_t i_start = std::max(inp_size - nnew, ngram_size);
for (int64_t i = i_start; i < inp_size; ++i) {
const int64_t ngram_start = i - ngram_size;
llama_ngram ngram(&inp[ngram_start], ngram_size);
const llama_token token = inp[i];
llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
if (part_it == ngram_cache.end()) {
llama_ngram_cache_part part;
part.emplace(token, 1);
ngram_cache.emplace(ngram, part);
} else {
llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
if (token_count_it == part_it->second.end()) {
part_it->second.emplace(token, 1);
} else {
token_count_it->second++;
}
}
++n_done;
if (print_progress && n_done % 10000000 == 0) {
const int64_t t_now_ms = ggml_time_ms();
const int64_t eta_ms = (inp_size*(ngram_max-ngram_min+1) - n_done) * (t_now_ms - t_start_ms) / n_done;
const int64_t eta_min = eta_ms / (60*1000);
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
fprintf(stderr, "%s: %" PRId64 "/%" PRId64 " done, ETA: %02" PRId64 ":%02" PRId64 "\n", __func__, n_done, n_todo, eta_min, eta_s);
}
}
}
}
// Helper function to get a token from the combined, speculative sequence of inp and draft.
static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
}
// If sample size or percentage are below these thresholds the draft is aborted early:
constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2, 2, 1, 1};
constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50};
constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
// Helper function that tries to draft a token from only the static ngram cache:
static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
if (part_static_it == nc_static.end()) {
return -1;
}
const llama_ngram_cache_part part_static = part_static_it->second;
int max_count_static = 0;
int sum_count_static = 0;
llama_token max_token = -1;
for (std::pair<llama_token, int> token_count_static : part_static) {
const llama_token token = token_count_static.first;
const int32_t count_static = token_count_static.second;
if (count_static > max_count_static) {
max_token = token;
max_count_static = count_static;
}
sum_count_static += count_static;
}
if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
return -1;
}
if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
return -1;
}
return max_token;
}
// Try to draft a token from primary cache (context/dynamic), validate with static cache:
static llama_token try_draft(
llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
const int * min_sample_size, const int * min_percent) {
llama_token drafted_token = -1;
for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
const llama_ngram ngram_primary = ngrams_primary[i];
llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
if (part_primary_it == nc_primary.end()) {
continue;
}
const llama_ngram_cache_part part_primary = part_primary_it->second;
int max_count_primary = 0;
int max_count_static = 0;
int sum_count_primary = 0;
llama_token max_token = -1;
for (std::pair<llama_token, int> token_count_primary : part_primary) {
const llama_token token = token_count_primary.first;
llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
const int32_t count_primary = token_count_primary.second;
const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
if (count_primary*count_static > max_count_primary*max_count_static) {
max_token = token;
max_count_primary = count_primary;
max_count_static = count_static;
}
sum_count_primary += count_primary;
}
if (sum_count_primary < min_sample_size[i]) {
continue;
}
if (100*max_count_primary < min_percent[i]*sum_count_primary) {
continue;
}
drafted_token = max_token;
}
return drafted_token;
}
void llama_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
) {
GGML_ASSERT(draft.size() == 1);
const int inp_size = inp.size();
if (inp_size < LLAMA_NGRAM_STATIC) {
return;
}
while ((int) draft.size()-1 < n_draft) {
llama_token drafted_token = -1;
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
llama_ngram ngram_static;
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
}
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
llama_ngram_cache_part part_static;
if (part_static_it != nc_static.end()) {
part_static = part_static_it->second;
}
// cd = context + dynamic
std::vector<llama_ngram> ngrams_cd;
for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
llama_ngram ngram_cd;
for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
}
ngrams_cd.push_back(ngram_cd);
}
if (drafted_token == -1) {
drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
}
if (drafted_token == -1) {
drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
}
if (drafted_token == -1) {
drafted_token = try_draft(nc_static, ngram_static);
}
if (drafted_token == -1) {
break;
}
LOG(" - draft candidate: token=%d\n", drafted_token);
draft.push_back(drafted_token);
}
}
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
std::ofstream file_out(filename, std::ios::binary);
for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
const llama_ngram ngram = item.first;
llama_ngram_cache_part token_counts = item.second;
GGML_ASSERT(!token_counts.empty());
const int32_t ntokens = token_counts.size();
GGML_ASSERT(ntokens > 0);
file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram));
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
for (std::pair<llama_token, int32_t> item2 : token_counts) {
const llama_token token = item2.first;
const int32_t count = item2.second;
GGML_ASSERT(count > 0);
file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
}
}
}
llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
std::ifstream hashmap_file(filename, std::ios::binary);
if (!hashmap_file) {
throw std::ifstream::failure("Unable to open file " + filename);
}
llama_ngram_cache ngram_cache;
llama_ngram ngram;
int32_t ntokens;
llama_token token;
int32_t count;
char * ngramc = reinterpret_cast<char*>(&ngram);
char * ntokensc = reinterpret_cast<char*>(&ntokens);
char * tokenc = reinterpret_cast<char*>(&token);
char * countc = reinterpret_cast<char*>(&count);
while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
GGML_ASSERT(!hashmap_file.eof());
GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
GGML_ASSERT(ntokens > 0);
llama_ngram_cache_part token_counts;
for (int i = 0; i < ntokens; ++i) {
GGML_ASSERT(!hashmap_file.eof());
GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token)));
GGML_ASSERT(!hashmap_file.eof());
GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
GGML_ASSERT(count > 0);
token_counts.emplace(token, count);
}
ngram_cache.emplace(ngram, token_counts);
}
GGML_ASSERT(hashmap_file.eof());
return ngram_cache;
}
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
const llama_ngram ngram = ngram_part.first;
llama_ngram_cache_part part = ngram_part.second;
llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
if (part_merged_it == ngram_cache_target.end()) {
ngram_cache_target.emplace(ngram, part);
continue;
}
for (std::pair<llama_token, int32_t> token_count : part) {
const llama_token token = token_count.first;
const int32_t count = token_count.second;
GGML_ASSERT(count > 0);
llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
if (token_count_merged_it == part_merged_it->second.end()) {
part_merged_it->second.emplace(token, count);
continue;
}
token_count_merged_it->second += count;
}
}
}

common/ngram-cache.h (new file, 94 lines)

@ -0,0 +1,94 @@
#pragma once
#include "llama.h"
#include <unordered_map>
#include <string>
#include <vector>
#define LLAMA_NGRAM_MIN 1
#define LLAMA_NGRAM_MAX 4
#define LLAMA_NGRAM_STATIC 2
// Data structures to map n-grams to empirical token probabilities:
struct llama_ngram {
llama_token tokens[LLAMA_NGRAM_MAX];
llama_ngram() {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = -1;
}
}
llama_ngram(const llama_token * input, const int ngram_size) {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = i < ngram_size ? input[i] : -1;
}
}
bool operator==(const llama_ngram & other) const {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
if (tokens[i] != other.tokens[i]) {
return false;
}
}
return true;
}
};
struct llama_ngram_hash_function {
size_t operator()(const llama_ngram & ngram) const {
size_t hash = 0;
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
}
return hash;
}
};
// token -> number of times token has been seen
typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
// n-gram -> empirical distribution of following tokens
typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
// Update an ngram cache with tokens.
// ngram_cache: the cache to modify.
// ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data.
// inp_data: the token sequence with which to update ngram_cache.
// nnew: how many new tokens have been appended to inp_data since the last call to this function.
// print_progress: whether to print progress to stderr.
//
// In order to get correct results inp_data can ONLY BE APPENDED TO.
// Changes in the middle need a complete rebuild.
void llama_ngram_cache_update(
llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
// Try to draft tokens from ngram caches.
// inp: the tokens generated so far.
// draft: the token sequence to draft. Expected to initially contain the previously sampled token.
// n_draft: maximum number of tokens to add to draft.
// ngram_min/ngram_max: the min/max size of the ngrams in nc_context and nc_dynamic.
// nc_context: ngram cache based on current context.
// nc_dynamic: ngram cache based on previous user generations.
// nc_static: ngram cache generated from a large text corpus, used for validation.
void llama_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
// Save an ngram cache to a file.
// ngram_cache: the ngram cache to save.
// filename: the path under which to save the ngram cache.
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
// Load an ngram cache saved with llama_ngram_cache_save.
// filename: the path from which to load the ngram cache.
// returns: an ngram cache containing the information saved to filename.
llama_ngram_cache llama_ngram_cache_load(std::string & filename);
// Merge two ngram caches.
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
// ngram_cache_add: the ngram cache to add to ngram_cache_target.
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
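To show how the declarations above fit together, here is a hypothetical sketch of a lookup-decoding step; the file paths, token vector, and surrounding control flow are placeholders, and only the API declared above is taken from this change.

// Hypothetical usage sketch only -- not part of this change.
#include "ngram-cache.h"
#include <string>
#include <vector>

static void example_lookup_draft(std::vector<llama_token> & inp, llama_token last_token, int n_draft) {
    std::string static_path  = "ngrams-static.bin";   // placeholder: cache built offline from a large corpus
    std::string dynamic_path = "ngrams-dynamic.bin";  // placeholder: cache carried across generations

    llama_ngram_cache nc_static  = llama_ngram_cache_load(static_path);   // throws if the file is missing
    llama_ngram_cache nc_dynamic = llama_ngram_cache_load(dynamic_path);

    // Build the context cache from the tokens generated so far (everything counts as new on the first call).
    llama_ngram_cache nc_context;
    llama_ngram_cache_update(nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, (int) inp.size(), false);

    // Draft up to n_draft tokens; the draft must start with the previously sampled token.
    std::vector<llama_token> draft = { last_token };
    llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
                            nc_context, nc_dynamic, nc_static);

    // After the drafted tokens have been verified, fold the context cache into the dynamic one and persist it.
    llama_ngram_cache_merge(nc_dynamic, nc_context);
    llama_ngram_cache_save(nc_dynamic, dynamic_path);
}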


@ -168,77 +168,20 @@ static llama_token llama_sampling_sample_impl(
bool is_resampling) { // Add a parameter to indicate if we are resampling
const llama_sampling_params & params = ctx_sampling->params;
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const float temp = params.temp;
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
const bool penalize_nl = params.penalize_nl;
auto & prev = ctx_sampling->prev;
auto & cur = ctx_sampling->cur;
std::vector<float> original_logits;
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits);
if (!is_resampling) {
GGML_ASSERT(!original_logits.empty());
}
llama_token id = 0;
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
// Declare original_logits at the beginning of the function scope
std::vector<float> original_logits;
if (!is_resampling) {
// Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this.
original_logits = std::vector<float>(logits, logits + llama_n_vocab(llama_get_model(ctx_main)));
}
// apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
if (ctx_cfg) {
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}
cur.clear();
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
// apply penalties
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
if (penalty_tokens_used_size) {
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
llama_sample_repetition_penalties(ctx_main, &cur_p,
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
cur_p.data[idx].logit = nl_logit;
break;
}
}
}
}
// If we are in the resampling phase, apply grammar checks before sampling logic
if (is_resampling && ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
}
if (temp < 0.0) {
// greedy sampling, with probs
llama_sample_softmax(ctx_main, &cur_p);
@ -302,11 +245,13 @@ static llama_token llama_sampling_sample_impl(
return id;
}
static llama_token_data_array llama_sample_probability_distribution_impl(
static llama_token_data_array llama_sampling_prepare_impl(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
const int idx,
bool apply_grammar,
std::vector<float> * original_logits) {
const llama_sampling_params & params = ctx_sampling->params;
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
@ -315,6 +260,7 @@ static llama_token_data_array llama_sample_probability_distribution_impl(
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;
const bool penalize_nl = params.penalize_nl;
auto & prev = ctx_sampling->prev;
@ -323,8 +269,10 @@ static llama_token_data_array llama_sample_probability_distribution_impl(
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);
// Declare original_logits at the beginning of the function scope
std::vector<float> original_logits;
if (apply_grammar && original_logits != NULL) {
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
}
// apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
@ -364,12 +312,11 @@ static llama_token_data_array llama_sample_probability_distribution_impl(
}
}
// apply grammar checks
if (ctx_sampling->grammar != NULL) {
// apply grammar checks before sampling logic
if (apply_grammar && ctx_sampling->grammar != NULL) {
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
}
llama_sample_softmax(ctx_main, &cur_p);
return cur_p;
}
@ -382,12 +329,14 @@ llama_token llama_sampling_sample(
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
}
llama_token_data_array llama_sampling_probability_distribution(
llama_token_data_array llama_sampling_prepare(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
return llama_sample_probability_distribution_impl(ctx_sampling,ctx_main, ctx_cfg, idx);
const int idx,
bool apply_grammar,
std::vector<float> * original_logits) {
return llama_sampling_prepare_impl(ctx_sampling, ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
}
void llama_sampling_accept(

View file

@ -131,12 +131,14 @@ llama_token llama_sampling_sample(
struct llama_context * ctx_cfg,
int idx = 0);
// returns the probability that token of given id will be sampled
llama_token_data_array llama_sampling_probability_distribution(
// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
llama_token_data_array llama_sampling_prepare(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
int idx = 0);
int idx = 0,
bool apply_grammar = true,
std::vector<float> * original_logits = nullptr);
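A minimal caller-side sketch of the new prepare/restore flow, assuming the declarations above; the surrounding variables (`ctx_sampling`, `ctx_main`) and the restore-on-reject pattern are illustrative, not part of this change:

```cpp
// Prepare the candidate array with grammar applied, keeping a copy of the
// unmodified logits so they can be written back if the sampled token is rejected.
std::vector<float> original_logits;
llama_token_data_array cur_p = llama_sampling_prepare(
    ctx_sampling, ctx_main, /*ctx_cfg=*/nullptr, /*idx=*/0,
    /*apply_grammar=*/true, &original_logits);

// ... sample a token from cur_p ...

// Restore the original logits before preparing candidates again:
float * logits = llama_get_logits_ith(ctx_main, /*i=*/0);
std::copy(original_logits.begin(), original_logits.end(), logits);
```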
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,

View file

@ -93,31 +93,42 @@ class Model(ABC):
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
self.gguf_writer.add_context_length(n_ctx)
print(f"gguf: context length = {n_ctx}")
n_embd = self.find_hparam(["hidden_size", "n_embd"])
self.gguf_writer.add_embedding_length(n_embd)
print(f"gguf: embedding length = {n_embd}")
if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
self.gguf_writer.add_feed_forward_length(n_ff)
print(f"gguf: feed forward length = {n_ff}")
n_head = self.find_hparam(["num_attention_heads", "n_head"])
self.gguf_writer.add_head_count(n_head)
print(f"gguf: head count = {n_head}")
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
self.gguf_writer.add_head_count_kv(n_head_kv)
print(f"gguf: key-value head count = {n_head_kv}")
if (rope_theta := self.hparams.get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base(rope_theta)
print(f"gguf: rope theta = {rope_theta}")
if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
print(f"gguf: rms norm epsilon = {f_rms_eps}")
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
print(f"gguf: layer norm epsilon = {f_norm_eps}")
if (n_experts := self.hparams.get("num_local_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
print(f"gguf: expert count = {n_experts}")
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
print(f"gguf: experts used count = {n_experts_used}")
self.gguf_writer.add_file_type(self.ftype)
print(f"gguf: file type = {self.ftype}")
def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@ -1051,6 +1062,21 @@ class MixtralModel(Model):
self._set_vocab_sentencepiece()
@Model.register("GrokForCausalLM")
class GrokModel(Model):
model_arch = gguf.MODEL_ARCH.GROK
def set_vocab(self):
self._set_vocab_sentencepiece()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_name("Grok")
@Model.register("MiniCPMForCausalLM")
class MiniCPMModel(Model):
model_arch = gguf.MODEL_ARCH.MINICPM

View file

@ -34,6 +34,7 @@ else()
add_subdirectory(perplexity)
add_subdirectory(quantize)
add_subdirectory(quantize-stats)
add_subdirectory(retrieval)
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(passkey)

View file

@ -48,6 +48,8 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
}
process_escapes(params.prompt);
// init LLM
llama_backend_init();

View file

@ -21,6 +21,8 @@ An example command using a model from [karpathy/tinyllamas](https://huggingface.
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
Note: `stories260K.bin` requires its own tokenizer, `tok512.bin`, which can be found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
Now you can use the model with a command like:
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

View file

@ -1,6 +1,7 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "log.h"
#include <unordered_map>
#include <vector>
@ -78,111 +79,101 @@ typedef struct {
struct TransformerWeights {
// token embedding table
float* token_embedding_table; // (vocab_size, dim)
std::vector<float> token_embedding_table; // (vocab_size, dim)
// weights for rmsnorms
float* rms_att_weight; // (layer, dim) rmsnorm weights
float* rms_ffn_weight; // (layer, dim)
std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
std::vector<float> rms_ffn_weight; // (layer, dim)
// weights for matmuls
float* wq; // (layer, dim, dim)
float* wk; // (layer, dim, dim)
float* wv; // (layer, dim, dim)
float* wo; // (layer, dim, dim)
std::vector<float> wq; // (layer, dim, dim)
std::vector<float> wk; // (layer, dim, dim)
std::vector<float> wv; // (layer, dim, dim)
std::vector<float> wo; // (layer, dim, dim)
// weights for ffn
float* w1; // (layer, hidden_dim, dim)
float* w2; // (layer, dim, hidden_dim)
float* w3; // (layer, hidden_dim, dim)
std::vector<float> w1; // (layer, hidden_dim, dim)
std::vector<float> w2; // (layer, dim, hidden_dim)
std::vector<float> w3; // (layer, hidden_dim, dim)
// final rmsnorm
float* rms_final_weight; // (dim,)
std::vector<float> rms_final_weight; // (dim,)
// freq_cis for RoPE relatively positional embeddings
// float* freq_cis_real; // (seq_len, dim/2)
// float* freq_cis_imag; // (seq_len, dim/2)
// std::vector<float> freq_cis_real; // (seq_len, dim/2)
// std::vector<float> freq_cis_imag; // (seq_len, dim/2)
// (optional) classifier weights for the logits, on the last layer
float* wcls;
~TransformerWeights() {
delete[] token_embedding_table;
delete[] rms_att_weight;
delete[] rms_ffn_weight;
delete[] wq;
delete[] wk;
delete[] wv;
delete[] wo;
delete[] w1;
delete[] w2;
delete[] w3;
delete[] rms_final_weight;
delete[] wcls;
}
std::vector<float> wcls;
};
static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
try {
w->token_embedding_table.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->rms_att_weight = new float[p->n_layers * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_att_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
w->rms_ffn_weight = new float[p->n_layers * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->rms_ffn_weight.resize(p->n_layers * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
w->wq = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wq.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wv = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
w->wo = new float[p->n_layers * p->dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->wo.resize(p->n_layers * p->dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
w->rms_final_weight = new float[p->dim]();
printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
w->rms_final_weight.resize(p->dim);
LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) {
w->wcls = NULL;
w->wcls = {};
} else {
w->wcls = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
w->wcls.resize(p->vocab_size * p->dim);
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
}
catch (std::length_error &) {
die("Invalid configuration. Failed to allocate memory for weights");
}
}
static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;
// Skip freq_cis_real & freq_cis_imag
int head_size = p->dim / p->n_heads;
fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;
// Check we didn't forget to read anything
auto curr = ftell(f);
fseek(f, 0, SEEK_END);
auto end = ftell(f);
if (curr != end) {
printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
return 1;
}
@ -190,20 +181,20 @@ static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bo
}
static void print_sample_weights(TransformerWeights *w){
printf("----- Quick print of first of the weight vales of all the variables\n");
printf("%f\n", w->token_embedding_table[0]);
printf("%f\n", w->rms_att_weight[0]);
printf("%f\n", w->rms_ffn_weight[0]);
LOG("----- Quick print of first of the weight vales of all the variables\n");
LOG("%f\n", w->token_embedding_table[0]);
LOG("%f\n", w->rms_att_weight[0]);
LOG("%f\n", w->rms_ffn_weight[0]);
printf("%f\n", w->wq[0]);
printf("%f\n", w->wk[0]);
printf("%f\n", w->wv[0]);
printf("%f\n", w->wo[0]);
printf("%f\n", w->w1[0]);
printf("%f\n", w->w2[0]);
printf("%f\n", w->w3[0]);
printf("%f\n", w->rms_att_weight[0]);
if (w->wcls) printf("%f\n", w->wcls[0]);
LOG("%f\n", w->wq[0]);
LOG("%f\n", w->wk[0]);
LOG("%f\n", w->wv[0]);
LOG("%f\n", w->wo[0]);
LOG("%f\n", w->w1[0]);
LOG("%f\n", w->w2[0]);
LOG("%f\n", w->w3[0]);
LOG("%f\n", w->rms_att_weight[0]);
if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -231,8 +222,10 @@ struct my_llama_hparams {
uint32_t n_ff = 11008;
uint32_t n_mult = 4;
uint32_t n_head = 32;
uint32_t n_head_kv = 32;
uint32_t n_layer = 32;
uint32_t n_rot = 64;
bool operator!=(const my_llama_hparams& other) const {
return memcmp(this, &other, sizeof(my_llama_hparams));
}
@ -325,14 +318,30 @@ struct train_params {
};
static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
printf("%s: n_embd: %u\n", __func__, params->n_embd);
printf("%s: n_mult: %u\n", __func__, params->n_mult);
printf("%s: n_head: %u\n", __func__, params->n_head);
printf("%s: n_ff: %u\n", __func__, params->n_ff);
printf("%s: n_layer: %u\n", __func__, params->n_layer);
printf("%s: n_rot: %u\n", __func__, params->n_rot);
LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
LOG("%s: n_embd: %u\n", __func__, params->n_embd);
LOG("%s: n_mult: %u\n", __func__, params->n_mult);
LOG("%s: n_head: %u\n", __func__, params->n_head);
LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
LOG("%s: n_ff: %u\n", __func__, params->n_ff);
LOG("%s: n_layer: %u\n", __func__, params->n_layer);
LOG("%s: n_rot: %u\n", __func__, params->n_rot);
}
static void print_tensor_info(const struct ggml_context * ctx) {
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
LOG("%s: Allocating ", __func__);
int64_t total = 1;
int i = 0;
for (; i < ggml_n_dims(t); ++i) {
if (i > 0) LOG("x ");
LOG("[%" PRId64 "] ", t->ne[i]);
total *= t->ne[i];
}
if (i > 1) LOG("= [%" PRId64 "] ", total);
LOG("float space for %s\n", ggml_get_name(t));
}
}
static void init_model(struct my_llama_model * model) {
@ -342,6 +351,8 @@ static void init_model(struct my_llama_model * model) {
const uint32_t n_layer = hparams.n_layer;
const uint32_t n_vocab = hparams.n_vocab;
const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;
const uint32_t n_ff = hparams.n_ff;
struct ggml_context * ctx = model->ctx;
@ -350,25 +361,8 @@ static void init_model(struct my_llama_model * model) {
model->train_tokens = 0;
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
// printing the per-layer allocations here so we dont print in the for loop.
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
ggml_set_name(model->norm, "norm.weight");
@ -383,8 +377,8 @@ static void init_model(struct my_llama_model * model) {
layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@ -406,6 +400,8 @@ static void init_model(struct my_llama_model * model) {
ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
}
print_tensor_info(ctx);
}
static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
@ -421,9 +417,9 @@ static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
static void print_row(struct ggml_tensor * probs, int i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
printf(" %f", p);
LOG(" %f", p);
}
printf("\n");
LOG("\n");
}
static void print_matrix(struct ggml_tensor * probs) {
@ -431,33 +427,12 @@ static void print_matrix(struct ggml_tensor * probs) {
for (int i = 0; i < probs->ne[1]; ++i) {
for (int k = 0; k < probs->ne[0]; ++k) {
float p = get_f32_2d(probs, k, i);
printf(" %.2f", p);
LOG(" %.2f", p);
}
printf("\n");
LOG("\n");
}
}
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
va_list ap, ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
@ -549,8 +524,9 @@ static std::string llama_escape_whitespaces(const std::string & text) {
return out.str();
}
static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
if (is_ggml_file(filename)) {
LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
@ -578,6 +554,9 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
}
vocab->id_to_token.resize(n_vocab);
@ -595,7 +574,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
gguf_free(ctx);
} else {
// assume llama2.c vocabulary
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
llama_file file(filename, "rb");
if (!file.fp) {
die_fmt("%s: %s", strerror(errno), filename);
@ -638,38 +617,15 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
}
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
int ct;
switch (ggml_n_dims(gg_weights)) {
case 1:
ct = 0;
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
*ptr = karpathy_weights[ct];
ct++;
int size = 1;
for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
size *= gg_weights->ne[dim];
}
break;
case 2:
ct = 0;
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
*ptr = karpathy_weights[ct];
ct++;
}
}
break;
case 3:
ct = 0;
for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
*ptr = karpathy_weights[ct];
ct++;
}
}
}
break;
for (int ct = 0; ct < size; ++ct) {
int64_t i0 = 0; int64_t i1 = 0;
int64_t i2 = 0; int64_t i3 = 0;
ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
}
}
@ -679,16 +635,18 @@ static void save_as_llama_model(
// convert AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());
convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
//print_row(model->norm, 0);
// for rms-att-weight
int row_length = model->hparams.n_embd;
int n_ff = model->hparams.n_ff;
const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
auto & layer = model->layers[i];
// 1d
@ -697,9 +655,10 @@ static void save_as_llama_model(
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
// from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length/n_multiqueries]);
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length/n_multiqueries]);
convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
@ -736,8 +695,8 @@ static void save_as_llama_model(
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
// n_head_kv is optional, default to n_head
// gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
@ -916,19 +875,30 @@ int main(int argc, char ** argv) {
if (!params_parse(argc, argv, &params)) {
return 1;
}
log_set_target(stdout);
Config config;
TransformerWeights weights = {};
{
FILE *file = fopen(params.fn_llama2c_model, "rb");
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
FILE *file = fopen(params.fn_llama2c_model, "rb");
if (!file) {
LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
return 1;
}
// read in the config header
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
if (fread(&config, sizeof(Config), 1, file) != 1) {
LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
return 1;
}
auto shared_weights = config.vocab_size > 0;
config.vocab_size = abs(config.vocab_size);
// read in the Transformer weights
malloc_weights(&weights, &config, shared_weights);
if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
alloc_weights(&weights, &config, shared_weights);
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
return 1;
}
fclose(file);
}
@ -942,9 +912,12 @@ int main(int argc, char ** argv) {
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;
model.hparams.n_head = config.n_heads; //params.n_head;
model.hparams.n_head_kv = config.n_kv_heads;
model.hparams.n_layer = config.n_layers; //params.n_layer;
model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
print_params(&model.hparams);
struct ggml_init_params lcparams;
lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
lcparams.mem_buffer = NULL;
@ -956,7 +929,7 @@ int main(int argc, char ** argv) {
model.name = basename(params.fn_llama2c_model);
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
ggml_free(model.ctx);
return 0;

View file

@ -1,32 +1,31 @@
#include "llama.h"
#include "ggml.h"
#include "common.h"
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <ios>
#include <string>
#include <vector>
#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <climits>
#include <stdexcept>
#if defined(_WIN32)
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif
enum split_operation : uint8_t {
SPLIT_OP_SPLIT,
SPLIT_OP_MERGE,
};
static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split";
static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count";
static const int SPLIT_FILENAME_MAX = 256;
static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf";
struct split_params {
split_operation operation = SPLIT_OP_SPLIT;
int n_split_tensors = 128;
@ -116,13 +115,13 @@ static bool split_params_parse(int argc, const char ** argv, split_params & para
try {
if (!split_params_parse_ex(argc, argv, params)) {
split_print_usage(argv[0]);
exit(1);
exit(EXIT_FAILURE);
}
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
split_print_usage(argv[0]);
exit(1);
exit(EXIT_FAILURE);
}
return result;
}
@ -134,12 +133,6 @@ static void zeros(std::ofstream & file, size_t n) {
}
}
static std::string split_file_name(const std::string & path, int i_split, int n_split) {
char f_split[SPLIT_FILENAME_MAX] = {0};
snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split);
return std::string(f_split);
}
struct split_strategy {
const split_params params;
std::ifstream & f_input;
@ -180,8 +173,9 @@ struct split_strategy {
if (i_split == 0) {
gguf_set_kv(ctx_out, ctx_gguf);
}
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split);
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
// populate the original tensors, so we get an initial metadata
for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
@ -189,10 +183,11 @@ struct split_strategy {
gguf_add_tensor(ctx_out, meta);
}
auto split_name = split_file_name(params.output, i_split, n_split);
char split_path[PATH_MAX] = {0};
llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
fprintf(stderr, "%s: %s ...", __func__, split_name.c_str());
fout = std::ofstream(split_name, std::ios::binary);
fprintf(stderr, "%s: %s ...", __func__, split_path);
fout = std::ofstream(split_path, std::ios::binary);
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
auto meta_size = gguf_get_meta_size(ctx_out);
@ -250,19 +245,23 @@ static void gguf_split(const split_params & split_params) {
std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
exit(1);
exit(EXIT_FAILURE);
}
auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
if (!ctx_gguf) {
fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(1);
exit(EXIT_FAILURE);
}
split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
char first_split_path[PATH_MAX] = {0};
llama_split_path(first_split_path, sizeof(first_split_path),
split_params.output.c_str(), strategy.i_split, strategy.n_split);
fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
__func__, split_params.input.c_str(),
split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(),
first_split_path,
split_params.n_split_tensors);
strategy.split_start();
@ -298,7 +297,9 @@ static void gguf_merge(const split_params & split_params) {
std::vector<ggml_context *> ctx_metas;
std::vector<gguf_context *> ctx_ggufs;
std::string split_prefix;
char split_path[PATH_MAX] = {0};
strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
char split_prefix[PATH_MAX] = {0};
// First pass to find KV and tensors metadata
for (int i_split = 0; i_split < n_split; i_split++) {
@ -309,89 +310,66 @@ static void gguf_merge(const split_params & split_params) {
/*.ctx = */ &ctx_meta,
};
auto split_name = split_params.input;
if (i_split > 0) {
split_name = split_file_name(split_prefix, i_split, n_split);
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
}
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str());
fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);
auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params);
auto * ctx_gguf = gguf_init_from_file(split_path, params);
if (!ctx_gguf) {
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
exit(1);
exit(EXIT_FAILURE);
}
ctx_ggufs.push_back(ctx_gguf);
ctx_metas.push_back(ctx_meta);
if (i_split == 0) {
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT);
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
if (key_n_split < 0) {
fprintf(stderr,
"\n%s: input file does not contain %s metadata\n",
__func__,
LLM_KV_GENERAL_SPLIT_N_SPLIT);
LLM_KV_SPLIT_COUNT);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(1);
exit(EXIT_FAILURE);
}
n_split = gguf_get_val_u8(ctx_gguf, key_n_split);
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
if (n_split < 1) {
fprintf(stderr,
"\n%s: input file does not contain a valid split count %d\n",
__func__,
n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(1);
exit(EXIT_FAILURE);
}
// Verify the file naming and extract split_prefix
if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) {
fprintf(stderr, "\n%s: unexpected input file name: %s"
" i_split=%d"
" n_split=%d\n", __func__,
split_path, i_split, n_split);
gguf_free(ctx_gguf);
ggml_free(ctx_meta);
gguf_free(ctx_out);
fout.close();
exit(EXIT_FAILURE);
}
// Do not trigger merge if we try to merge the output again
gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0);
gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
// Set metadata from the first split
gguf_set_kv(ctx_out, ctx_gguf);
}
// Verify the file naming
{
int i_split_file = 0;
int n_split_file = 0;
const char * i_split_format = "-00000-of-00000.gguf";
if (split_name.size() < strlen(i_split_format)) {
fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str());
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
}
gguf_free(ctx_out);
fout.close();
exit(1);
}
split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format));
const char * split_name_c_str = split_name.c_str();
int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file);
if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) {
fprintf(stderr, "\n%s: unexpected input file name: %s"
" i_split=%d i_split_file=%d"
" n_split=%d n_split_file=%d\n", __func__,
split_params.input.c_str(),
i_split, i_split_file,
n_split, n_split_file);
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
}
gguf_free(ctx_out);
fout.close();
exit(1);
}
}
auto n_tensors = gguf_get_n_tensors(ctx_gguf);
for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
@ -411,18 +389,19 @@ static void gguf_merge(const split_params & split_params) {
// Write tensors data
for (int i_split = 0; i_split < n_split; i_split++) {
auto split_name = split_file_name(split_prefix, i_split, n_split);
std::ifstream f_input(split_name.c_str(), std::ios::binary);
llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
std::ifstream f_input(split_path, std::ios::binary);
if (!f_input.is_open()) {
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str());
for (auto * _ctx_gguf : ctx_ggufs) {
gguf_free(_ctx_gguf);
fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path);
for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
gguf_free(ctx_ggufs[i]);
ggml_free(ctx_metas[i]);
}
gguf_free(ctx_out);
fout.close();
exit(1);
exit(EXIT_FAILURE);
}
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str());
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
auto * ctx_gguf = ctx_ggufs[i_split];
auto * ctx_meta = ctx_metas[i_split];
@ -482,7 +461,7 @@ int main(int argc, const char ** argv) {
case SPLIT_OP_MERGE: gguf_merge(params);
break;
default: split_print_usage(argv[0]);
exit(1);
exit(EXIT_FAILURE);
}
return 0;

View file

@ -50,17 +50,11 @@ private:
void keep_imatrix(int ncall) const;
};
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
GGML_UNUSED(user_data);
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
std::string wname;
{
// remove any prefix and suffixes from the name
// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
const char * p = strchr(src0->name, '#');
static std::string filter_tensor_name(const char * name) {
std::string wname;
const char * p = strchr(name, '#');
if (p != NULL) {
p = p + 1;
const char * q = strchr(p, '#');
@ -70,10 +64,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
wname = p;
}
} else {
wname = src0->name;
wname = name;
}
return wname;
}
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
GGML_UNUSED(user_data);
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
std::string wname = filter_tensor_name(src0->name);
// when ask is true, the scheduler wants to know if we are interested in data from this tensor
// if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
if (ask) {
@ -112,6 +114,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// this is necessary to guarantee equal number of "ncall" for each tensor
for (int ex = 0; ex < n_as; ++ex) {
src0 = t->src[2 + ex];
wname = filter_tensor_name(src0->name);
auto& e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);

View file

@ -61,7 +61,7 @@ class SchemaConverter:
def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal
)
return f'"{escaped}"'
@ -308,8 +308,7 @@ class SchemaConverter:
return ref_name
def _generate_constant_rule(self, value):
assert isinstance(value, str), f'Only string constants are supported, got {value}'
return self._format_literal(value)
return self._format_literal(json.dumps(value))
def visit(self, schema, name):
schema_type = schema.get('type')
@ -428,7 +427,7 @@ class SchemaConverter:
prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
prop_kv_rule_names[prop_name] = self._add_rule(
f'{name}{"-" if name else ""}{prop_name}-kv',
fr'{self._format_literal(prop_name)} space ":" space {prop_rule_name}'
fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}'
)
required_props = [k for k in sorted_props if k in required]
optional_props = [k for k in sorted_props if k not in required]

View file

@ -3,3 +3,21 @@ add_executable(${TARGET} lookup.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TARGET lookup-create)
add_executable(${TARGET} lookup-create.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TARGET lookup-merge)
add_executable(${TARGET} lookup-merge.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
set(TARGET lookup-stats)
add_executable(${TARGET} lookup-stats.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -0,0 +1,43 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "ngram-cache.h"
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model = NULL;
llama_context * ctx = NULL;
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
GGML_ASSERT(model != nullptr);
// tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
fprintf(stderr, "%s: tokenization done\n", __func__);
llama_ngram_cache ngram_cache;
llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
}

View file

@ -0,0 +1,47 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "ngram-cache.h"
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
static void print_usage() {
fprintf(stderr, "Merges multiple lookup cache files into a single one.\n");
fprintf(stderr, "Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n");
}
int main(int argc, char ** argv){
if (argc < 3) {
print_usage();
exit(1);
}
std::vector<std::string> args;
args.resize(argc-1);
for (int i = 0; i < argc-1; ++i) {
args[i] = argv[i+1];
if (args[i] == "-h" || args[i] == "--help") {
print_usage();
exit(0);
}
}
fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
for (size_t i = 1; i < args.size()-1; ++i) {
fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
}
fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
llama_ngram_cache_save(ngram_cache_merged, args.back());
}

View file

@ -0,0 +1,163 @@
#include "ggml.h"
#include "common.h"
#include "llama.h"
#include "log.h"
#include "ngram-cache.h"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_map>
int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
const int n_draft = params.n_draft;
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model = NULL;
llama_context * ctx = NULL;
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_set_rng_seed(ctx, params.seed);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
// tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos tgt: %d\n", add_bos);
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
llama_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic;
llama_ngram_cache ngram_cache_static;
int64_t t_draft_flat_us = 0;
int64_t t_draft_us = 0;
{
const int64_t t_start_draft_us = ggml_time_us();
if (!params.lookup_cache_static.empty()) {
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
if (!params.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}
t_draft_flat_us += ggml_time_us() - t_start_draft_us;
}
const int n_input = inp.size();
const int n_ctx = params.n_ctx;
int n_drafted = 0;
int n_accept = 0;
const int64_t t_start_ms = ggml_time_ms();
// Iterate over input tokens in chunks of size n_ctx.
// Each chunk is treated as if it were a sequential generation, but with pre-determined tokens to ensure reproducibility.
for (int i_start = 0; i_start + n_ctx < n_input; i_start += n_ctx) {
const std::vector<llama_token> inp_slice(inp.begin() + i_start, inp.begin() + i_start + n_ctx);
std::vector<llama_token> pseudo_output;
pseudo_output.push_back(inp_slice[0]);
while ((int) pseudo_output.size() < n_ctx) {
// Simulate drafting and decoding from draft:
std::vector<llama_token> draft;
draft.push_back(pseudo_output.back());
{
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
n_drafted += draft.size() - 1;
for (size_t j = 1; j < draft.size() && (int) pseudo_output.size() < n_ctx; ++j) {
const llama_token ground_truth = inp_slice[pseudo_output.size()];
const llama_token drafted = draft[j];
if (ground_truth != drafted) {
break;
}
++n_accept;
pseudo_output.push_back(ground_truth);
{
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
}
// After each simulated batch decoding simulate the sampling of a single token:
if ((int) pseudo_output.size() < n_ctx) {
pseudo_output.push_back(inp_slice[pseudo_output.size()]);
{
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
}
draft.erase(draft.begin());
}
if (i_start > 0 && i_start / 100000 != (i_start - n_ctx) / 100000) {
const int64_t t_now_ms = ggml_time_ms();
const int64_t eta_ms = (n_input - i_start) * (t_now_ms - t_start_ms) / i_start;
const int64_t eta_min = eta_ms / (60*1000);
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
}
// After each chunk, update the dynamic ngram cache with the context ngram cache:
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
ngram_cache_context.clear();
}
LOG_TEE("\n");
LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept);
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
fprintf(stderr, "\n\n");
return 0;
}

View file

@ -1,12 +1,15 @@
#include "common.h"
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "ngram-cache.h"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_map>
int main(int argc, char ** argv){
gpt_params params;
@ -15,11 +18,7 @@ int main(int argc, char ** argv){
return 1;
}
// max/min n-grams size to search for in prompt
const int ngram_max = 4;
const int ngram_min = 1;
// length of the candidate / draft sequence, if match is found
// max. number of additional tokens to draft if match is found
const int n_draft = params.n_draft;
const bool dump_kv_cache = params.dump_kv_cache;
@ -39,6 +38,8 @@ int main(int argc, char ** argv){
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_set_rng_seed(ctx, params.seed);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
// tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
@ -47,6 +48,35 @@ int main(int argc, char ** argv){
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
llama_ngram_cache ngram_cache_context;
llama_ngram_cache ngram_cache_dynamic;
llama_ngram_cache ngram_cache_static;
int64_t t_draft_flat_us = 0;
int64_t t_draft_us = 0;
{
// Fill up context ngram cache with tokens from user input:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
if (!params.lookup_cache_static.empty()) {
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}
if (!params.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
}
t_draft_flat_us += ggml_time_us() - t_start_draft_us;
}
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
@ -76,8 +106,6 @@ int main(int argc, char ** argv){
int n_drafted = 0;
int n_accept = 0;
int64_t t_draft_us = 0;
int n_past = inp.size();
bool has_eos = false;
@ -129,6 +157,12 @@ int main(int argc, char ** argv){
++n_past;
++i_dft;
inp.push_back(id);
{
// Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
if (params.use_color) {
// color accepted draft token
@ -149,6 +183,12 @@ int main(int argc, char ** argv){
draft.clear();
draft.push_back(id);
inp.push_back(id);
{
// Update context ngram cache with the newly accepted token:
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
break;
}
@ -163,44 +203,19 @@ int main(int argc, char ** argv){
llama_batch_clear(batch_tgt);
llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
// generate n_pred tokens through prompt lookup
auto prompt_lookup = [&]() -> void {
const int inp_size = inp.size();
for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
const llama_token * ngram = &inp[inp_size - ngram_size];
for (int i = 0; i <= (int) inp_size - (ngram_size * 2); ++i) {
bool match = true;
for (int j = 0; j < ngram_size; ++j) {
if (inp[i + j] != ngram[j]) {
match = false;
break;
}
}
if (match) {
const int startIdx = i + ngram_size;
const int endIdx = startIdx + n_draft;
if (endIdx < inp_size) {
for (int j = startIdx; j < endIdx; ++j) {
LOG(" - draft candidate %d: %d\n", j, inp[j]);
draft.push_back(inp[j]);
llama_batch_add(batch_tgt, inp[j], n_past + (j - startIdx) + 1, { 0 }, true);
++n_drafted;
}
return;
}
}
}
}
return;
};
// Draft already contains a single token sampled from the model:
GGML_ASSERT(draft.size() == 1);
GGML_ASSERT(draft[0] == inp.back());
const int64_t t_start_draft_us = ggml_time_us();
prompt_lookup();
llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
for (size_t i = 1; i < draft.size(); ++i) {
llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
}
t_draft_us += ggml_time_us() - t_start_draft_us;
n_drafted += draft.size() - 1;
llama_decode(ctx, batch_tgt);
++n_past;
@ -210,6 +225,10 @@ int main(int argc, char ** argv){
auto t_dec_end = ggml_time_us();
// Update dynamic ngram cache with context ngram cache and save it to disk:
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
LOG_TEE("\n\n");
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
@ -219,6 +238,7 @@ int main(int argc, char ** argv){
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_drafted = %d\n", n_drafted);
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
LOG_TEE("n_accept = %d\n", n_accept);

View file

@ -189,6 +189,18 @@ static void prepare_imatrix(const std::string& imatrix_file,
}
}
static ggml_type parse_ggml_type(const char * arg) {
ggml_type result = GGML_TYPE_COUNT;
for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
auto type = ggml_type(j);
const auto * name = ggml_type_name(type);
if (name && strcmp(arg, name) == 0) {
result = type; break;
}
}
return result;
}
int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
@ -203,6 +215,18 @@ int main(int argc, char ** argv) {
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
if (arg_idx < argc-1) {
params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
if (arg_idx < argc-1) {
params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
} else {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {

View file

@ -0,0 +1,5 @@
set(TARGET retrieval)
add_executable(${TARGET} retrieval.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -0,0 +1,69 @@
# llama.cpp/examples/retrieval
Demonstration of a simple retrieval technique based on cosine similarity.
More info:
https://github.com/ggerganov/llama.cpp/pull/6193
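Under the hood, ranking reduces to cosine similarity between the query embedding and each chunk embedding. A small self-contained sketch of that scoring step (not the example's actual code) might look like:

```cpp
#include <cmath>
#include <vector>

// Cosine similarity between two embedding vectors of equal length.
static float cosine_similarity(const std::vector<float> & a, const std::vector<float> & b) {
    float dot = 0.0f, norm_a = 0.0f, norm_b = 0.0f;
    for (size_t i = 0; i < a.size(); i++) {
        dot    += a[i] * b[i];
        norm_a += a[i] * a[i];
        norm_b += b[i] * b[i];
    }
    // small epsilon guards against division by zero for all-zero vectors
    return dot / (std::sqrt(norm_a) * std::sqrt(norm_b) + 1e-6f);
}
```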
### How to use
`retrieval.cpp` has parameters of its own:
- `--context-file`: file to be embedded - state this option multiple times to embed multiple files
- `--chunk-size`: minimum size of each text chunk to be embedded
- `--chunk-separator`: string used to divide chunks (default: newline)
The `retrieval` example can be tested as follows:
```bash
make -j && ./retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
```
This chunks and embeds all given files and starts a loop requesting query inputs:
```
Enter query:
```
On each query input, the top k most similar chunks are shown along with the file name, the chunk position within the file, and the original text:
```
Enter query: describe the mit license
batch_decode: n_tokens = 6, n_seq = 1
Top 3 similar chunks:
filename: README.md
filepos: 119
similarity: 0.762334
textdata:
png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[Roadmap](https://github.
--------------------
filename: License
filepos: 0
similarity: 0.725146
textdata:
MIT License
Copyright (c) 2023 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
--------------------
filename: README.md
filepos: 9178
similarity: 0.621722
textdata:
com/cztomsik/ava) (MIT)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [pythops/tenere](https://github.
--------------------
```

View file

@ -0,0 +1,350 @@
#include "common.h"
#include "llama.h"
#include <algorithm>
#include <fstream>
struct retrieval_params {
std::vector<std::string> context_files; // context files to embed
int32_t chunk_size = 64; // chunk size for context embedding
std::string chunk_separator = "\n"; // chunk separator for context embedding
};
static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
gpt_print_usage(argc, argv, gpt_params);
printf("retrieval options:\n");
printf(" --context-file FNAME file containing context to embed.\n");
printf(" specify multiple files by providing --context-file option multiple times.\n");
printf(" --chunk-size N minimum length of embedded text chunk (default:%d)\n", params.chunk_size);
printf(" --chunk-separator STRING\n");
printf(" string to separate chunks (default: \"\\n\")\n");
printf("\n");
}
static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) {
int i = 1;
std::string arg;
while (i < argc) {
arg = argv[i];
bool invalid_gpt_param = false;
if(gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) {
if (invalid_gpt_param) {
fprintf(stderr, "error: invalid argument: %s\n", arg.c_str());
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
// option was parsed by gpt_params_find_arg
} else if (arg == "--context-file") {
if (++i >= argc) {
fprintf(stderr, "error: missing argument for --context-file\n");
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
// store the external file name in params
retrieval_params.context_files.push_back(argv[i]);
} else if (arg == "--chunk-size") {
if (++i >= argc) {
fprintf(stderr, "error: missing argument for --chunk-size\n");
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
retrieval_params.chunk_size = std::stoi(argv[i]);
} else if (arg == "--chunk-separator") {
if (++i >= argc) {
fprintf(stderr, "error: missing argument for --chunk-separator\n");
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
retrieval_params.chunk_separator = argv[i];
} else {
// unknown argument
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
exit(1);
}
i++;
}
}
struct chunk {
// filename
std::string filename;
// original file position
size_t filepos;
// original text data
std::string textdata = "";
// tokenized text data
std::vector<llama_token> tokens;
// embedding
std::vector<float> embedding;
};
// chunk file data to chunks of size >= chunk_size
// chunk_separator is the separator between chunks
static std::vector<chunk> chunk_file(const std::string & filename, int chunk_size, const std::string & chunk_separator) {
std::vector<chunk> chunks;
std::ifstream f(filename.c_str());
if (!f.is_open()) {
fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
return chunks;
}
chunk current_chunk;
char buffer[1024];
int64_t filepos = 0;
std::string current = "";
while (f.read(buffer, 1024)) {
current += std::string(buffer, f.gcount());
size_t pos;
while ((pos = current.find(chunk_separator)) != std::string::npos) {
current_chunk.textdata += current.substr(0, pos + chunk_separator.size());
if ((int) current_chunk.textdata.size() > chunk_size) {
// save chunk
current_chunk.filepos = filepos;
current_chunk.filename = filename;
chunks.push_back(current_chunk);
// update filepos
filepos += (int) current_chunk.textdata.size();
// reset current_chunk
current_chunk = chunk();
}
current = current.substr(pos + chunk_separator.size());
}
}
// add leftover data to last chunk
if (current_chunk.textdata.size() > 0) {
if (chunks.empty()) {
current_chunk.filepos = filepos;
current_chunk.filename = filename;
chunks.push_back(current_chunk);
} else {
chunks.back().textdata += current_chunk.textdata;
}
}
f.close();
return chunks;
}
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
for (size_t i = 0; i < tokens.size(); i++) {
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
}
}
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
// clear previous kv_cache values (irrelevant for embeddings)
llama_kv_cache_clear(ctx);
// run model
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
if (llama_decode(ctx, batch) < 0) {
fprintf(stderr, "%s : failed to decode\n", __func__);
}
for (int i = 0; i < batch.n_tokens; i++) {
if (!batch.logits[i]) {
continue;
}
// try to get sequence embeddings - supported only when pooling_type is not NONE
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
if (embd == NULL) {
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
continue;
}
}
float * out = output + batch.seq_id[i][0] * n_embd;
llama_embd_normalize(embd, out, n_embd);
}
}
int main(int argc, char ** argv) {
gpt_params params;
retrieval_params retrieval_params;
retrieval_params_parse(argc, argv, params, retrieval_params);
// For BERT models, batch size must be equal to ubatch size
params.n_ubatch = params.n_batch;
if (retrieval_params.chunk_size <= 0) {
fprintf(stderr, "chunk_size must be positive\n");
return 1;
}
if (retrieval_params.context_files.empty()) {
fprintf(stderr, "context_files must be specified\n");
return 1;
}
params.embedding = true;
print_build_info();
printf("processing files:\n");
for (auto & context_file : retrieval_params.context_files) {
printf("%s\n", context_file.c_str());
}
std::vector<chunk> chunks;
for (auto & context_file : retrieval_params.context_files) {
std::vector<chunk> file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator);
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
}
printf("Number of chunks: %ld\n", chunks.size());
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model;
llama_context * ctx;
// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
if (n_ctx > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str());
}
// max batch size
const uint64_t n_batch = params.n_batch;
GGML_ASSERT(params.n_batch >= params.n_ctx);
// tokenize the prompts and trim
for (auto & chunk : chunks) {
auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
if (inp.size() > n_batch) {
fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
return 1;
}
// add eos if not present
if (inp.empty() || inp.back() != llama_token_eos(model)) {
inp.push_back(llama_token_eos(model));
}
chunk.tokens = inp;
}
// tokenization stats
if (params.verbose_prompt) {
for (int i = 0; i < (int) chunks.size(); i++) {
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
}
fprintf(stderr, "\n\n");
}
}
// initialize batch
const int n_chunks = chunks.size();
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
// allocate output
const int n_embd = llama_n_embd(model);
std::vector<float> embeddings(n_chunks * n_embd, 0);
float * emb = embeddings.data();
// break into batches
int p = 0; // number of prompts processed already
int s = 0; // number of prompts in current batch
for (int k = 0; k < n_chunks; k++) {
// clamp to n_batch tokens
auto & inp = chunks[k].tokens;
const uint64_t n_toks = inp.size();
// encode if at capacity
if (batch.n_tokens + n_toks > n_batch) {
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd);
llama_batch_clear(batch);
p += s;
s = 0;
}
// add to batch
batch_add_seq(batch, inp, s);
s += 1;
}
// final batch
float * out = emb + p * n_embd;
batch_decode(ctx, batch, out, s, n_embd);
// save embeddings to chunks
for (int i = 0; i < n_chunks; i++) {
chunks[i].embedding = std::vector<float>(emb + i * n_embd, emb + (i + 1) * n_embd);
// clear tokens as they are no longer needed
chunks[i].tokens.clear();
}
// start loop, receive query and return top k similar chunks based on cosine similarity
std::string query;
while (true) {
printf("Enter query: ");
std::getline(std::cin, query);
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
batch_add_seq(query_batch, query_tokens, 0);
std::vector<float> query_emb(n_embd, 0);
batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
llama_batch_clear(query_batch);
// compute cosine similarities
{
std::vector<std::pair<int, float>> similarities;
for (int i = 0; i < n_chunks; i++) {
float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
similarities.push_back(std::make_pair(i, sim));
}
// sort similarities
std::sort(similarities.begin(), similarities.end(), [](const std::pair<int, float> & a, const std::pair<int, float> & b) {
return a.second > b.second;
});
printf("Top %d similar chunks:\n", params.sparams.top_k);
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
printf("similarity: %f\n", similarities[i].second);
printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
printf("--------------------\n");
}
}
}
// clean up
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
}

View file

@ -16,17 +16,20 @@ The project is under active development, and we are [looking for feedback and co
**Command line options:**
- `--threads N`, `-t N`: Set the number of threads to use during generation.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. Because the server uses batching, this parameter only applies when a single token has to be processed on the CPU backend.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (default: unused).
- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository (default: unused).
- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused). See the combined example below.
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`.
- `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`.
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
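As an illustration of the options above, the server can either load a local model file or fetch one from Hugging Face with the new `-hfr`/`-hff` flags. A hypothetical invocation (repo and file names taken from the test scenarios later in this diff; other values are placeholders) could be:
```bash
# Sketch: start the server with a model pulled from a HF repo, using flags documented above.
./server \
    --hf-repo ggml-org/models \
    --hf-file tinyllamas/stories260K.gguf \
    -c 2048 -ngl 99 -t 8 --port 8080
```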
@ -57,7 +60,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
- `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled)
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, default: enabled.
- `--log-disable`: Output logs to stdout only, not to `llama.log`. default: enabled.
- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)
**If compiled with `LLAMA_SERVER_SSL=ON`**
@ -260,7 +263,7 @@ node index.js
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
`slot_id`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot (default: -1)
`id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot (default: -1)
`cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)
@ -357,7 +360,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc. can be used with this endpoint.
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
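For illustration, a minimal request to this endpoint might look as follows; the server address assumes the default port, and the `model` alias is taken from the test scenarios later in this diff (any string is accepted):
```bash
# Sketch: minimal OpenAI-compatible chat completion request against a local server.
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
      "model": "tinyllama-2",
      "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user",   "content": "Hello!"}
      ]
    }'
```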
*Options:*

File diff suppressed because it is too large

View file

@ -48,7 +48,7 @@ export class SchemaConverter {
}
_formatLiteral(literal) {
const escaped = JSON.stringify(literal).replace(
const escaped = literal.replace(
GRAMMAR_LITERAL_ESCAPE_RE,
m => GRAMMAR_LITERAL_ESCAPES[m]
);
@ -327,10 +327,7 @@ export class SchemaConverter {
}
_generateConstantRule(value) {
if (typeof value !== 'string') {
throw new Error('Only string constants are supported, got ' + JSON.stringify(value));
}
return this._formatLiteral(value);
return this._formatLiteral(JSON.stringify(value));
}
visit(schema, name) {
@ -346,9 +343,6 @@ export class SchemaConverter {
} else if (Array.isArray(schemaType)) {
return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
} else if ('const' in schema) {
if (typeof schema.const !== 'string') {
throw new Error('Only string constants are supported, got ' + JSON.stringify(schema.const));
}
return this._addRule(ruleName, this._generateConstantRule(schema.const));
} else if ('enum' in schema) {
const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
@ -457,7 +451,7 @@ export class SchemaConverter {
const propRuleName = this.visit(propSchema, `${name ?? ''}${name ? '-' : ''}${propName}`);
propKvRuleNames[propName] = this._addRule(
`${name ?? ''}${name ? '-' : ''}${propName}-kv`,
`${this._formatLiteral(propName)} space ":" space ${propRuleName}`
`${this._formatLiteral(JSON.stringify(propName))} space ":" space ${propRuleName}`
);
}
const requiredProps = sortedProps.filter(k => required.has(k));

View file

@ -30,7 +30,7 @@
#include <signal.h>
#include <memory>
using json = nlohmann::json;
using json = nlohmann::ordered_json;
bool server_verbose = false;
bool server_log_json = true;
@ -847,7 +847,14 @@ struct server_context {
slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot.params.n_keep = json_value(data, "n_keep", slot.params.n_keep);
slot.params.seed = json_value(data, "seed", default_params.seed);
if (data.contains("json_schema") && !data.contains("grammar")) {
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
// process "json_schema" and "grammar"
if (data.contains("json_schema") && data.contains("grammar")) {
send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
return false;
} else if (data.contains("json_schema") && !data.contains("grammar")) {
try {
auto schema = json_value(data, "json_schema", json::object());
slot.sparams.grammar = json_schema_to_grammar(schema);
@ -858,8 +865,6 @@ struct server_context {
} else {
slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
}
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
if (slot.params.cache_prompt && slot.ga_n != 1) {
LOG_WARNING("cache_prompt is not supported with group-attention", {});
@ -1247,7 +1252,7 @@ struct server_context {
{"penalize_nl", slot.sparams.penalize_nl},
{"stop", slot.params.antiprompt},
{"n_predict", slot.params.n_predict}, // TODO: fix duplicate key n_predict
{"n_keep", params.n_keep},
{"n_keep", slot.params.n_keep},
{"ignore_eos", ignore_eos},
{"stream", slot.params.stream},
{"logit_bias", slot.sparams.logit_bias},
@ -2208,7 +2213,11 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
printf(" model download url (default: %s)\n", params.model_url.c_str());
printf(" model download url (default: unused)\n");
printf(" -hfr REPO, --hf-repo REPO\n");
printf(" Hugging Face model repository (default: unused)\n");
printf(" -hff FILE, --hf-file FILE\n");
printf(" Hugging Face model file (default: unused)\n");
printf(" -a ALIAS, --alias ALIAS\n");
printf(" set an alias for the model, will be added as `model` field in completion response\n");
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
@ -2225,7 +2234,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
printf(" -spf FNAME, --system-prompt-file FNAME\n");
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" -ctk TYPE, --cache-type-k TYPE\n");
@ -2337,6 +2346,18 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
break;
}
params.model_url = argv[i];
} else if (arg == "-hfr" || arg == "--hf-repo") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.hf_repo = argv[i];
} else if (arg == "-hff" || arg == "--hf-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.hf_file = argv[i];
} else if (arg == "-a" || arg == "--alias") {
if (++i >= argc) {
invalid_param = true;

View file

@ -4,7 +4,8 @@ Feature: Parallel
Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
And a model file test-model-00001-of-00003.gguf
And 42 as server seed
And 128 as batch size
And 256 KV cache size

View file

@ -4,8 +4,8 @@ Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf
And a model file stories260K.gguf
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model file test-model.gguf
And a model alias tinyllama-2
And 42 as server seed
# KV Cache corresponds to the total amount of tokens

View file

@ -16,7 +16,6 @@ import numpy as np
import openai
from behave import step
from behave.api.async_step import async_run_until_complete
from huggingface_hub import hf_hub_download
from prometheus_client import parser
@ -39,6 +38,8 @@ def step_server_config(context, server_fqdn, server_port):
context.model_alias = None
context.model_file = None
context.model_hf_repo = None
context.model_hf_file = None
context.model_url = None
context.n_batch = None
context.n_ubatch = None
@ -68,9 +69,9 @@ def step_server_config(context, server_fqdn, server_port):
@step('a model file {hf_file} from HF repo {hf_repo}')
def step_download_hf_model(context, hf_file, hf_repo):
context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
if context.debug:
print(f"model file: {context.model_file}")
context.model_hf_repo = hf_repo
context.model_hf_file = hf_file
context.model_file = os.path.basename(hf_file)
@step('a model file {model_file}')
@ -1079,6 +1080,10 @@ def start_server_background(context):
server_args.extend(['--model', context.model_file])
if context.model_url:
server_args.extend(['--model-url', context.model_url])
if context.model_hf_repo:
server_args.extend(['--hf-repo', context.model_hf_repo])
if context.model_hf_file:
server_args.extend(['--hf-file', context.model_hf_file])
if context.n_batch:
server_args.extend(['--batch-size', context.n_batch])
if context.n_ubatch:

View file

@ -12,7 +12,7 @@
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
using json = nlohmann::json;
using json = nlohmann::ordered_json;
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
@ -95,8 +95,8 @@ static inline void server_log(const char *level, const char *function, int line,
const std::string str = ss.str();
printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout);
}
fflush(stdout);
}
//
@ -352,51 +352,71 @@ static json oaicompat_completion_params_parse(
// https://platform.openai.com/docs/api-reference/chat/create
llama_sampling_params default_sparams;
llama_params["model"] = json_value(body, "model", std::string("unknown"));
llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
llama_params["temperature"] = json_value(body, "temperature", 0.0);
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
llama_params["top_p"] = json_value(body, "top_p", 1.0);
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
llama_params["logit_bias"] = json_value(body, "logit_bias", json::object());
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
llama_params["logit_bias"] = json_value(body, "logit_bias", json::object());
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
llama_params["stream"] = json_value(body, "stream", false);
llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
llama_params["n_keep"] = json_value(body, "n_keep", 0);
llama_params["temperature"] = json_value(body, "temperature", 0.0);
llama_params["top_p"] = json_value(body, "top_p", 1.0);
if (body.contains("grammar")) {
llama_params["grammar"] = json_value(body, "grammar", json::object());
}
// Apply chat template to the list of messages
llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
if (body.contains("response_format")) {
auto response_format = json_value(body, "response_format", json::object());
if (response_format.contains("type")) {
if (response_format["type"] == "json_object") {
llama_params["json_schema"] = json_value(response_format, "schema", json::object());
} else {
throw std::runtime_error("response_format type not supported: " + response_format["type"].dump());
}
}
}
// Handle 'stop' field
// Handle "stop" field
if (body.contains("stop") && body["stop"].is_string()) {
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
} else {
llama_params["stop"] = json_value(body, "stop", json::array());
}
// Some chat templates don't use EOS token to stop generation
// We must add their end sequences to list of stop words
llama_params["stop"].push_back("<|im_end|>"); // chatml
llama_params["stop"].push_back("<end_of_turn>"); // gemma
// Ensure there is ChatML-specific end sequence among stop words
llama_params["stop"].push_back("<|im_end|>");
// Handle "response_format" field
if (body.contains("response_format")) {
json response_format = json_value(body, "response_format", json::object());
std::string response_type = json_value(response_format, "type", std::string());
if (response_type == "json_object") {
llama_params["json_schema"] = json_value(response_format, "schema", json::object());
} else if (!response_type.empty() && response_type != "text") {
throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
}
}
// Handle "n" field
int n_choices = json_value(body, "n", 1);
if (n_choices != 1) {
throw std::runtime_error("Only one completion choice is allowed");
}
// Handle "logprobs" field
// TODO: The response format of this option is not yet OAI-compatible, but it seems like no one is really using it; we may need to fix it in the future
if (body.contains("logprobs")) {
llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
} else if (body.contains("top_logprobs")) {
throw std::runtime_error("top_logprobs requires logprobs to be set to true");
}
// Params supported by OAI but unsupported by llama.cpp
static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
for (auto & param : unsupported_params) {
if (body.contains(param)) {
throw std::runtime_error("Unsupported param: " + param);
}
}
// Copy remaining properties to llama_params
// This allows the user to pass llama.cpp-specific params like "mirostat", "tfs_z", ... via the OAI endpoint.
// See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
for (const auto & item : body.items()) {
// Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
llama_params[item.key()] = item.value();
}
}
return llama_params;
}
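Given the parsing logic above, a request exercising the new `response_format` handling might look like the sketch below. The endpoint, port, and schema are assumptions for illustration; `json_object` (optionally with a `schema`) is the only non-`text` type accepted.
```bash
# Sketch: ask the OAI-compatible endpoint for JSON output constrained by a schema.
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
      "messages": [{"role": "user", "content": "List three colors."}],
      "response_format": {
        "type": "json_object",
        "schema": {"type": "array", "items": {"type": "string"}}
      }
    }'
```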

View file

@ -219,7 +219,8 @@ int main(int argc, char ** argv) {
if (params.sparams.temp > 0) {
// stochastic verification
llama_token_data_array dist_tgt = llama_sampling_probability_distribution(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL);
llama_sample_softmax(ctx_tgt, &dist_tgt);
float p_tgt = 0, p_dft = 0;
// GGML_ASSERT(dist_tgt.size() == dist_dft.size());

View file

@ -3,9 +3,13 @@
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT
mkdir -p build
IF not exist build (mkdir build)
cd build
if %errorlevel% neq 0 goto ERROR
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
if %errorlevel% neq 0 goto ERROR
:: for FP16
:: faster for long-prompt inference
@ -13,11 +17,18 @@ cd build
:: for FP32
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
if %errorlevel% neq 0 goto ERROR
:: build example/main only
:: make main
:: build all binary
make -j
if %errorlevel% neq 0 goto ERROR
cd ..
exit /B 0
:ERROR
echo command error: %errorlevel%
exit /B %errorlevel%

View file

@ -771,7 +771,11 @@ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t
if (src_ctx->device == dst_ctx->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
} else {
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
#else
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
#endif
}
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
return true;
@ -11322,19 +11326,23 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
} else {
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
#else
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
#endif
}
// record event on src stream
if (!cuda_ctx_src->copy_event) {
ggml_cuda_set_device(cuda_ctx_src->device);
CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
}
// copy on src stream
if (cuda_ctx_src->device == cuda_ctx_dst->device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
} else {
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
}
// record event on src stream
CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));
// wait on dst stream for the copy to complete
@ -11530,6 +11538,9 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const
}
static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
#ifdef GGML_CUDA_NO_PEER_COPY
return nullptr;
#else
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_device(cuda_ctx->device);
@ -11541,6 +11552,7 @@ static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend)
/* .backend = */ backend,
/* .context = */ event,
};
#endif
}
static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
@ -11646,7 +11658,7 @@ GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, si
}
GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
return false;
}
@ -11663,6 +11675,10 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
}
GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
if (getenv("GGML_CUDA_REGISTER_HOST") == nullptr) {
return;
}
cudaError_t err = cudaHostUnregister(buffer);
if (err != cudaSuccess) {
// clear the error
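As the two checks above show, host-buffer registration for CUDA is now opt-in via an environment variable rather than opt-out. A hypothetical way to enable it for any example binary would be the following (model path and prompt are placeholders):
```bash
# Sketch: opt in to pinned (registered) host buffers for the CUDA backend.
# After this change the feature is disabled unless GGML_CUDA_REGISTER_HOST is set.
GGML_CUDA_REGISTER_HOST=1 ./main -m ./models/7B/ggml-model-q4_0.gguf -p "Hello" -ngl 99
```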

View file

@ -1392,6 +1392,14 @@ static enum ggml_status ggml_metal_graph_compute(
(ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
//printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
// some Metal matrix data types require aligned pointers
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
switch (src0->type) {
case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break;
case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break;
default: break;
}
id<MTLComputePipelineState> pipeline = nil;
switch (src0->type) {
@ -1706,6 +1714,14 @@ static enum ggml_status ggml_metal_graph_compute(
ne20 % 32 == 0 && ne20 >= 64 &&
ne11 > ne11_mm_min) {
// some Metal matrix data types require aligned pointers
// ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
switch (src0->type) {
case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break;
case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break;
default: break;
}
id<MTLComputePipelineState> pipeline = nil;
switch (src2->type) {

View file

@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
}
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
#if __AVXVNNI__
#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
const __m256i zero = _mm256_setzero_si256();
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
return _mm256_cvtepi32_ps(summed_pairs);

View file

@ -740,11 +740,7 @@ namespace dpct
sycl::queue &default_queue()
{
#ifdef DPCT_USM_LEVEL_NONE
return out_of_order_queue();
#else
return in_order_queue();
#endif // DPCT_USM_LEVEL_NONE
}
void queues_wait_and_throw()
@ -763,11 +759,7 @@ namespace dpct
sycl::queue *create_queue(bool enable_exception_handler = false)
{
#ifdef DPCT_USM_LEVEL_NONE
return create_out_of_order_queue(enable_exception_handler);
#else
return create_in_order_queue(enable_exception_handler);
#endif // DPCT_USM_LEVEL_NONE
}
sycl::queue *create_queue(sycl::context context, sycl::device device,
@ -1075,11 +1067,6 @@ namespace dpct
static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
const void *ptr)
{
#ifdef DPCT_USM_LEVEL_NONE
return mem_mgr::instance().is_device_ptr(ptr)
? pointer_access_attribute::device_only
: pointer_access_attribute::host_only;
#else
switch (sycl::get_pointer_type(ptr, q.get_context()))
{
case sycl::usm::alloc::unknown:
@ -1090,7 +1077,6 @@ namespace dpct
case sycl::usm::alloc::host:
return pointer_access_attribute::host_device;
}
#endif
}
template <typename ArgT>
@ -1273,11 +1259,7 @@ namespace dpct
static inline void *dpct_malloc(size_t size, sycl::queue &q)
{
#ifdef DPCT_USM_LEVEL_NONE
return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
#else
return sycl::malloc_device(size, q.get_device(), q.get_context());
#endif // DPCT_USM_LEVEL_NONE
}
#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
@ -1301,25 +1283,7 @@ namespace dpct
static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
valueT value, size_t size)
{
#ifdef DPCT_USM_LEVEL_NONE
auto &mm = mem_mgr::instance();
assert(mm.is_device_ptr(dev_ptr));
auto alloc = mm.translate_ptr(dev_ptr);
size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
return q.submit([&](sycl::handler &cgh)
{
auto r = sycl::range<1>(size);
auto o = sycl::id<1>(offset);
auto new_buffer = alloc.buffer.reinterpret<valueT>(
sycl::range<1>(alloc.size / sizeof(valueT)));
sycl::accessor<valueT, 1, sycl::access_mode::write,
sycl::access::target::device>
acc(new_buffer, cgh, r, o);
cgh.fill(acc, value); });
#else
return q.fill(dev_ptr, value, size);
#endif // DPCT_USM_LEVEL_NONE
}
/**
@ -1413,72 +1377,8 @@ namespace dpct
{
if (!size)
return sycl::event{};
#ifdef DPCT_USM_LEVEL_NONE
auto &mm = mem_mgr::instance();
auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
switch (real_direction)
{
case host_to_host:
return q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(dep_events);
cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
case host_to_device:
{
auto alloc = mm.translate_ptr(to_ptr);
size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
return q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(dep_events);
auto r = sycl::range<1>(size);
auto o = sycl::id<1>(offset);
sycl::accessor<byte_t, 1, sycl::access_mode::write,
sycl::access::target::device>
acc(alloc.buffer, cgh, r, o);
cgh.copy(from_ptr, acc); });
}
case device_to_host:
{
auto alloc = mm.translate_ptr(from_ptr);
size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
return q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(dep_events);
auto r = sycl::range<1>(size);
auto o = sycl::id<1>(offset);
sycl::accessor<byte_t, 1, sycl::access_mode::read,
sycl::access::target::device>
acc(alloc.buffer, cgh, r, o);
cgh.copy(acc, to_ptr); });
}
case device_to_device:
{
auto to_alloc = mm.translate_ptr(to_ptr);
auto from_alloc = mm.translate_ptr(from_ptr);
size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
return q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(dep_events);
auto r = sycl::range<1>(size);
auto to_o = sycl::id<1>(to_offset);
auto from_o = sycl::id<1>(from_offset);
sycl::accessor<byte_t, 1, sycl::access_mode::write,
sycl::access::target::device>
to_acc(to_alloc.buffer, cgh, r, to_o);
sycl::accessor<byte_t, 1, sycl::access_mode::read,
sycl::access::target::device>
from_acc(from_alloc.buffer, cgh, r, from_o);
cgh.copy(from_acc, to_acc); });
}
default:
throw std::runtime_error("dpct_memcpy: invalid direction value");
}
#else
return q.memcpy(to_ptr, from_ptr, size, dep_events);
GGML_UNUSED(direction);
#endif // DPCT_USM_LEVEL_NONE
}
// Get actual copy range and make sure it will not exceed range.
@ -1618,36 +1518,7 @@ namespace dpct
break;
}
case device_to_device:
#ifdef DPCT_USM_LEVEL_NONE
{
auto &mm = mem_mgr::instance();
auto to_alloc = mm.translate_ptr(to_surface);
auto from_alloc = mm.translate_ptr(from_surface);
size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
event_list.push_back(q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(dep_events);
auto to_o = sycl::id<1>(to_offset);
auto from_o = sycl::id<1>(from_offset);
sycl::accessor<byte_t, 1, sycl::access_mode::write,
sycl::access::target::device>
to_acc(to_alloc.buffer, cgh,
get_copy_range(size, to_slice, to_range.get(0)), to_o);
sycl::accessor<byte_t, 1, sycl::access_mode::read,
sycl::access::target::device>
from_acc(from_alloc.buffer, cgh,
get_copy_range(size, from_slice, from_range.get(0)), from_o);
cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
size,
[=](sycl::id<3> id) {
to_acc[get_offset(id, to_slice, to_range.get(0))] =
from_acc[get_offset(id, from_slice, from_range.get(0))];
}); }));
}
#else
event_list.push_back(q.submit([&](sycl::handler &cgh)
{
event_list.push_back(q.submit([&](sycl::handler &cgh){
cgh.depends_on(dep_events);
cgh.parallel_for<class dpct_memcpy_3d_detail>(
size,
@ -1655,7 +1526,6 @@ namespace dpct
to_surface[get_offset(id, to_slice, to_range.get(0))] =
from_surface[get_offset(id, from_slice, from_range.get(0))];
}); }));
#endif
break;
default:
throw std::runtime_error("dpct_memcpy: invalid direction value");
@ -1754,11 +1624,7 @@ namespace dpct
{
if (ptr)
{
#ifdef DPCT_USM_LEVEL_NONE
detail::mem_mgr::instance().mem_free(ptr);
#else
sycl::free(ptr, q.get_context());
#endif // DPCT_USM_LEVEL_NONE
}
}
@ -1766,11 +1632,7 @@ namespace dpct
inline auto get_memory(const void *x)
{
T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
#ifdef DPCT_USM_LEVEL_NONE
return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
#else
return new_x;
#endif
}
template <typename T>
@ -2222,72 +2084,8 @@ namespace dpct
{
if (!size)
return sycl::event{};
#ifdef DPCT_USM_LEVEL_NONE
auto &mm = mem_mgr::instance();
auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
switch (real_direction)
{
case host_to_host:
return q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(dep_events);
cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
case host_to_device:
{
auto alloc = mm.translate_ptr(to_ptr);
size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
return q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(dep_events);
auto r = sycl::range<1>(size);
auto o = sycl::id<1>(offset);
sycl::accessor<byte_t, 1, sycl::access_mode::write,
sycl::access::target::device>
acc(alloc.buffer, cgh, r, o);
cgh.copy(from_ptr, acc); });
}
case device_to_host:
{
auto alloc = mm.translate_ptr(from_ptr);
size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
return q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(dep_events);
auto r = sycl::range<1>(size);
auto o = sycl::id<1>(offset);
sycl::accessor<byte_t, 1, sycl::access_mode::read,
sycl::access::target::device>
acc(alloc.buffer, cgh, r, o);
cgh.copy(acc, to_ptr); });
}
case device_to_device:
{
auto to_alloc = mm.translate_ptr(to_ptr);
auto from_alloc = mm.translate_ptr(from_ptr);
size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
return q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(dep_events);
auto r = sycl::range<1>(size);
auto to_o = sycl::id<1>(to_offset);
auto from_o = sycl::id<1>(from_offset);
sycl::accessor<byte_t, 1, sycl::access_mode::write,
sycl::access::target::device>
to_acc(to_alloc.buffer, cgh, r, to_o);
sycl::accessor<byte_t, 1, sycl::access_mode::read,
sycl::access::target::device>
from_acc(from_alloc.buffer, cgh, r, from_o);
cgh.copy(from_acc, to_acc); });
}
default:
throw std::runtime_error("dpct_memcpy: invalid direction value");
}
#else
return q.memcpy(to_ptr, from_ptr, size, dep_events);
GGML_UNUSED(direction);
#endif // DPCT_USM_LEVEL_NONE
}
// Get actual copy range and make sure it will not exceed range.
@ -2427,34 +2225,6 @@ namespace dpct
break;
}
case device_to_device:
#ifdef DPCT_USM_LEVEL_NONE
{
auto &mm = mem_mgr::instance();
auto to_alloc = mm.translate_ptr(to_surface);
auto from_alloc = mm.translate_ptr(from_surface);
size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
event_list.push_back(q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(dep_events);
auto to_o = sycl::id<1>(to_offset);
auto from_o = sycl::id<1>(from_offset);
sycl::accessor<byte_t, 1, sycl::access_mode::write,
sycl::access::target::device>
to_acc(to_alloc.buffer, cgh,
get_copy_range(size, to_slice, to_range.get(0)), to_o);
sycl::accessor<byte_t, 1, sycl::access_mode::read,
sycl::access::target::device>
from_acc(from_alloc.buffer, cgh,
get_copy_range(size, from_slice, from_range.get(0)), from_o);
cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
size,
[=](sycl::id<3> id) {
to_acc[get_offset(id, to_slice, to_range.get(0))] =
from_acc[get_offset(id, from_slice, from_range.get(0))];
}); }));
}
#else
event_list.push_back(q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(dep_events);
@ -2464,7 +2234,6 @@ namespace dpct
to_surface[get_offset(id, to_slice, to_range.get(0))] =
from_surface[get_offset(id, from_slice, from_range.get(0))];
}); }));
#endif
break;
default:
throw std::runtime_error("dpct_memcpy: invalid direction value");
@ -2655,9 +2424,6 @@ namespace dpct
void *c[], library_data_t c_type, int ldc,
int batch_size, library_data_t scaling_type)
{
#ifdef DPCT_USM_LEVEL_NONE
throw std::runtime_error("this API is unsupported when USM level is none");
#else
if (scaling_type == library_data_t::real_float &&
c_type == library_data_t::complex_float)
{
@ -2792,7 +2558,6 @@ namespace dpct
default:
throw std::runtime_error("the combination of data type is unsupported");
}
#endif
}
/// Computes a batch of matrix-matrix product with general matrices.
@ -3131,24 +2896,9 @@ namespace dpct
template <size_t D = Dimension>
typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
init();
#ifdef DPCT_USM_LEVEL_NONE
return dpct::get_buffer<typename std::enable_if<D == 1, T>::type>(
_device_ptr)
.template get_access<sycl::access_mode::read_write>()[index];
#else
return _device_ptr[index];
#endif // DPCT_USM_LEVEL_NONE
}
#ifdef DPCT_USM_LEVEL_NONE
/// Get sycl::accessor for the device memory object when usm is not used.
accessor_t get_access(sycl::handler &cgh) {
return get_buffer(_device_ptr)
.template reinterpret<T, Dimension>(_range)
.template get_access<detail::memory_traits<Memory, T>::mode,
detail::memory_traits<Memory, T>::target>(cgh);
}
#else
/// Get dpct::accessor with dimension info for the device memory object
/// when usm is used and dimension is greater than 1.
template <size_t D = Dimension>
@ -3156,7 +2906,6 @@ namespace dpct
get_access(sycl::handler &cgh) {
return dpct_accessor_t((T *)_device_ptr, _range);
}
#endif // DPCT_USM_LEVEL_NONE
private:
device_memory(value_t *memory_ptr, size_t size)
@ -3201,15 +2950,6 @@ namespace dpct
/// Default constructor
device_memory() : base(1) {}
#ifdef DPCT_USM_LEVEL_NONE
/// Get sycl::accessor for the device memory object when usm is not used.
accessor_t get_access(sycl::handler &cgh) {
auto buf = get_buffer(base::get_ptr())
.template reinterpret<T, 1>(sycl::range<1>(1));
return accessor_t(buf, cgh);
}
#endif // DPCT_USM_LEVEL_NONE
};
} // namespace detail
@ -13181,7 +12921,7 @@ int get_work_group_size(int user_device_id) {
return prop.get_max_work_group_size();
}
void ggml_init_sycl() try {
static void ggml_init_sycl() try {
static bool initialized = false;
if (!initialized) {
@ -16677,6 +16417,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
};
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
ggml_init_sycl();
if (device_index>=g_device_count or device_index<0) {
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
device_index, g_device_count-1);
@ -17046,6 +16787,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
};
GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
ggml_init_sycl();
// FIXME: this is not thread safe
static std::map<std::array<float, GGML_SYCL_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
@ -17379,6 +17121,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
UNUSED(backend);
}
GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
const int min_batch_size = 32;
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
GGML_UNUSED(backend);
}
static ggml_backend_i ggml_backend_sycl_interface = {
/* .get_name = */ ggml_backend_sycl_name,
/* .free = */ ggml_backend_sycl_free,
@ -17392,7 +17141,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
/* .supports_op = */ ggml_backend_sycl_supports_op,
/* .offload_op = */ NULL,
/* .offload_op = */ ggml_backend_sycl_offload_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,
@ -17406,7 +17155,7 @@ static ggml_guid_t ggml_backend_sycl_guid() {
}
GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
ggml_init_sycl(); // TODO: remove from ggml.c
ggml_init_sycl();
check_allow_gpu_index(device);

View file

@ -16,16 +16,22 @@ extern "C" {
#define GGML_SYCL_MAX_DEVICES 48
#define GGML_SYCL_NAME "SYCL"
GGML_API void ggml_init_sycl(void);
GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
// backend API
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
// device buffer
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
@ -34,6 +40,10 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
// SYCL doesn't support registering host memory, keep here for reference
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
#ifdef __cplusplus
}
#endif

ggml.c
View file

@ -3,6 +3,7 @@
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "ggml.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
@ -43,6 +44,10 @@
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
typedef volatile LONG atomic_int;
@ -286,8 +291,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
#include "ggml-opencl.h"
#elif defined(GGML_USE_VULKAN)
#include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
#include "ggml-sycl.h"
#endif
// floating point type used to accumulate sums
@ -430,6 +433,57 @@ int64_t ggml_cycles_per_ms(void) {
#define ggml_perf_cycles_per_ms() 0
#endif
//
// cross-platform UTF-8 file paths
//
#ifdef _WIN32
static wchar_t * ggml_mbstowcs(const char * mbs) {
int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
if (!wlen) {
errno = EINVAL;
return NULL;
}
wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
if (!wlen) {
GGML_FREE(wbuf);
errno = EINVAL;
return NULL;
}
return wbuf;
}
#endif
FILE * ggml_fopen(const char * fname, const char * mode) {
#ifdef _WIN32
FILE * file = NULL;
// convert fname (UTF-8)
wchar_t * wfname = ggml_mbstowcs(fname);
if (wfname) {
// convert mode (ANSI)
wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
wchar_t * wmode_p = wmode;
do {
*wmode_p++ = (wchar_t)*mode;
} while (*mode++);
// open file
file = _wfopen(wfname, wmode);
GGML_FREE(wfname);
GGML_FREE(wmode);
}
return file;
#else
return fopen(fname, mode);
#endif
}
//
// cache line
//
@ -2642,8 +2696,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
ggml_cl_init();
#elif defined(GGML_USE_VULKAN)
ggml_vk_init_cpu_assist();
#elif defined(GGML_USE_SYCL)
ggml_init_sycl();
#endif
ggml_setup_op_has_task_pass();
@ -16059,12 +16111,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
#endif // GGML_USE_VULKAN
#ifdef GGML_USE_SYCL
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
if (skip_cpu) {
return;
}
#endif // GGML_USE_SYCL
switch (tensor->op) {
case GGML_OP_DUP:
{
@ -18739,7 +18785,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
// write binary data
{
FILE * fout = fopen(fname, "wb");
FILE * fout = ggml_fopen(fname, "wb");
if (!fout) {
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
@ -18877,7 +18923,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
// read file into data
{
FILE * fin = fopen(fname, "rb");
FILE * fin = ggml_fopen(fname, "rb");
if (!fin) {
fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
return result;
@ -19213,7 +19259,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node,
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
char color[16];
FILE * fp = fopen(filename, "w");
FILE * fp = ggml_fopen(filename, "w");
GGML_ASSERT(fp);
fprintf(fp, "digraph G {\n");
@ -20531,7 +20577,7 @@ struct gguf_context * gguf_init_empty(void) {
}
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
FILE * file = fopen(fname, "rb");
FILE * file = ggml_fopen(fname, "rb");
if (!file) {
return NULL;
}
@ -21486,7 +21532,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
}
void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
FILE * file = fopen(fname, "wb");
FILE * file = ggml_fopen(fname, "wb");
if (!file) {
GGML_ASSERT(false && "failed to open file for writing");
}

8
ggml.h
View file

@ -214,9 +214,10 @@
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 1
@ -708,6 +709,9 @@ extern "C" {
GGML_API void ggml_print_backtrace(void);
// accepts a UTF-8 path, even on Windows
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

View file

@ -100,6 +100,7 @@ class MODEL_ARCH(IntEnum):
LLAMA = auto()
FALCON = auto()
BAICHUAN = auto()
GROK = auto()
GPT2 = auto()
GPTJ = auto()
GPTNEOX = auto()
@ -167,6 +168,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.BAICHUAN: "baichuan",
MODEL_ARCH.GROK: "grok",
MODEL_ARCH.GPT2: "gpt2",
MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.GPTNEOX: "gptneox",
@ -251,6 +253,28 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.GROK: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.ATTN_OUT_NORM,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.LAYER_OUT_NORM,
],
MODEL_ARCH.GPTNEOX: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,

View file

@ -23,6 +23,7 @@ class TensorNameMap:
"model.embedding", # mamba-qbert
"backbone.embedding", # mamba
"backbone.embeddings", # mamba-hf
"transformer.in_out_embed", # Grok
),
# Token type embeddings
@ -66,6 +67,7 @@ class TensorNameMap:
"lm_head.ln", # phi2
"model.norm_f", # mamba-qbert
"backbone.norm_f", # mamba
"transformer.rms_norm", # Grok
),
# Rope frequencies
@ -93,6 +95,7 @@ class TensorNameMap:
"model.layers.{bid}.attention_norm", # internlm2
"model.layers.{bid}.norm", # mamba-qbert
"backbone.layers.{bid}.norm", # mamba
"transformer.decoder_layer.{bid}.rms_norm", # Grok
),
# Attention norm 2
@ -121,7 +124,8 @@ class TensorNameMap:
"encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
"model.layers.{bid}.attention.wq" # internlm2
"model.layers.{bid}.attention.wq", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok
),
# Attention key
@ -131,7 +135,8 @@ class TensorNameMap:
"encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
"model.layers.{bid}.attention.wk" # internlm2
"model.layers.{bid}.attention.wk", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok
),
# Attention value
@ -141,7 +146,8 @@ class TensorNameMap:
"encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
"model.layers.{bid}.attention.wv" # internlm2
"model.layers.{bid}.attention.wv", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
),
# Attention output
@ -162,12 +168,14 @@ class TensorNameMap:
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
"model.layers.{bid}.attention.wo", # internlm2
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
"transformer.decoder_layer.{bid}.multi_head_attention.linear"# Grok
),
# Attention output norm
MODEL_TENSOR.ATTN_OUT_NORM: (
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
"encoder.layers.{bid}.norm1", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok
),
# Rotary embeddings
@ -190,11 +198,13 @@ class TensorNameMap:
"model.layers.{bid}.ln2", # yi
"h.{bid}.ln_2", # gpt2
"model.layers.{bid}.ffn_norm", # internlm2
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
),
MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
"transformer.decoder_layer.{bid}.router" # Grok
),
# Feed-forward up
@ -223,6 +233,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
"transformer.decoder_layer.{bid}.moe.{xid}.linear_v", # Grok
),
# AWQ-activation gate
@ -243,6 +254,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
"transformer.decoder_layer.{bid}.moe.{xid}.linear" # Grok
),
# Feed-forward down
@ -270,6 +282,8 @@ class TensorNameMap:
MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
"transformer.decoder_layer.{bid}.moe.{xid}.linear_1", # Grok
),
MODEL_TENSOR.ATTN_Q_NORM: (
@ -289,6 +303,7 @@ class TensorNameMap:
MODEL_TENSOR.LAYER_OUT_NORM: (
"encoder.layer.{bid}.output.LayerNorm", # bert
"encoder.layers.{bid}.norm2", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok
),
MODEL_TENSOR.SSM_IN: (

785
llama.cpp

File diff suppressed because it is too large

12
llama.h
View file

@ -277,6 +277,8 @@ extern "C" {
typedef struct llama_model_quantize_params {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
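
A hedged sketch of how the two new fields might be set when quantizing, assuming the existing llama_model_quantize_default_params() / llama_model_quantize() entry points; file names and type choices are placeholders.

llama_model_quantize_params qparams = llama_model_quantize_default_params();
qparams.ftype                = LLAMA_FTYPE_MOSTLY_Q4_K_M;
qparams.output_tensor_type   = GGML_TYPE_Q6_K;  // keep output.weight at higher precision
qparams.token_embedding_type = GGML_TYPE_Q8_0;  // and the token embeddings as well
llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams);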
@ -960,6 +962,16 @@ extern "C" {
int32_t n_past,
int32_t n_predict);
/// @details Build a split GGUF final path for this chunk.
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
// Returns the split_path length.
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
/// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
/// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
// Returns the split_prefix length.
LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
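
A short sketch mirroring the examples in the comments above, round-tripping a split path; buffer sizes are arbitrary.

char split_path[512];
llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
// split_path is now "/models/ggml-model-q4_0-00002-of-00004.gguf"

char split_prefix[512];
if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
    // split_prefix is now "/models/ggml-model-q4_0"
}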
// Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);

10
scripts/get-wikitext-103.sh Executable file
View file

@ -0,0 +1,10 @@
#!/bin/bash
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
echo "Usage:"
echo ""
echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]"
echo ""
exit 0

View file

@ -2091,6 +2091,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
}
}
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
for (ggml_type type_a : all_types) {
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
for (int n_mats : {2, 4, 8}) {

View file

@ -90,7 +90,7 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
test({
FAILURE,
"invalid type type",
"invalid type",
R"""({
"type": 123
})""",
@ -193,21 +193,27 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
});
test({
FAILURE,
SUCCESS,
"non-string const",
R"""({
"const": 123
})""",
""
R"""(
root ::= "123"
space ::= " "?
)"""
});
test({
FAILURE,
SUCCESS,
"non-string enum",
R"""({
"enum": [123]
"enum": ["red", "amber", "green", null, 42, ["foo"]]
})""",
""
R"""(
root ::= "\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]"
space ::= " "?
)"""
});
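
For reference, a hedged sketch of driving the converter directly, as the C++ test harness below does; it assumes the json_schema_to_grammar() helper and nlohmann::ordered_json used by that test.

// non-string enum values now compile to literal alternatives, per the expectation above
nlohmann::ordered_json schema = nlohmann::ordered_json::parse(
    R"""({ "enum": ["red", "amber", "green", null, 42, ["foo"]] })""");
std::string grammar = json_schema_to_grammar(schema);
// grammar now contains: root ::= "\"red\"" | "\"amber\"" | ... (see expected output above)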
test({
@ -378,20 +384,18 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
test({
SUCCESS,
"required props",
"required props in original order",
R"""({
"type": "object",
"properties": {
"a": {
"type": "string"
},
"b": {
"type": "string"
}
"b": {"type": "string"},
"c": {"type": "string"},
"a": {"type": "string"}
},
"required": [
"a",
"b"
"b",
"c"
],
"additionalProperties": false,
"definitions": {}
@ -399,7 +403,8 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
R"""(
a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string
root ::= "{" space a-kv "," space b-kv "}" space
c-kv ::= "\"c\"" space ":" space string
root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
@ -458,13 +463,13 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
test({
SUCCESS,
"required + optional props",
"required + optional props each in original order",
R"""({
"properties": {
"a": {"type": "string"},
"b": {"type": "string"},
"c": {"type": "string"},
"d": {"type": "string"}
"a": {"type": "string"},
"d": {"type": "string"},
"c": {"type": "string"}
},
"required": ["a", "b"],
"additionalProperties": false
@ -473,9 +478,9 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
a-kv ::= "\"a\"" space ":" space string
b-kv ::= "\"b\"" space ":" space string
c-kv ::= "\"c\"" space ":" space string
c-rest ::= ( "," space d-kv )?
d-kv ::= "\"d\"" space ":" space string
root ::= "{" space a-kv "," space b-kv ( "," space ( c-kv c-rest | d-kv ) )? "}" space
d-rest ::= ( "," space c-kv )?
root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space
space ::= " "?
string ::= "\"" (
[^"\\] |
@ -794,27 +799,40 @@ static void test_all(const std::string & lang, std::function<void(const TestCase
}
int main() {
fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false");
fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false");
test_all("C++", [](const TestCase & tc) {
try {
tc.verify(json_schema_to_grammar(nlohmann::json::parse(tc.schema)));
tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema)));
tc.verify_status(SUCCESS);
} catch (const std::runtime_error & ex) {
fprintf(stderr, "Error: %s\n", ex.what());
tc.verify_status(FAILURE);
}
});
if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python --version") == 0)) {
test_all("Python", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
"python ./examples/json-schema-to-grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp"));
});
} else {
fprintf(stderr, "\033[33mWARNING: Python not found, skipping Python JSON schema -> grammar tests.\n\033[0m");
}
if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) {
test_all("JavaScript", [](const TestCase & tc) {
write("test-json-schema-input.tmp", tc.schema);
tc.verify_status(std::system(
"node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE);
tc.verify(read("test-grammar-output.tmp"));
});
} else {
fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m");
}
test_all("Check Expectations Validity", [](const TestCase & tc) {
if (tc.expected_status == SUCCESS) {