Merge upstream changes, fix conflict

2023-10-14 10:51:53 +02:00 · 2023-10-14 10:51:53 +02:00 · 35b10d149f
commit 35b10d149f
parent bd054470b9 2a4bcbacea
105 changed files with 22879 additions and 4275 deletions
--- a/.dockerignore
+++ b/.dockerignore
@ -1,6 +1,9 @@
 *.o
 *.a
 .cache/
 .git/
 .github/
 .gitignore
 .vs/
 .vscode/
 .DS_Store
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -10,10 +10,10 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
 env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@ -188,7 +188,7 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF ..
+          cmake ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
      - name: Test
@ -253,6 +253,34 @@ jobs:
            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
  macOS-latest-swift:
    runs-on: macos-latest
    strategy:
      matrix:
        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1
      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update
      - name: xcodebuild for swift package
        id: xcodebuild
        run: |
          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
      - name: Build Swift Example
        id: make_build_swift_example
        run: |
            make swift
  windows-latest-cmake:
    runs-on: windows-latest
@ -265,17 +293,17 @@ jobs:
      matrix:
        include:
          - build: 'noavx'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx2'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'avx'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
          - build: 'avx512'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'clblast'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
          - build: 'openblas'
-            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
    steps:
      - name: Clone
@ -414,7 +442,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
+          cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
      - name: Determine tag name
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@ -36,8 +36,9 @@ jobs:
        poetry install
    - name: Build package
-      run: poetry build
+      run: cd gguf-py && poetry build
    - name: Publish package
      uses: pypa/gh-action-pypi-publish@release/v1
      with:
        password: ${{ secrets.PYPI_API_TOKEN }}
        packages-dir: gguf-py/dist
--- a/.github/workflows/zig-build.yml
+++ b/.github/workflows/zig-build.yml
@ -0,0 +1,25 @@
 name: Zig CI
 on:
  pull_request:
  push:
    branches:
      - master
 jobs:
  build:
    strategy:
      fail-fast: false
      matrix:
        runs-on: [ubuntu-latest, macos-latest, windows-latest]
    runs-on: ${{ matrix.runs-on }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: recursive
          fetch-depth: 0
      - uses: goto-bus-stop/setup-zig@v2
        with:
          version: 0.11.0
      - name: Build Summary
        run: zig build --summary all -freference-trace
--- a/.gitignore
+++ b/.gitignore
@ -10,6 +10,7 @@
 *.gcno
 *.gcda
 *.dot
 *.metallib
 .DS_Store
 .build/
 .cache/
@ -40,8 +41,10 @@ models-mnt
 /embedding
 /gguf
 /gguf-llama-simple
 /infill
 /libllama.so
 /llama-bench
 /llava
 /main
 /metal
 /perplexity
@ -53,6 +56,7 @@ models-mnt
 /server
 /simple
 /batched
 /batched-bench
 /export-lora
 /finetune
 /speculative
@ -90,4 +94,5 @@ tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
-tests/test-tokenizer-1
+tests/test-tokenizer-1-llama
 tests/test-tokenizer-1-bpe
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
+cmake_minimum_required(VERSION 3.13)  # for add_link_options
 project("llama.cpp" C CXX)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@ -44,7 +44,7 @@ endif()
 # general
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
-option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      OFF)
+option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
 # debug
@ -58,15 +58,21 @@ option(LLAMA_SANITIZE_ADDRESS           "llama: enable address sanitizer"
 option(LLAMA_SANITIZE_UNDEFINED         "llama: enable undefined sanitizer"                     OFF)
 # instruction set specific
-option(LLAMA_AVX                        "llama: enable AVX"                                     ON)
+if (LLAMA_NATIVE)
-option(LLAMA_AVX2                       "llama: enable AVX2"                                    ON)
+    set(INS_ENB OFF)
-option(LLAMA_AVX512                     "llama: enable AVX512"                                  OFF)
+else()
-option(LLAMA_AVX512_VBMI                "llama: enable AVX512-VBMI"                             OFF)
+    set(INS_ENB ON)
-option(LLAMA_AVX512_VNNI                "llama: enable AVX512-VNNI"                             OFF)
+endif()
-option(LLAMA_FMA                        "llama: enable FMA"                                     ON)
+
 option(LLAMA_AVX                             "llama: enable AVX"                                ${INS_ENB})
 option(LLAMA_AVX2                            "llama: enable AVX2"                               ${INS_ENB})
 option(LLAMA_AVX512                          "llama: enable AVX512"                             OFF)
 option(LLAMA_AVX512_VBMI                     "llama: enable AVX512-VBMI"                        OFF)
 option(LLAMA_AVX512_VNNI                     "llama: enable AVX512-VNNI"                        OFF)
 option(LLAMA_FMA                             "llama: enable FMA"                                ${INS_ENB})
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
-    option(LLAMA_F16C                   "llama: enable F16C"                                    ON)
+    option(LLAMA_F16C                        "llama: enable F16C"                               ${INS_ENB})
 endif()
 # 3rd party libs
@ -344,8 +350,9 @@ if (LLAMA_MPI)
        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
        add_compile_definitions(GGML_USE_MPI)
        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
-        set(cxx_flags ${cxx_flags} -Wno-cast-qual)
+        if (NOT MSVC)
-        set(c_flags   ${c_flags}   -Wno-cast-qual)
+            add_compile_options(-Wno-cast-qual)
        endif()
        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${MPI_C_LIBRARIES})
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
        # Even if you're only using the C header, C++ programs may bring in MPI
@ -432,13 +439,13 @@ endif()
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
        set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
+        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
            -Werror=implicit-function-declaration)
        set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
        set(host_cxx_flags "")
        if (CMAKE_C_COMPILER_ID MATCHES "Clang")
            set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
-            set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi)
+            set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)
            if (
                (CMAKE_C_COMPILER_ID STREQUAL "Clang"      AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
@ -448,27 +455,39 @@ if (LLAMA_ALL_WARNINGS)
            endif()
        elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
            set(c_flags ${c_flags} -Wdouble-promotion)
-            set(cxx_flags ${cxx_flags} -Wno-array-bounds)
+            set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)
            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
-                set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+                set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
            endif()
            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
-                set(cxx_flags ${cxx_flags} -Wextra-semi)
+                set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
            endif()
        endif()
    else()
        # todo : msvc
    endif()
-    add_compile_options(
+    set(c_flags   ${c_flags}   ${warning_flags})
-            ${warning_flags}
+    set(cxx_flags ${cxx_flags} ${warning_flags})
-            "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
-            "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
+                        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
-    )
+                        "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
 endif()
 if (NOT MSVC)
    set(cuda_flags -Wno-pedantic)
 endif()
 set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
 list(JOIN host_cxx_flags " " cuda_host_flags)  # pass host compiler flags as a single argument
 if (NOT cuda_host_flags STREQUAL "")
    set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
 endif()
 add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
 if (WIN32)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
@ -508,9 +527,6 @@ if (NOT MSVC)
    if (LLAMA_GPROF)
        add_compile_options(-pg)
    endif()
    if (LLAMA_NATIVE)
        add_compile_options(-march=native)
    endif()
 endif()
 if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
@ -565,6 +581,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
        endif()
    else()
        if (LLAMA_NATIVE)
            add_compile_options(-march=native)
        endif()
        if (LLAMA_F16C)
            add_compile_options(-mf16c)
        endif()
@ -661,6 +680,8 @@ add_library(ggml OBJECT
            ggml.h
            ggml-alloc.c
            ggml-alloc.h
            ggml-backend.c
            ggml-backend.h
            ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
            ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
            ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
@ -722,6 +743,7 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
 set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
 set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
 set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
 get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
 configure_package_config_file(
        ${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
--- a/122
+++ b/122
@ -1,8 +1,14 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative benchmark-matmult parallel finetune export-lora tests/test-c.o
+BUILD_TARGETS = \
 	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search  \
 	speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
+TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@ -62,9 +68,11 @@ test: $(TEST_TARGETS)
 		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
-			continue; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 			continue; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
 			continue; \
 		else \
 			echo "Running test $$test_target..."; \
 			./$$test_target; \
@ -170,6 +178,24 @@ else
 	MK_CPPFLAGS += -DNDEBUG
 endif
 ifdef LLAMA_SANITIZE_THREAD
 	MK_CFLAGS   += -fsanitize=thread -g
 	MK_CXXFLAGS += -fsanitize=thread -g
 	MK_LDFLAGS  += -fsanitize=thread -g
 endif
 ifdef LLAMA_SANITIZE_ADDRESS
 	MK_CFLAGS   += -fsanitize=address -fno-omit-frame-pointer -g
 	MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
 	MK_LDFLAGS  += -fsanitize=address -fno-omit-frame-pointer -g
 endif
 ifdef LLAMA_SANITIZE_UNDEFINED
 	MK_CFLAGS   += -fsanitize=undefined -g
 	MK_CXXFLAGS += -fsanitize=undefined -g
 	MK_LDFLAGS  += -fsanitize=undefined -g
 endif
 ifdef LLAMA_SERVER_VERBOSE
 	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif
@ -519,12 +545,21 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
 ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
-OBJS += ggml-alloc.o
+ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
+OBJS += ggml-alloc.o ggml-backend.o
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
-common.o: common/common.cpp common/common.h build-info.h common/log.h
+COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
 COMMON_DEPS   = $(COMMON_H_DEPS) common.o sampling.o
 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 console.o: common/console.cpp common/console.h
@ -546,16 +581,22 @@ clean:
 # Examples
 #
-main: examples/main/main.cpp                                  build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp                                  build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo
-simple: examples/simple/simple.cpp                            build-info.h ggml.o llama.o common.o $(OBJS)
+infill: examples/infill/infill.cpp                            build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-batched: examples/batched/batched.cpp                         build-info.h ggml.o llama.o common.o $(OBJS)
+simple: examples/simple/simple.cpp                            build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 batched: examples/batched/batched.cpp                         build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 batched-bench: examples/batched-bench/batched-bench.cpp       build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 quantize: examples/quantize/quantize.cpp                      build-info.h ggml.o llama.o $(OBJS)
@ -564,53 +605,56 @@ quantize: examples/quantize/quantize.cpp                      build-info.h ggml.
 quantize-stats: examples/quantize-stats/quantize-stats.cpp    build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-perplexity: examples/perplexity/perplexity.cpp                build-info.h ggml.o llama.o common.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp                build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-embedding: examples/embedding/embedding.cpp                   build-info.h ggml.o llama.o common.o $(OBJS)
+embedding: examples/embedding/embedding.cpp                   build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
-$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
-embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS)
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o train.o $(OBJS)
+llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o train.o $(OBJS)
+finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 ifdef LLAMA_METAL
@ -618,6 +662,11 @@ metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 endif
 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
 endif
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh $(CC) > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
@ -638,7 +687,7 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 run-benchmark-matmult: benchmark-matmult
 	./$@
-.PHONY: run-benchmark-matmult
+.PHONY: run-benchmark-matmult swift
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@ -646,37 +695,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 tests/test-c.o: tests/test-c.c llama.h
--- a/Package.swift
+++ b/Package.swift
@ -1,24 +1,27 @@
-// swift-tools-version:5.3
+// swift-tools-version:5.5
 import PackageDescription
 #if arch(arm) || arch(arm64)
 let platforms: [SupportedPlatform]? = [
-    .macOS(.v11),
+    .macOS(.v12),
    .iOS(.v14),
    .watchOS(.v4),
    .tvOS(.v14)
 ]
 let exclude: [String] = []
-let additionalSources: [String] = ["ggml-metal.m", "ggml-metal.metal"]
+let resources: [Resource] = [
    .process("ggml-metal.metal")
 ]
 let additionalSources: [String] = ["ggml-metal.m"]
 let additionalSettings: [CSetting] = [
    .unsafeFlags(["-fno-objc-arc"]),
    .define("GGML_SWIFT"),
    .define("GGML_USE_METAL")
 ]
 #else
 let platforms: [SupportedPlatform]? = nil
 let exclude: [String] = ["ggml-metal.metal"]
 let resources: [Resource] = []
 let additionalSources: [String] = []
 let additionalSettings: [CSetting] = []
 #endif
@ -38,15 +41,20 @@ let package = Package(
                "ggml.c",
                "llama.cpp",
                "ggml-alloc.c",
                "ggml-backend.c",
                "k_quants.c",
            ] + additionalSources,
            resources: resources,
            publicHeadersPath: "spm-headers",
            cSettings: [
-                .unsafeFlags(["-Wno-shorten-64-to-32"]),
+                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
                .define("GGML_USE_K_QUANTS"),
-                .define("GGML_USE_ACCELERATE"),
+                .define("GGML_USE_ACCELERATE")
-                .define("ACCELERATE_NEW_LAPACK"),
+                // NOTE: NEW_LAPACK will required iOS version 16.4+
-                .define("ACCELERATE_LAPACK_ILP64")
+                // We should consider add this in the future when we drop support for iOS 14
                // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
                // .define("ACCELERATE_NEW_LAPACK"),
                // .define("ACCELERATE_LAPACK_ILP64")
            ] + additionalSettings,
            linkerSettings: [
                .linkedFramework("Accelerate")
--- a/README.md
+++ b/README.md
@ -5,12 +5,13 @@
 [![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
+[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 ### Hot topics
 - ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401)
 - Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
  **Devs should become familiar with the new API**
 - Local Falcon 180B inference on Mac Studio
@ -94,6 +95,9 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
 - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
 - [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
 - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 **Bindings:**
@ -275,7 +279,7 @@ In order to build llama.cpp you have three different options.
 On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
 To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
-When built with Metal support, you can explicitly disable GPU inference with the `--gpu-layers|-ngl 0` command-line
+When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
 argument.
 ### MPI Build
@ -376,7 +380,7 @@ Building the program with BLAS support may lead to some performance improvements
 - #### cuBLAS
-  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
  - Using `make`:
    ```bash
    make LLAMA_CUBLAS=1
@ -612,6 +616,18 @@ For more information, see [https://huggingface.co/docs/transformers/perplexity](
 The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
 The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.
 #### How to run
 1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
 2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
 3. Output:
 ```
 perplexity : calculating perplexity over 655 chunks
 24.43 seconds per pass - ETA 4.45 hours
 [1]4.5970,[2]5.1807,[3]6.0382,...
 ```
 And after 4.45 hours, you will have the final perplexity.
 ### Interactive mode
 If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
@ -774,18 +790,6 @@ If your issue is with model generation quality, then please at least scan the fo
    - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
    - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
 #### How to run
 1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
 2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
 3. Output:
 ```
 perplexity : calculating perplexity over 655 chunks
 24.43 seconds per pass - ETA 4.45 hours
 [1]4.5970,[2]5.1807,[3]6.0382,...
 ```
 And after 4.45 hours, you will have the final perplexity.
 ### Android
 #### Building the Project using Android NDK
--- a/build.zig
+++ b/build.zig
@ -36,14 +36,17 @@ const Maker = struct {
    }
    fn init(builder: *std.build.Builder) !Maker {
        // const commit_hash = @embedFile(".git/refs/heads/master");
        const target = builder.standardTargetOptions(.{});
        const zig_version = @import("builtin").zig_version_string;
        const commit_hash = try std.ChildProcess.exec(
            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
        );
        const config_header = builder.addConfigHeader(
            .{ .style = .blank, .include_path = "build-info.h" },
            .{
                .BUILD_NUMBER = 0,
-                .BUILD_COMMIT = "12345", // omit newline
+                .BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
-                .BUILD_COMPILER = "Zig 0.11.0",
+                .BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
                .BUILD_TARGET = try target.allocDescription(builder.allocator),
            },
        );
@ -67,12 +70,20 @@ const Maker = struct {
    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
        if (o.target.getAbi() != .msvc)
            o.defineCMacro("_GNU_SOURCE", null);
        o.addConfigHeader(m.config_header);
        if (std.mem.endsWith(u8, src, ".c")) {
            o.addCSourceFiles(&.{src}, m.cflags.items);
            o.linkLibC();
        } else {
            o.addCSourceFiles(&.{src}, m.cxxflags.items);
-            o.linkLibCpp();
+            if (o.target.getAbi() == .msvc) {
                o.linkLibC(); // need winsdk + crt
            } else {
                // linkLibCpp already add (libc++ + libunwind + libc)
                o.linkLibCpp();
            }
        }
        o.addConfigHeader(m.config_header);
        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
@ -86,8 +97,14 @@ const Maker = struct {
        for (deps) |d| e.addObject(d);
        for (m.objs.items) |o| e.addObject(o);
        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
-        e.linkLibC();
+
-        e.linkLibCpp();
+        // https://github.com/ziglang/zig/issues/15448
        if (e.target.getAbi() == .msvc) {
            e.linkLibC(); // need winsdk + crt
        } else {
            // linkLibCpp already add (libc++ + libunwind + libc)
            e.linkLibCpp();
        }
        e.addConfigHeader(m.config_header);
        m.builder.installArtifact(e);
        e.want_lto = m.enable_lto;
@ -107,18 +124,22 @@ pub fn build(b: *std.build.Builder) !void {
    const ggml = make.obj("ggml", "ggml.c");
    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const llama = make.obj("llama", "llama.cpp");
    const common = make.obj("common", "common/common.cpp");
-    const console = make.obj("common", "common/console.cpp");
+    const console = make.obj("console", "common/console.cpp");
    const sampling = make.obj("sampling", "common/sampling.cpp");
    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
    const train = make.obj("train", "common/train.cpp");
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
--- a/ci/run.sh
+++ b/ci/run.sh
@ -496,10 +496,12 @@ test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
-    if [ -z ${GG_BUILD_CUDA} ]; then
+    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-        test $ret -eq 0 && gg_run open_llama_3b_v2
+        if [ -z ${GG_BUILD_CUDA} ]; then
-    else
+            test $ret -eq 0 && gg_run open_llama_3b_v2
-        test $ret -eq 0 && gg_run open_llama_7b_v2
+        else
            test $ret -eq 0 && gg_run open_llama_7b_v2
        fi
    fi
 fi
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@ -5,6 +5,8 @@ set(TARGET common)
 add_library(${TARGET} OBJECT
    common.h
    common.cpp
    sampling.h
    sampling.cpp
    console.h
    console.cpp
    grammar-parser.h
--- a/common/common.cpp
+++ b/common/common.cpp
@ -107,6 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    std::string arg;
    gpt_params default_params;
    const std::string arg_prefix = "--";
    llama_sampling_params & sparams = params.sampling_params;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];
@ -167,8 +168,10 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
            // store the external file name in params
            params.prompt_file = argv[i];
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
-            if (params.prompt.back() == '\n') {
+            if (!params.prompt.empty() && params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
        } else if (arg == "-n" || arg == "--n-predict") {
@ -182,7 +185,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            params.top_k = std::stoi(argv[i]);
+            sparams.top_k = std::stoi(argv[i]);
        } else if (arg == "-c" || arg == "--ctx-size") {
            if (++i >= argc) {
                invalid_param = true;
@ -214,73 +217,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            params.top_p = std::stof(argv[i]);
+            sparams.top_p = std::stof(argv[i]);
        } else if (arg == "--temp") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.temp = std::stof(argv[i]);
+            sparams.temp = std::stof(argv[i]);
        } else if (arg == "--tfs") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.tfs_z = std::stof(argv[i]);
+            sparams.tfs_z = std::stof(argv[i]);
        } else if (arg == "--typical") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.typical_p = std::stof(argv[i]);
+            sparams.typical_p = std::stof(argv[i]);
        } else if (arg == "--repeat-last-n") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.repeat_last_n = std::stoi(argv[i]);
+            sparams.repeat_last_n = std::stoi(argv[i]);
        } else if (arg == "--repeat-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.repeat_penalty = std::stof(argv[i]);
+            sparams.repeat_penalty = std::stof(argv[i]);
        } else if (arg == "--frequency-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.frequency_penalty = std::stof(argv[i]);
+            sparams.frequency_penalty = std::stof(argv[i]);
        } else if (arg == "--presence-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.presence_penalty = std::stof(argv[i]);
+            sparams.presence_penalty = std::stof(argv[i]);
        } else if (arg == "--mirostat") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.mirostat = std::stoi(argv[i]);
+            sparams.mirostat = std::stoi(argv[i]);
        } else if (arg == "--mirostat-lr") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.mirostat_eta = std::stof(argv[i]);
+            sparams.mirostat_eta = std::stof(argv[i]);
        } else if (arg == "--mirostat-ent") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.mirostat_tau = std::stof(argv[i]);
+            sparams.mirostat_tau = std::stof(argv[i]);
        } else if (arg == "--cfg-negative-prompt") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.cfg_negative_prompt = argv[i];
+            sparams.cfg_negative_prompt = argv[i];
        } else if (arg == "--cfg-negative-prompt-file") {
            if (++i >= argc) {
                invalid_param = true;
@ -292,16 +295,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
-            if (params.cfg_negative_prompt.back() == '\n') {
+            if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
-                params.cfg_negative_prompt.pop_back();
+                sparams.cfg_negative_prompt.pop_back();
            }
        } else if (arg == "--cfg-scale") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            params.cfg_scale = std::stof(argv[i]);
+            sparams.cfg_scale = std::stof(argv[i]);
        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
@ -361,7 +364,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            params.lora_adapter.push_back({argv[i], 1.0f});
+            params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
            params.use_mmap = false;
        } else if (arg == "--lora-scaled") {
            if (++i >= argc) {
@ -373,7 +376,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
+            params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
            params.use_mmap = false;
        } else if (arg == "--lora-base") {
            if (++i >= argc) {
@ -381,6 +384,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.lora_base = argv[i];
        } else if (arg == "--mmproj") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.mmproj = argv[i];
        } else if (arg == "--image") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.image = argv[i];
        } else if (arg == "-i" || arg == "--interactive") {
            params.interactive = true;
        } else if (arg == "--embedding") {
@ -389,6 +404,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.interactive_first = true;
        } else if (arg == "-ins" || arg == "--instruct") {
            params.instruct = true;
        } else if (arg == "--infill") {
            params.infill = true;
        } else if (arg == "--multiline-input") {
            params.multiline_input = true;
        } else if (arg == "--simple-io") {
@ -508,7 +525,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        } else if (arg == "--ignore-eos") {
            params.ignore_eos = true;
        } else if (arg == "--no-penalize-nl") {
-            params.penalize_nl = false;
+            sparams.penalize_nl = false;
        } else if (arg == "-l" || arg == "--logit-bias") {
            if (++i >= argc) {
                invalid_param = true;
@ -520,7 +537,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            std::string value_str;
            try {
                if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                    params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                    sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
                } else {
                    throw std::exception();
                }
@ -614,12 +631,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        process_escapes(params.prompt);
        process_escapes(params.input_prefix);
        process_escapes(params.input_suffix);
        for (auto & antiprompt : params.antiprompt) {
            process_escapes(antiprompt);
        }
    }
    return true;
 }
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    const llama_sampling_params & sparams = params.sampling_params;
    printf("usage: %s [options]\n", argv[0]);
    printf("\n");
    printf("options:\n");
@ -652,19 +674,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
-    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
-    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
-    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
-    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
-    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
-    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
-    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
    printf("  --mirostat N          use Mirostat sampling.\n");
    printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
-    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
-    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
    printf("  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
    printf("                        modifies the likelihood of token appearing in the completion,\n");
    printf("                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
@ -675,7 +697,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        negative prompt to use for guidance. (default: empty)\n");
    printf("  --cfg-negative-prompt-file FNAME\n");
    printf("                        negative prompt file to use for guidance. (default: empty)\n");
-    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
    printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor (default: loaded from model)\n");
@ -683,7 +705,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --no-penalize-nl      do not penalize newline token\n");
    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
-    printf("  --temp N              temperature (default: %.1f)\n", (double)params.temp);
+    printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
    printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
    printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
@ -693,6 +715,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
    printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
    printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
    printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
    if (llama_mlock_supported()) {
        printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
@ -833,7 +857,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    }
    if (params.ignore_eos) {
-        params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+        params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
    }
    {
@ -921,129 +945,10 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
        result += piece;
    }
    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
    return result;
 }
 //
 // Sampling utils
 //
 llama_token llama_sample_token(
                  struct llama_context * ctx,
                  struct llama_context * ctx_guidance,
                  struct llama_grammar * grammar,
               const struct gpt_params & params,
        const std::vector<llama_token> & last_tokens,
         std::vector<llama_token_data> & candidates,
                                   int   idx) {
    const int n_ctx   = llama_n_ctx(ctx);
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const float   temp            = params.temp;
    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
    const float   top_p           = params.top_p;
    const float   tfs_z           = params.tfs_z;
    const float   typical_p       = params.typical_p;
    const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
    const float   repeat_penalty  = params.repeat_penalty;
    const float   alpha_presence  = params.presence_penalty;
    const float   alpha_frequency = params.frequency_penalty;
    const int     mirostat        = params.mirostat;
    const float   mirostat_tau    = params.mirostat_tau;
    const float   mirostat_eta    = params.mirostat_eta;
    const bool    penalize_nl     = params.penalize_nl;
    llama_token id = 0;
    float * logits = llama_get_logits_ith(ctx, idx);
    // Apply params.logit_bias map
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
        logits[it->first] += it->second;
    }
    candidates.clear();
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }
    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
    if (ctx_guidance) {
        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
    }
    // apply penalties
    if (!last_tokens.empty()) {
        const float nl_logit = logits[llama_token_nl(ctx)];
        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
        llama_sample_repetition_penalty(ctx, &cur_p,
                last_tokens.data() + last_tokens.size() - last_n_repeat,
                last_n_repeat, repeat_penalty);
        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
                last_tokens.data() + last_tokens.size() - last_n_repeat,
                last_n_repeat, alpha_frequency, alpha_presence);
        if (!penalize_nl) {
            for (size_t idx = 0; idx < cur_p.size; idx++) {
                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
                    cur_p.data[idx].logit = nl_logit;
                    break;
                }
            }
        }
    }
    if (grammar != NULL) {
        llama_sample_grammar(ctx, &cur_p, grammar);
    }
    if (temp <= 0) {
        // Greedy sampling
        id = llama_sample_token_greedy(ctx, &cur_p);
    } else {
        if (mirostat == 1) {
            static float mirostat_mu = 2.0f * mirostat_tau;
            const int mirostat_m = 100;
            llama_sample_temp(ctx, &cur_p, temp);
            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
        } else if (mirostat == 2) {
            static float mirostat_mu = 2.0f * mirostat_tau;
            llama_sample_temp(ctx, &cur_p, temp);
            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
        } else {
            // Temperature sampling
            llama_sample_top_k      (ctx, &cur_p, top_k, 1);
            llama_sample_tail_free  (ctx, &cur_p, tfs_z, 1);
            llama_sample_typical    (ctx, &cur_p, typical_p, 1);
            llama_sample_top_p      (ctx, &cur_p, top_p, 1);
            llama_sample_temp(ctx, &cur_p, temp);
            {
                const int n_top = 10;
                LOG("top %d candidates:\n", n_top);
                for (int i = 0; i < n_top; i++) {
                    const llama_token id = cur_p.data[i].id;
                    LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
                }
            }
            id = llama_sample_token(ctx, &cur_p);
            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
        }
    }
    // printf("`%d`", candidates_p.size);
    if (grammar != NULL) {
        llama_grammar_accept_token(ctx, grammar, id);
    }
    return id;
 }
 //
 // YAML utils
 //
@ -1195,6 +1100,8 @@ std::string get_sortable_timestamp() {
 void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
    const llama_sampling_params & sparams = params.sampling_params;
    fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
    fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
@ -1241,21 +1148,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
+    dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
-    fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
+    fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
+    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
    dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-    const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
+    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
-    const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
+    const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
    fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
    dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
@ -1268,7 +1175,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
    fprintf(stream, "logit_bias:\n");
-    for (std::pair<llama_token, float> lb : params.logit_bias) {
+    for (std::pair<llama_token, float> lb : sparams.logit_bias) {
        if (ignore_eos && lb.first == logit_bias_eos->first) {
            continue;
        }
@ -1292,30 +1199,30 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
-    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
+    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
-    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
+    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
-    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
+    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
    fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
-    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
+    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
    fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
-    fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
+    fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
    fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
+    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
    dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
    dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
    fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
+    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
    fprintf(stream, "reverse_prompt:\n");
    for (std::string ap : params.antiprompt) {
@ -1333,15 +1240,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
+    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
    dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
-    fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
+    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
    fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
-    fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
+    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
-    fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
+    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
-    fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
+    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
--- a/common/common.h
+++ b/common/common.h
@ -4,6 +4,8 @@
 #include "llama.h"
 #include "sampling.h"
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
@ -49,36 +51,18 @@ struct gpt_params {
    int32_t n_gpu_layers_draft              = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
    int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
    int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t n_beams                         = 0;    // if non-zero then use beam search of given width.
    float   rope_freq_base                  = 0.0f; // RoPE base frequency
    float   rope_freq_scale                 = 0.0f; // RoPE frequency scaling factor
-    // sampling parameters
+    // // sampling parameters
-    int32_t top_k             = 40;    // <= 0 to use vocab size
+    struct llama_sampling_params sampling_params;
    float   top_p             = 0.95f; // 1.0 = disabled
    float   tfs_z             = 1.00f; // 1.0 = disabled
    float   typical_p         = 1.00f; // 1.0 = disabled
    float   temp              = 0.80f; // 1.0 = disabled
    float   repeat_penalty    = 1.10f; // 1.0 = disabled
    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float   frequency_penalty = 0.00f; // 0.0 = disabled
    float   presence_penalty  = 0.00f; // 0.0 = disabled
    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   mirostat_tau      = 5.00f; // target entropy
    float   mirostat_eta      = 0.10f; // learning rate
    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
    // Classifier-Free Guidance
    // https://arxiv.org/abs/2306.17806
    std::string cfg_negative_prompt;       // string to help guidance
    float       cfg_scale         = 1.f;   // How strong is guidance
    std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
    std::string model_draft       = "";                              // draft model for speculative decoding
    std::string model_alias       = "unknown"; // model alias
    std::string prompt            = "";
    std::string prompt_file       = "";  // store the external prompt file name
    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
    std::string input_prefix      = "";  // string to prefix user inputs with
    std::string input_suffix      = "";  // string to suffix user inputs with
@ -114,12 +98,16 @@ struct gpt_params {
    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool ignore_eos        = false; // ignore generated EOS tokens
    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool penalize_nl       = true;  // consider newlines as a repeatable token
    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool numa              = false; // attempt optimizations that help on some NUMA systems
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool infill            = false; // use infill mode
    // multimodal models (see examples/llava)
    std::string mmproj = ""; // path to multimodal projector
    std::string image = ""; // path to an image file
 };
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@ -178,36 +166,6 @@ std::string llama_detokenize_bpe(
                         llama_context * ctx,
        const std::vector<llama_token> & tokens);
 //
 // Sampling utils
 //
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 //
 // required:
 //  - ctx:    context to use for sampling
 //  - params: sampling parameters
 //
 // optional:
 //  - ctx_guidance:  context to use for classifier-free guidance, ignore if NULL
 //  - grammar:       grammar to use for sampling, ignore if NULL
 //  - last_tokens:   needed for repetition penalty, ignore if empty
 //  - idx:           sample from llama_get_logits_ith(ctx, idx)
 //
 // returns:
 //  - token:      sampled token
 //  - candidates: vector of candidate tokens
 //
 llama_token llama_sample_token(
                  struct llama_context * ctx,
                  struct llama_context * ctx_guidance,
                  struct llama_grammar * grammar,
               const struct gpt_params & params,
        const std::vector<llama_token> & last_tokens,
         std::vector<llama_token_data> & candidates,
                                   int   idx = 0);
 //
 // YAML utils
 //
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -0,0 +1,166 @@
 #include "sampling.h"
 llama_sampling_context::~llama_sampling_context() {
    for (auto & it : sequence_contexts) {
        if (it.second.grammar != NULL) {
            llama_grammar_free(it.second.grammar);
            it.second.grammar = NULL;
        }
    }
 }
 llama_sampling_context llama_sampling_context_init(
        const struct gpt_params & params,
                  llama_grammar * grammar) {
  llama_sampling_context result;
  result.params = params.sampling_params;
  result.grammar = grammar;
  return result;
 }
 // Note: Creates the context if it doesn't exist, so this always return something.
 llama_sampler_sequence_context & llama_sampling_get_sequence_context(
              llama_sampling_context & ctx_sampling,
        const llama_seq_id             seq) {
    const auto it = ctx_sampling.sequence_contexts.find(seq);
    if (it != ctx_sampling.sequence_contexts.end()) {
        return it->second;
    }
    llama_sampler_sequence_context new_ctx = {
        2.0f * ctx_sampling.params.mirostat_tau,
        ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL,
    };
    return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second;
 }
 bool llama_sampling_context_reset(
              llama_sampling_context & ctx_sampling,
        const llama_seq_id             seq) {
    const auto it = ctx_sampling.sequence_contexts.find(seq);
    if (it == ctx_sampling.sequence_contexts.end()) return false;
    if (it->second.grammar != NULL) {
        llama_grammar_free(it->second.grammar);
        it->second.grammar = NULL;
    }
    ctx_sampling.sequence_contexts.erase(it);
    return true;
 }
 llama_token llama_sampling_sample(
                  struct llama_context * ctx,
                  struct llama_context * ctx_guidance,
                  struct llama_sampling_context & ctx_sampling,
        const std::vector<llama_token> & last_tokens,
         std::vector<llama_token_data> & candidates,
        const                      int   idx,
                          llama_seq_id   seq) {
    const int n_ctx   = llama_n_ctx(ctx);
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const llama_sampling_params & params = ctx_sampling.params;
    const float   temp            = params.temp;
    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
    const float   top_p           = params.top_p;
    const float   tfs_z           = params.tfs_z;
    const float   typical_p       = params.typical_p;
    const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
    const float   repeat_penalty  = params.repeat_penalty;
    const float   alpha_presence  = params.presence_penalty;
    const float   alpha_frequency = params.frequency_penalty;
    const int     mirostat        = params.mirostat;
    const float   mirostat_tau    = params.mirostat_tau;
    const float   mirostat_eta    = params.mirostat_eta;
    const bool    penalize_nl     = params.penalize_nl;
    llama_token id = 0;
    float * logits = llama_get_logits_ith(ctx, idx);
    // Apply params.logit_bias map
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
        logits[it->first] += it->second;
    }
    candidates.clear();
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }
    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
    if (ctx_guidance) {
        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
    }
    // apply penalties
    if (!last_tokens.empty()) {
        const float nl_logit = logits[llama_token_nl(ctx)];
        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
        llama_sample_repetition_penalty(ctx, &cur_p,
                last_tokens.data() + last_tokens.size() - last_n_repeat,
                last_n_repeat, repeat_penalty);
        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
                last_tokens.data() + last_tokens.size() - last_n_repeat,
                last_n_repeat, alpha_frequency, alpha_presence);
        if (!penalize_nl) {
            for (size_t idx = 0; idx < cur_p.size; idx++) {
                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
                    cur_p.data[idx].logit = nl_logit;
                    break;
                }
            }
        }
    }
    llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq);
    if (ctx_seq.grammar != NULL) {
        llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar);
    }
    if (temp <= 0) {
        // Greedy sampling
        id = llama_sample_token_greedy(ctx, &cur_p);
    } else {
        if (mirostat == 1) {
            const int mirostat_m = 100;
            llama_sample_temp(ctx, &cur_p, temp);
            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu);
        } else if (mirostat == 2) {
            llama_sample_temp(ctx, &cur_p, temp);
            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu);
        } else {
            // Temperature sampling
            size_t min_keep = std::max(1, params.n_probs);
            llama_sample_top_k      (ctx, &cur_p, top_k, min_keep);
            llama_sample_tail_free  (ctx, &cur_p, tfs_z, min_keep);
            llama_sample_typical    (ctx, &cur_p, typical_p, min_keep);
            llama_sample_top_p      (ctx, &cur_p, top_p, min_keep);
            llama_sample_temp(ctx, &cur_p, temp);
            {
                const int n_top = 10;
                LOG("top %d candidates:\n", n_top);
                for (int i = 0; i < n_top; i++) {
                    const llama_token id = cur_p.data[i].id;
                    (void)id; // To avoid a warning that id is unused when logging is disabled.
                    LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
                }
            }
            id = llama_sample_token(ctx, &cur_p);
            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
        }
    }
    if (ctx_seq.grammar != NULL) {
        llama_grammar_accept_token(ctx, ctx_seq.grammar, id);
    }
    return id;
 }
--- a/common/sampling.h
+++ b/common/sampling.h
@ -0,0 +1,108 @@
 #pragma once
 #include "llama.h"
 #include <string>
 #include <vector>
 #include <unordered_map>
 // sampling parameters
 typedef struct llama_sampling_params {
    int32_t top_k             = 40;    // <= 0 to use vocab size
    float   top_p             = 0.95f; // 1.0 = disabled
    float   tfs_z             = 1.00f; // 1.0 = disabled
    float   typical_p         = 1.00f; // 1.0 = disabled
    float   temp              = 0.80f; // 1.0 = disabled
    float   repeat_penalty    = 1.10f; // 1.0 = disabled
    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float   frequency_penalty = 0.00f; // 0.0 = disabled
    float   presence_penalty  = 0.00f; // 0.0 = disabled
    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   mirostat_tau      = 5.00f; // target entropy
    float   mirostat_eta      = 0.10f; // learning rate
    bool    penalize_nl       = true;  // consider newlines as a repeatable token
    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
    // Classifier-Free Guidance
    // https://arxiv.org/abs/2306.17806
    std::string cfg_negative_prompt;   // string to help guidance
    float       cfg_scale     = 1.f;   // How strong is guidance
    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
 } llama_sampling_params;
 // per-sequence sampler context
 typedef struct llama_sampler_sequence_context {
    float mirostat_mu; // mirostat sampler state
    llama_grammar * grammar;
 } llama_sampler_sequence_context;
 // general sampler context
 typedef struct llama_sampling_context {
    ~llama_sampling_context();
    // parameters that will be used for sampling and when creating
    // new llama_sampler_sequence_context instances
    llama_sampling_params params;
    // map of sequence ids to sampler contexts
    std::unordered_map<llama_seq_id, llama_sampler_sequence_context> sequence_contexts;
    // when non-NULL, new instances of llama_sampler_sequence_context
    // will get a copy of the grammar here
    // note: only the pointer is stored here, it is not a copy of
    //       the grammar and shouldn't be freed
    llama_grammar * grammar;
 } llama_sampling_context;
 #include "common.h"
 // Create a new sampling context instance.
 llama_sampling_context llama_sampling_context_init(
        const struct gpt_params & params,
                  llama_grammar * grammar = NULL);
 // Fetches the sampler context for the specified sequence id (defaults to 0).
 // If the context for that sequence id doesn't already exist, it will be created with
 // default values based on the parameters in the ctx_sampling argument.
 llama_sampler_sequence_context & llama_sampling_get_sequence_context(
              llama_sampling_context & ctx_sampling,
        const llama_seq_id             seq = 0);
 // Reset the sampler context for the supplied sequence id (defaults to 0).
 // This is necessary to reuse a sequence id or free memory used by sequences
 // that are no longer required.
 bool llama_sampling_context_reset(
              llama_sampling_context & ctx_sampling,
        const llama_seq_id             seq = 0);
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
 //       llama_sampling_context_reset when a sequence ends
 //
 // required:
 //  - ctx:          context to use for sampling
 //  - ctx_sampling: sampling-specific context
 //
 // optional:
 //  - ctx_guidance:  context to use for classifier-free guidance, ignore if NULL
 //  - last_tokens:   needed for repetition penalty, ignore if empty
 //  - idx:           sample from llama_get_logits_ith(ctx, idx)
 //  - seq:           sequence id to associate sampler state with
 //
 // returns:
 //  - token:      sampled token
 //  - candidates: vector of candidate tokens
 //
 llama_token llama_sampling_sample(
                  struct llama_context * ctx,
                  struct llama_context * ctx_guidance,
                  struct llama_sampling_context & ctx_sampling,
        const std::vector<llama_token> & last_tokens,
         std::vector<llama_token_data> & candidates,
        const                      int   idx = 0,
                          llama_seq_id   seq = 0);
--- a/common/stb_image.h
+++ b/common/stb_image.h
--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@ -11,11 +11,14 @@ import sys
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 import itertools
 import gguf
 import numpy as np
 import torch
 from sentencepiece import SentencePieceProcessor  # type: ignore[import]
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 if TYPE_CHECKING:
    from typing import TypeAlias
@ -174,8 +177,11 @@ if not tokenizer_model_file.is_file():
 print("gguf: get sentencepiece tokenizer vocab, scores and token types")
 tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
 vocab_size = hparams.get('vocab_size')
 if vocab_size is None:
    vocab_size = tokenizer.vocab_size()
-for i in range(tokenizer.vocab_size()):
+for i in range(vocab_size):
    text: bytes
    score: float
--- a/convert-bloom-hf-to-gguf.py
+++ b/convert-bloom-hf-to-gguf.py
@ -0,0 +1,238 @@
 #!/usr/bin/env python3
 # HF bloom --> gguf conversion
 from __future__ import annotations
 import argparse
 import json
 import os
 import re
 import struct
 import sys
 from pathlib import Path
 from typing import Any
 import numpy as np
 import torch
 from transformers import AutoTokenizer  # type: ignore[import]
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1
    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts
 # Supported Models:
 #   https://huggingface.co/bigscience/bloom-1b7
 #   https://huggingface.co/bigscience/bloom-3b
 #   https://huggingface.co/bigscience/bloom-7b1
 #   https://huggingface.co/Langboat/bloom-1b4-zh
 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile",    type=Path,           help="path to write to; default: based on input")
    parser.add_argument("model",        type=Path,           help="directory containing model file, or model file itself (*.bin)")
    parser.add_argument("ftype",        type=int,            help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
    return parser.parse_args()
 args = parse_args()
 dir_model = args.model
 ftype = args.ftype
 if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)
 # possible tensor data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
 # map from ftype to string
 ftype_str = ["f32", "f16"]
 if args.outfile is not None:
    fname_out = args.outfile
 else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
 print("gguf: loading model "+dir_model.name)
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)
 if hparams["architectures"][0] != "BloomForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)
 # get number of model parts
 num_parts = count_model_parts(dir_model)
 ARCH=gguf.MODEL_ARCH.BLOOM
 gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
 print("gguf: get model metadata")
 block_count = hparams["n_layer"]
 gguf_writer.add_name("Bloom")
 n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
 n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
 gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
 gguf_writer.add_embedding_length(n_embed)
 gguf_writer.add_feed_forward_length(4 * n_embed)
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_head_count(n_head)
 gguf_writer.add_head_count_kv(n_head)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
 gguf_writer.add_file_type(ftype)
 # TOKENIZATION
 print("gguf: get tokenizer metadata")
 tokens: list[bytearray] = []
 scores: list[float] = []
 toktypes: list[int] = []
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 print("gguf: get gpt2 tokenizer vocab")
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)  # dummy
    toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
 tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
 # params for qkv transform
 n_head_kv = hparams.get("n_head_kv", n_head)
 head_dim = n_embed // n_head
 # tensor info
 print("gguf: get tensor metadata")
 if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
 else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )
 for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")
    has_lm_head = True
    if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
        has_lm_head = False
    for original_name in model_part.keys():
        data = model_part[original_name]
        name = re.sub(r'transformer\.', '', original_name)
        old_dtype = data.dtype
        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)
        data = data.squeeze().numpy()
        if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
            # Map bloom-style qkv_linear to gpt-style qkv_linear
            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
            qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
            data = np.concatenate(
                (qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
                 qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
                 qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
                axis=0
            )
            print("re-format attention.linear_qkv.weight")
        elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
            qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
            data = np.concatenate(
                (qkv_bias[:, 0, :].reshape((n_embed,)),
                 qkv_bias[:, 1, :].reshape((n_embed,)),
                 qkv_bias[:, 2, :].reshape((n_embed,))),
                axis=0
            )
            print("re-format attention.linear_qkv.bias")
        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()
        n_dims = len(data.shape)
        data_dtype = data.dtype
        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)
        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)
        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)
        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
        gguf_writer.add_tensor(new_name, data)
        if not has_lm_head and name == "word_embeddings.weight":
            gguf_writer.add_tensor("output.weight", data)
            print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))  # noqa
 print("gguf: write header")
 gguf_writer.write_header_to_file()
 print("gguf: write metadata")
 gguf_writer.write_kv_data_to_file()
 if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()
 gguf_writer.close()
 print(f"gguf: model successfully exported to '{fname_out}'")
 print("")
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@ -4,6 +4,7 @@
 from __future__ import annotations
 import argparse
 import contextlib
 import json
 import os
 import struct
@ -20,32 +21,10 @@ if 'NO_LOCAL_GGUF' not in os.environ:
 import gguf
-def bytes_to_unicode():
+def count_model_parts(dir_model: Path, prefix: str) -> int:
    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    return dict(zip(bs, (chr(n) for n in cs)))
 def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
-        if filename.startswith("pytorch_model-"):
+        if filename.startswith(prefix):
            num_parts += 1
    if num_parts > 0:
@ -99,20 +78,26 @@ print("gguf: loading model "+dir_model.name)
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)
-if hparams["architectures"][0] != "RWForCausalLM":
+if hparams["architectures"][0] != "FalconForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)
 # get number of model parts
-num_parts = count_model_parts(dir_model)
+num_parts = count_model_parts(dir_model, "model-00")
 if num_parts:
    is_safetensors = True
    from safetensors import safe_open
 else:
    is_safetensors = False
    num_parts = count_model_parts(dir_model, "pytorch_model-")
 ARCH=gguf.MODEL_ARCH.FALCON
 gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
 print("gguf: get model metadata")
-block_count = hparams["n_layer"]
+block_count = hparams["num_hidden_layers"]
 gguf_writer.add_name("Falcon")
 gguf_writer.add_context_length(2048) # not in config.json
@ -120,9 +105,9 @@ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
 gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
-gguf_writer.add_head_count(hparams["n_head"])
+gguf_writer.add_head_count(hparams["num_attention_heads"])
-if "n_head_kv" in hparams:
+if "num_kv_heads" in hparams:
-    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+    gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
 else:
    gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
@ -133,50 +118,32 @@ gguf_writer.add_file_type(ftype)
 print("gguf: get tokenizer metadata")
 tokens: list[bytearray] = []
-
+scores: list[float] = []
-tokenizer_json_file = dir_model / 'tokenizer.json'
+toktypes: list[int] = []
 if not tokenizer_json_file.is_file():
    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
    sys.exit(1)
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 with open(tokenizer_json_file, "r", encoding="utf-8") as f:
    tokenizer_json = json.load(f)
 print("gguf: get gpt2 tokenizer vocab")
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
 vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
 for i in range(vocab_size):
-    if i in reverse_vocab:
+    tokens.append(reverse_vocab[i])
-        try:
+    scores.append(0.0) # dummy
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+    toktypes.append(gguf.TokenType.NORMAL)
        except KeyError:
            text = bytearray()
            for c in reverse_vocab[i]:
                if ord(c) < 256:  # single byte character
                    text.append(byte_decoder[ord(c)])
                else:  # multibyte special token character
                    text.extend(c.encode('utf-8'))
    else:
        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
        pad_token = f"[PAD{i}]".encode("utf8")
        text = bytearray(pad_token)
    tokens.append(text)
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
@ -186,8 +153,8 @@ special_vocab.add_to_gguf(gguf_writer)
 tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
 # params for qkv transform
-n_head    = hparams["n_head"]
+n_head    = hparams["num_attention_heads"]
-n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1
 head_dim = hparams["hidden_size"] // n_head
@ -196,6 +163,10 @@ print("gguf: get tensor metadata")
 if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
 elif is_safetensors:
    part_names = (
        f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
    )
 else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
@ -205,60 +176,64 @@ for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(dir_model / part_name, map_location="cpu")
+    if is_safetensors:
        ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
    else:
        ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))
-    for name in model_part.keys():
+    with ctx as model_part:
-        data = model_part[name]
+        for name in model_part.keys():
            data = model_part.get_tensor(name) if is_safetensors else model_part[name]
-        old_dtype = data.dtype
+            old_dtype = data.dtype
-        # convert any unsupported data types to float32
+            # convert any unsupported data types to float32
-        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            if data.dtype != torch.float16 and data.dtype != torch.float32:
-            data = data.to(torch.float32)
+                data = data.to(torch.float32)
-        # QKV tensor transform
+            # QKV tensor transform
-        # The original query_key_value tensor contains n_head_kv "kv groups",
+            # The original query_key_value tensor contains n_head_kv "kv groups",
-        # each consisting of n_head/n_head_kv query weights followed by one key
+            # each consisting of n_head/n_head_kv query weights followed by one key
-        # and one value weight (shared by all query heads in the kv group).
+            # and one value weight (shared by all query heads in the kv group).
-        # This layout makes it a big pain to work with in GGML.
+            # This layout makes it a big pain to work with in GGML.
-        # So we rearrange them here,, so that we have n_head query weights
+            # So we rearrange them here,, so that we have n_head query weights
-        # followed by n_head_kv key weights followed by n_head_kv value weights,
+            # followed by n_head_kv key weights followed by n_head_kv value weights,
-        # in contiguous fashion.
+            # in contiguous fashion.
-        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
-        if "query_key_value" in name:
+            if "query_key_value" in name:
-            qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+                qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
-            q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
+                q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
-            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
-            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
-            data = torch.cat((q,k,v)).reshape_as(data)
+                data = torch.cat((q,k,v)).reshape_as(data)
-        data = data.squeeze().numpy()
+            data = data.squeeze().numpy()
-        # map tensor names
+            # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+            new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
-        if new_name is None:
+            if new_name is None:
-            print("Can not map tensor '" + name + "'")
+                print("Can not map tensor '" + name + "'")
-            sys.exit()
+                sys.exit()
-        n_dims = len(data.shape)
+            n_dims = len(data.shape)
-        data_dtype = data.dtype
+            data_dtype = data.dtype
-        # if f32 desired, convert any float16 to float32
+            # if f32 desired, convert any float16 to float32
-        if ftype == 0 and data_dtype == np.float16:
+            if ftype == 0 and data_dtype == np.float16:
-            data = data.astype(np.float32)
+                data = data.astype(np.float32)
-        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
+                data = data.astype(np.float32)
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-            data = data.astype(np.float16)
+                data = data.astype(np.float16)
-        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+            print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-        gguf_writer.add_tensor(new_name, data)
+            gguf_writer.add_tensor(new_name, data)
 print("gguf: write header")
--- a/convert-gptneox-hf-to-gguf.py
+++ b/convert-gptneox-hf-to-gguf.py
@ -19,29 +19,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    return dict(zip(bs, (chr(n) for n in cs)))
 def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
@ -130,48 +107,32 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
 print("gguf: get tokenizer metadata")
 tokens: list[bytearray] = []
-
+scores: list[float] = []
-tokenizer_json_file = dir_model / 'tokenizer.json'
+toktypes: list[int] = []
 if not tokenizer_json_file.is_file():
    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
    sys.exit(1)
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 with open(tokenizer_json_file, "r", encoding="utf-8") as f:
    tokenizer_json = json.load(f)
 print("gguf: get gpt2 tokenizer vocab")
 vocab_size = len(tokenizer_json["model"]["vocab"])
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
 for i in range(vocab_size):
-    if i in reverse_vocab:
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-        try:
+    scores.append(0.0) # dummy
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+    toktypes.append(gguf.TokenType.NORMAL)
        except KeyError:
            text = bytearray()
            for c in reverse_vocab[i]:
                if ord(c) < 256:  # single byte character
                    text.append(byte_decoder[ord(c)])
                else:  # multibyte special token character
                    text.extend(c.encode('utf-8'))
    else:
        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
        pad_token = f"[PAD{i}]".encode("utf8")
        text = bytearray(pad_token)
    tokens.append(text)
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
--- a/convert-mpt-hf-to-gguf.py
+++ b/convert-mpt-hf-to-gguf.py
@ -0,0 +1,216 @@
 #!/usr/bin/env python3
 # HF mpt--> gguf conversion
 from __future__ import annotations
 import argparse
 import json
 import os
 import struct
 import sys
 from pathlib import Path
 from typing import Any
 import numpy as np
 import torch
 from transformers import AutoTokenizer  # type: ignore[import]
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1
    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts
 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()
 args = parse_args()
 dir_model = args.model
 ftype = args.ftype
 if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)
 # possible tensor data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
 # map from ftype to string
 ftype_str = ["f32", "f16"]
 if args.outfile is not None:
    fname_out = args.outfile
 else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
 print("gguf: loading model "+dir_model.name)
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)
 if hparams["architectures"][0] != "MPTForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit()
 # get number of model parts
 num_parts = count_model_parts(dir_model)
 ARCH=gguf.MODEL_ARCH.MPT
 gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
 print("gguf: get model metadata")
 block_count = hparams["n_layers"]
 gguf_writer.add_name(dir_model.name)
 gguf_writer.add_context_length(hparams["max_seq_len"])
 gguf_writer.add_embedding_length(hparams["d_model"])
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
 gguf_writer.add_head_count(hparams["n_heads"])
 gguf_writer.add_layer_norm_eps(1e-05)
 if hparams["attn_config"]["clip_qkv"] is not None:
    gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
 gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])
 # TOKENIZATION
 print("gguf: get tokenizer metadata")
 tokens: list[bytearray] = []
 scores: list[float] = []
 toktypes: list[int] = []
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 print("gguf: get gpt2 tokenizer vocab")
 # MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
 # there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
 # accomodate some "reserved" tokens; this is causing problems down the line in
 # llama.cpp, so we pad the vocab with dummy tokens:
 vocab_size = hparams["vocab_size"]
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0) # dummy
    toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
 tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
 # tensor info
 print("gguf: get tensor metadata")
 if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
 else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )
 for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
    for name in model_part.keys():
        data = model_part[name]
        old_dtype = data.dtype
        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)
        data = data.squeeze().numpy()
        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Cannot map tensor '" + name + "'")
            continue # for the sake of compatibility with some old published models, don't quit
            sys.exit()
        n_dims = len(data.shape)
        data_dtype = data.dtype
        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)
        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)
        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)
        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
        gguf_writer.add_tensor(new_name, data)
        # note: MPT output is tied to (same as) wte in original model;
        # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
        if new_name == "token_embd.weight":
            gguf_writer.add_tensor("output.weight", data)
 print("gguf: write header")
 gguf_writer.write_header_to_file()
 print("gguf: write metadata")
 gguf_writer.write_kv_data_to_file()
 if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()
 gguf_writer.close()
 print(f"gguf: model successfully exported to '{fname_out}'")
 print("")
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@ -0,0 +1,130 @@
 import torch
 import os
 from pprint import pprint
 import sys
 import argparse
 from pathlib import Path
 from sentencepiece import SentencePieceProcessor
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 def _flatten_dict(dct, tensors, prefix=None):
    assert isinstance(dct, dict)
    for key in dct.keys():
        new_prefix = prefix + '.' + key if prefix is not None else key
        if isinstance(dct[key], torch.Tensor):
            tensors[new_prefix] = dct[key]
        elif isinstance(dct[key], dict):
            _flatten_dict(dct[key], tensors, new_prefix)
        else:
            raise ValueError(type(dct[key]))
    return None
 def _get_sentencepiece_tokenizer_info(dir_model: Path):
    tokenizer_path = dir_model / 'adept_vocab.model'
    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
    tokenizer = SentencePieceProcessor(str(tokenizer_path))
    print('gguf: adding tokens')
    tokens: list[bytes] = []
    scores: list[float] = []
    toktypes: list[int] = []
    for i in range(tokenizer.vocab_size()):
        text: bytes
        score: float
        piece = tokenizer.id_to_piece(i)
        text = piece.encode("utf-8")
        score = tokenizer.get_score(i)
        toktype = 1
        if tokenizer.is_unknown(i):
            toktype = 2
        if tokenizer.is_control(i):
            toktype = 3
        if tokenizer.is_unused(i):
            toktype = 5
        if tokenizer.is_byte(i):
            toktype = 6
        tokens.append(text)
        scores.append(score)
        toktypes.append(toktype)
        pass
    return tokens, scores, toktypes
 def main():
    parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
    parser.add_argument("--outfile",             type=Path, help="path to write to; default: based on input")
    parser.add_argument("--ckpt-path",           type=Path, help="path to persimmon checkpoint .pt file")
    parser.add_argument("--model-dir",           type=Path, help="directory containing model e.g. 8b_chat_model_release")
    parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
    args = parser.parse_args()
    sys.path.append(str(args.adept_inference_dir))
    persimmon_model = torch.load(args.ckpt_path)
    hparams = persimmon_model['args']
    pprint(hparams)
    tensors = {}
    _flatten_dict(persimmon_model['model'], tensors, None)
    arch = gguf.MODEL_ARCH.PERSIMMON
    gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
    block_count = hparams.num_layers
    head_count = hparams.num_attention_heads
    head_count_kv = head_count
    ctx_length = hparams.seq_length
    hidden_size = hparams.hidden_size
    gguf_writer.add_name('persimmon-8b-chat')
    gguf_writer.add_context_length(ctx_length)
    gguf_writer.add_embedding_length(hidden_size)
    gguf_writer.add_block_count(block_count)
    gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
    gguf_writer.add_rope_dimension_count(hidden_size // head_count)
    gguf_writer.add_head_count(head_count)
    gguf_writer.add_head_count_kv(head_count_kv)
    gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
    gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
    tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
    gguf_writer.add_tokenizer_model('llama')
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
    gguf_writer.add_token_types(toktypes)
    gguf_writer.add_bos_token_id(71013)
    gguf_writer.add_eos_token_id(71013)
    tensor_map = gguf.get_tensor_name_map(arch, block_count)
    print(tensor_map)
    for name in tensors.keys():
        data = tensors[name]
        if name.endswith(".self_attention.rotary_emb.inv_freq"):
            continue
        old_dtype = data.dtype
        # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
        data = data.to(torch.float32).squeeze().numpy()
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()
        n_dims = len(data.shape)
        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
        gguf_writer.add_tensor(new_name, data)
    print("gguf: write header")
    gguf_writer.write_header_to_file()
    print("gguf: write metadata")
    gguf_writer.write_kv_data_to_file()
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()
    gguf_writer.close()
    print(f"gguf: model successfully exported to '{args.outfile}'")
    print("")
 if __name__ == '__main__':
    main()
--- a/convert-refact-hf-to-gguf.py
+++ b/convert-refact-hf-to-gguf.py
@ -0,0 +1,263 @@
 #!/usr/bin/env python3
 # HF refact--> gguf conversion
 from __future__ import annotations
 import argparse
 import json
 import os
 import sys
 from pathlib import Path
 import numpy as np
 import torch
 from transformers import AutoTokenizer  # type: ignore[import]
 if "NO_LOCAL_GGUF" not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
 import gguf
 def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1
    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts
 def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a Refact model to a GGML compatible file"
    )
    parser.add_argument(
        "--vocab-only",
        action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile",
        type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model",
        type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype",
        type=int,
        choices=[0, 1],
        default=1,
        nargs="?",
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()
 args = parse_args()
 dir_model = args.model
 ftype = args.ftype
 if not dir_model.is_dir():
    print(f"Error: {args.model} is not a directory", file=sys.stderr)
    sys.exit(1)
 # possible tensor data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
 # map from ftype to string
 ftype_str = ["f32", "f16"]
 if args.outfile is not None:
    fname_out = args.outfile
 else:
    # output in the same directory as the model by default
    fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"
 print("gguf: loading model " + dir_model.name)
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)
 if hparams["architectures"][0] != "GPTRefactForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)
 # get number of model parts
 num_parts = count_model_parts(dir_model)
 ARCH = gguf.MODEL_ARCH.REFACT
 gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
 print("gguf: get model metadata")
 # Get refact feed forward dimension
 hidden_dim = hparams["n_embd"]
 inner_dim = 4 * hidden_dim
 hidden_dim = int(2 * inner_dim / 3)
 multiple_of = 256
 ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
 block_count = hparams["n_layer"]
 gguf_writer.add_name("Refact")
 # refact uses Alibi. So this is from config.json which might be used by training.
 gguf_writer.add_context_length(hparams["n_positions"])
 gguf_writer.add_embedding_length(hparams["n_embd"])
 gguf_writer.add_feed_forward_length(ff_dim)
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_head_count(hparams["n_head"])
 gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"])
 gguf_writer.add_file_type(ftype)
 # TOKENIZATION
 print("gguf: get tokenizer metadata")
 tokens: list[bytearray] = []
 scores: list[float] = []
 toktypes: list[int] = []
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 print("gguf: get gpt2 tokenizer vocab")
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0) # dummy
    toktypes.append(gguf.TokenType.NORMAL)
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
 tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
 # params for qkv transform
 n_head = hparams["n_head"]
 n_head_kv = 1
 head_dim = hparams["n_embd"] // n_head
 # tensor info
 print("gguf: get tensor metadata")
 if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
 else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )
 for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")
    for i in range(block_count):
        if f"transformer.h.{i}.attn.kv.weight" in model_part:
            data = model_part[f"transformer.h.{i}.attn.kv.weight"]
            model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[
                : n_head_kv * head_dim
            ]
            model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[
                n_head_kv * head_dim :
            ]
            del model_part[f"transformer.h.{i}.attn.kv.weight"]
        if f"transformer.h.{i}.attn.q.weight" in model_part:
            model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[
                f"transformer.h.{i}.attn.q.weight"
            ]
            del model_part[f"transformer.h.{i}.attn.q.weight"]
        if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part:
            data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
            model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim]
            model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:]
            del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
    for name in model_part.keys():
        data = model_part[name]
        old_dtype = data.dtype
        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)
        data = data.squeeze().numpy()
        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()
        n_dims = len(data.shape)
        data_dtype = data.dtype
        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)
        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)
        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if (
            ftype == 1
            and data_dtype == np.float32
            and name.endswith(".weight")
            and n_dims == 2
        ):
            data = data.astype(np.float16)
        print(
            new_name
            + ", n_dims = "
            + str(n_dims)
            + ", "
            + str(old_dtype)
            + " --> "
            + str(data.dtype)
        )
        gguf_writer.add_tensor(new_name, data)
 print("gguf: write header")
 gguf_writer.write_header_to_file()
 print("gguf: write metadata")
 gguf_writer.write_kv_data_to_file()
 if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()
 gguf_writer.close()
 print(f"gguf: model successfully exported to '{fname_out}'")
 print("")
--- a/convert-starcoder-hf-to-gguf.py
+++ b/convert-starcoder-hf-to-gguf.py
@ -20,28 +20,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
 import gguf
 def bytes_to_unicode():
    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    return dict(zip(bs, (chr(n) for n in cs)))
 def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
@ -117,50 +95,32 @@ gguf_writer.add_file_type(ftype)
 print("gguf: get tokenizer metadata")
 tokens: list[bytearray] = []
-
+scores: list[float] = []
-tokenizer_json_file = dir_model / 'tokenizer.json'
+toktypes: list[int] = []
 if not tokenizer_json_file.is_file():
    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
    sys.exit(1)
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 with open(tokenizer_json_file, "r", encoding="utf-8") as f:
    tokenizer_json = json.load(f)
 print("gguf: get gpt2 tokenizer vocab")
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
 vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
 vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
 assert max(tokenizer.vocab.values()) < vocab_size
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 byte_encoder = bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
 for i in range(vocab_size):
-    if i in reverse_vocab:
+    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-        try:
+    scores.append(0.0) # dummy
-            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+    toktypes.append(gguf.TokenType.NORMAL)
        except KeyError:
            text = bytearray()
            for c in reverse_vocab[i]:
                if ord(c) < 256:  # single byte character
                    text.append(byte_decoder[ord(c)])
                else:  # multibyte special token character
                    text.extend(c.encode('utf-8'))
    else:
        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
        pad_token = f"[PAD{i}]".encode("utf8")
        text = bytearray(pad_token)
    tokens.append(text)
 gguf_writer.add_token_list(tokens)
 gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
--- a/convert.py
+++ b/convert.py
@ -41,8 +41,7 @@ if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
 NDArray: TypeAlias = 'np.ndarray[Any, Any]'
-ARCH=gguf.MODEL_ARCH.LLAMA
+ARCH = gguf.MODEL_ARCH.LLAMA
 NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
 DEFAULT_CONCURRENCY = 8
 #
@ -339,29 +338,15 @@ class BpeVocab:
    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.bpe_tokenizer
        from transformers.models.gpt2 import tokenization_gpt2  # type: ignore[import]
-        byte_encoder = tokenization_gpt2.bytes_to_unicode()
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
-        byte_decoder = {v: k for k, v in byte_encoder.items()}
+
-        score = 0.0
+        for i, _ in enumerate(tokenizer):
-        for i, item in enumerate(tokenizer):
+            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
            text: bytes = item.encode("utf-8")
            # FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
            if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
                if i == 0 and text == b'<unk>':
                    toktype = gguf.TokenType.UNKNOWN
                elif i == 1 or i == 2:
                    toktype = gguf.TokenType.CONTROL
                elif i >= 3 and text.startswith(b'<0x'):
                    toktype = gguf.TokenType.BYTE
                else:
                    toktype = gguf.TokenType.NORMAL
            else:
                toktype = gguf.TokenType.NORMAL
            yield text, score, toktype
    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        for text in self.added_tokens_list:
            score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.bpe_tokens()
@ -953,7 +938,7 @@ class OutputFile:
        of.close()
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
-    wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
+    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
        return GGMLFileType.AllF32
--- a/docs/BLIS.md
+++ b/docs/BLIS.md
@ -49,7 +49,7 @@ According to the BLIS documentation, we could set the following
 environment variables to modify the behavior of openmp:
 ```bash
-export GOMP_GPU_AFFINITY="0-19"
+export GOMP_CPU_AFFINITY="0-19"
 export BLIS_NUM_THREADS=14
 ```
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -25,9 +25,11 @@ else()
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(simple)
    add_subdirectory(batched)
    add_subdirectory(batched-bench)
    add_subdirectory(speculative)
    add_subdirectory(parallel)
    add_subdirectory(embd-input)
    add_subdirectory(llava)
    add_subdirectory(llama-bench)
    add_subdirectory(beam-search)
    if (LLAMA_METAL)
--- a/examples/batched-bench/CMakeLists.txt
+++ b/examples/batched-bench/CMakeLists.txt
@ -0,0 +1,5 @@
 set(TARGET batched-bench)
 add_executable(${TARGET} batched-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@ -0,0 +1,51 @@
 # llama.cpp/example/batched-bench
 Benchmark the batched decoding performance of `llama.cpp`
 ## Usage
 There are 2 modes of operation:
 - `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
 ```bash
 ./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
 ./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
 ./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
 # custom set of batches
 ./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
 ```
 ## Sample results
 - `PP` - prompt tokens per batch
 - `TG` - generated tokens per batch
 - `B` - number of batches
 - `N_KV` - required KV cache size
 - `T_PP` - prompt processing time (i.e. time to first token)
 - `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
 - `T_TG` - time to generate all batches
 - `S_TG` - text generation speed (`(B*TG)/T_TG`)
 - `T` - total time
 - `S` - total speed (i.e. all tokens / total time)
 |    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
 |-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
 |   128 |    128 |    1 |    256 |    0.108 |  1186.64 |    3.079 |    41.57 |    3.187 |    80.32 |
 |   128 |    128 |    2 |    512 |    0.198 |  1295.19 |    5.029 |    50.90 |    5.227 |    97.95 |
 |   128 |    128 |    4 |   1024 |    0.373 |  1373.96 |    6.878 |    74.44 |    7.251 |   141.23 |
 |   128 |    128 |    8 |   2048 |    0.751 |  1363.27 |    7.344 |   139.43 |    8.095 |   252.99 |
 |   128 |    128 |   16 |   4096 |    1.570 |  1304.68 |    8.455 |   242.23 |   10.024 |   408.60 |
 |   128 |    128 |   32 |   8192 |    3.408 |  1201.73 |    8.801 |   465.40 |   12.209 |   670.96 |
 |   128 |    256 |    1 |    384 |    0.107 |  1196.70 |    6.329 |    40.45 |    6.436 |    59.67 |
 |   128 |    256 |    2 |    768 |    0.194 |  1317.45 |   10.239 |    50.00 |   10.433 |    73.61 |
 |   128 |    256 |    4 |   1536 |    0.366 |  1399.03 |   13.960 |    73.35 |   14.326 |   107.22 |
 |   128 |    256 |    8 |   3072 |    0.751 |  1363.92 |   15.110 |   135.54 |   15.861 |   193.69 |
 |   128 |    256 |   16 |   6144 |    1.569 |  1304.93 |   18.073 |   226.64 |   19.642 |   312.80 |
 |   128 |    256 |   32 |  12288 |    3.409 |  1201.35 |   19.223 |   426.15 |   22.633 |   542.93 |
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@ -0,0 +1,251 @@
 #include "common.h"
 #include "llama.h"
 #include <algorithm>
 #include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 // mutates the input string
 static std::vector<int> parse_list(char * p) {
    std::vector<int> ret;
    char * q = p;
    while (*p) {
        if (*p == ',') {
            *p = '\0';
            ret.push_back(std::atoi(q));
            q = p + 1;
        }
        ++p;
    }
    ret.push_back(std::atoi(q));
    return ret;
 }
 int main(int argc, char ** argv) {
    gpt_params params;
    if (argc == 1 || argv[1][0] == '-') {
        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
        printf("  example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
        return 1 ;
    }
    int n_kv_max     = 2048;
    int is_pp_shared = 0;
    int n_gpu_layers = 0;
    int mmq          = 0;
    std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
    std::vector<int> n_tg = { 128, 256, };
    std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
    //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
    if (argc >= 2) {
        params.model = argv[1];
    }
    if (argc >= 3) {
        n_kv_max = std::atoi(argv[2]);
    }
    if (argc >= 4) {
        is_pp_shared = std::atoi(argv[3]);
    }
    if (argc >= 5) {
        n_gpu_layers = std::atoi(argv[4]);
    }
    if (argc >= 6) {
        mmq = std::atoi(argv[5]);
    }
    if (argc >= 7) {
        n_pp = parse_list(argv[6]);
    }
    if (argc >= 8) {
        n_tg = parse_list(argv[7]);
    }
    if (argc >= 9) {
        n_pl = parse_list(argv[8]);
    }
    // init LLM
    llama_backend_init(params.numa);
    // initialize the model
    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = n_gpu_layers;
    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }
    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.seed      = 1234;
    ctx_params.n_ctx     = n_kv_max;
    ctx_params.n_batch   = 512;
    ctx_params.mul_mat_q = mmq;
    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }
    llama_batch batch = llama_batch_init(n_kv_max, 0);
    // decode in batches of ctx_params.n_batch tokens
    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
            llama_batch batch_view = {
                n_tokens,
                batch.token  + i,
                nullptr,
                batch.pos    + i,
                batch.seq_id + i,
                batch.logits + i,
                0, 0, 0, // unused
            };
            const int ret = llama_decode(ctx, batch_view);
            if (ret != 0) {
                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                return false;
            }
        }
        return true;
    };
    // warm up
    {
        batch.n_tokens = 16;
        for (int i = 0; i < batch.n_tokens; ++i) {
            batch.token[i]  = 0;
            batch.pos[i]    = i;
            batch.seq_id[i] = 0;
            batch.logits[i] = false;
        }
        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }
    }
    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
    for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
        for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
            for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
                const int pp = n_pp[i_pp];
                const int tg = n_tg[i_tg];
                const int pl = n_pl[i_pl];
                const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
                if (n_ctx_req > n_kv_max) {
                    continue;
                }
                batch.n_tokens = is_pp_shared ? pp : pl*pp;
                for (int i = 0; i < batch.n_tokens; ++i) {
                    batch.token[i]  = 0;
                    batch.pos[i]    = i;
                    batch.seq_id[i] = 0;
                    batch.logits[i] = false;
                }
                batch.logits[batch.n_tokens - 1] = true;
                const auto t_pp_start = ggml_time_us();
                llama_kv_cache_tokens_rm(ctx, -1, -1);
                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                    LOG_TEE("%s: llama_decode() failed\n", __func__);
                    return 1;
                }
                if (is_pp_shared) {
                    for (int32_t i = 1; i < pl; ++i) {
                        llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
                    }
                }
                const auto t_pp_end = ggml_time_us();
                const auto t_tg_start = ggml_time_us();
                for (int i = 0; i < tg; ++i) {
                    batch.n_tokens = pl;
                    for (int j = 0; j < pl; ++j) {
                        batch.token[j]  = 0;
                        batch.pos[j]    = pp + i;
                        batch.seq_id[j] = j;
                        batch.logits[j] = true;
                    }
                    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                        LOG_TEE("%s: llama_decode() failed\n", __func__);
                        return 1;
                    }
                }
                const auto t_tg_end = ggml_time_us();
                const int32_t n_kv = n_ctx_req;
                const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
                const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
                const float t    = t_pp + t_tg;
                const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
                const float speed_tg = pl*tg / t_tg;
                const float speed    = n_kv / t;
                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
            }
        }
    }
    llama_print_timings(ctx);
    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    fprintf(stderr, "\n\n");
    return 0;
 }
--- a/examples/batched.swift/.gitignore
+++ b/examples/batched.swift/.gitignore
@ -0,0 +1,9 @@
 .DS_Store
 /.build
 /Packages
 xcuserdata/
 DerivedData/
 .swiftpm/configuration/registries.json
 .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
 .netrc
 batched_swift
--- a/examples/batched.swift/Makefile
+++ b/examples/batched.swift/Makefile
@ -0,0 +1,6 @@
 .PHONY: build
 build:
 	xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
 	rm -f ./batched_swift
 	ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
--- a/examples/batched.swift/Package.swift
+++ b/examples/batched.swift/Package.swift
@ -0,0 +1,22 @@
 // swift-tools-version: 5.5
 // The swift-tools-version declares the minimum version of Swift required to build this package.
 import PackageDescription
 let package = Package(
    name: "batched_swift",
    platforms: [.macOS(.v12)],
    dependencies: [
        .package(name: "llama", path: "../../"),
    ],
    targets: [
        // Targets are the basic building blocks of a package, defining a module or a test suite.
        // Targets can depend on other targets in this package and products from dependencies.
        .executableTarget(
            name: "batched_swift",
            dependencies: ["llama"],
            path: "Sources",
            linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
        ),
    ]
 )
--- a/examples/batched.swift/README.md
+++ b/examples/batched.swift/README.md
@ -0,0 +1,4 @@
 This is a swift clone of `examples/batched`.
 $ `make`
 $ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@ -0,0 +1,255 @@
 import Foundation
 import llama
 let arguments = CommandLine.arguments
 // Check that we have at least one argument (the model path)
 guard arguments.count > 1 else {
    print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
    exit(1)
 }
 let modelPath: String = arguments[1]
 let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
 let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1
 // total length of the sequences including the prompt
 let n_len: Int = 32
 // init LLM
 llama_backend_init(false)
 defer {
    llama_backend_free()
 }
 let model_params = llama_model_default_params()
 guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
    print("Failed to load model")
    exit(1)
 }
 defer {
    llama_free_model(model)
 }
 var tokens = tokenize(text: prompt, add_bos: true)
 let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
 var context_params = llama_context_default_params()
 context_params.seed = 1234
 context_params.n_ctx = n_kv_req
 context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
 context_params.n_threads_batch = 8
 let context = llama_new_context_with_model(model, context_params)
 guard context != nil else {
    print("Failed to initialize context")
    exit(1)
 }
 defer {
    llama_free(context)
 }
 let n_ctx = llama_n_ctx(context)
 print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
 if n_kv_req > n_ctx {
    print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
    exit(1)
 }
 var buffer: [CChar] = []
 for id: llama_token in tokens {
    print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
 }
 print("\n")
 var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0)
 defer {
    llama_batch_free(batch)
 }
 // evaluate the initial prompt
 batch.n_tokens = Int32(tokens.count)
 for (i, token) in tokens.enumerated() {
    batch.token[i] = token
    batch.pos[i] = Int32(i)
    batch.seq_id[i] = 0
    batch.logits[i] = 0
 }
 // llama_decode will output logits only for the last token of the prompt
 batch.logits[Int(batch.n_tokens) - 1] = 1
 if llama_decode(context, batch) != 0 {
    print("llama_decode() failed")
    exit(1)
 }
 for i in 1 ..< n_parallel {
    llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
 }
 if n_parallel > 1 {
    print("generating \(n_parallel) sequences ...\n")
 }
 var streams: [String] = .init(repeating: "", count: n_parallel)
 var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
 var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
 var n_cur = batch.n_tokens
 var n_decode = 0
 let t_main_start = ggml_time_us()
 while n_cur <= n_len {
    // prepare the next batch
    batch.n_tokens = 0
    // sample the next token for each parallel sequence / stream
    for i in 0 ..< n_parallel {
        if i_batch[i] < 0 {
            // the stream has already finished
            continue
        }
        var n_vocab = llama_n_vocab(model)
        var logits = llama_get_logits_ith(context, i_batch[i])
        var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
        for token_id in 0 ..< n_vocab {
            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
        }
        var candidates_p: llama_token_data_array = .init(
            data: &candidates,
            size: candidates.count,
            sorted: false
        )
        let top_k: Int32 = 40
        let top_p: Float = 0.9
        let temp: Float = 0.4
        llama_sample_top_k(context, &candidates_p, top_k, 1)
        llama_sample_top_p(context, &candidates_p, top_p, 1)
        llama_sample_temp(context, &candidates_p, temp)
        let new_token_id = llama_sample_token(context, &candidates_p)
        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
        // is it an end of stream? -> mark the stream as finished
        if new_token_id == llama_token_eos(context) || n_cur == n_len {
            i_batch[i] = -1
            // print("")
            if n_parallel > 1 {
                print("stream \(i) finished at n_cur = \(n_cur)")
            }
            continue
        }
        let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
        // if there is only one stream, we print immediately to stdout
        if n_parallel == 1 {
            print(nextStringPiece, terminator: "")
        }
        streams[i] += nextStringPiece
        // push this new token for next evaluation
        batch.token[Int(batch.n_tokens)] = new_token_id
        batch.pos[Int(batch.n_tokens)] = n_cur
        batch.seq_id[Int(batch.n_tokens)] = Int32(i)
        batch.logits[Int(batch.n_tokens)] = 1
        i_batch[i] = batch.n_tokens
        batch.n_tokens += 1
        n_decode += 1
    }
    // all streams are finished
    if batch.n_tokens == 0 {
        break
    }
    n_cur += 1
    // evaluate the current batch with the transformer model
    if llama_decode(context, batch) != 0 {
        print("llama_decode() failed")
        exit(1)
    }
 }
 if n_parallel > 1 {
    print("\n")
    for (i, stream) in streams.enumerated() {
        print("sequence \(i):\n\n\(prompt)\(stream)\n")
    }
 }
 let t_main_end = ggml_time_us()
 print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
 llama_print_timings(context)
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
    let n_tokens = text.count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
    }
    tokens.deallocate()
    return swiftTokens
 }
 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
    var result = [CChar](repeating: 0, count: 8)
    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
    if nTokens < 0 {
        if result.count >= -Int(nTokens) {
            result.removeLast(-Int(nTokens))
        } else {
            result.removeAll()
        }
        let check = llama_token_to_piece(
            model,
            token,
            &result,
            Int32(result.count)
        )
        assert(check == nTokens)
    } else {
        result.removeLast(result.count - Int(nTokens))
    }
    if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
        return utfString
    } else {
        buffer.append(contentsOf: result)
        let data = Data(buffer.map { UInt8(bitPattern: $0) })
        if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
            buffer = []
        }
        guard let bufferString = String(data: data, encoding: .utf8) else {
            return nil
        }
        buffer = []
        return bufferString
    }
    return nil
 }
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
    ctx_params.seed  = 1234;
    ctx_params.n_ctx = n_kv_req;
    ctx_params.n_batch = std::max(n_len, n_parallel);
-    ctx_params.n_threads = params.n_threads;
+    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
--- a/examples/chat-persistent.sh
+++ b/examples/chat-persistent.sh
@ -9,7 +9,7 @@ if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
    exit 1
 fi
-MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
+MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}"
 PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
 USER_NAME="${USER_NAME:-User}"
 AI_NAME="${AI_NAME:-ChatLLaMa}"
@ -61,9 +61,9 @@ fi
 if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
    echo 'Prompt cache does not exist, building...'
-    # Default batch_size to 8 here for better user feedback during initial prompt processing
+    # Default batch_size to 64 here for better user feedback during initial prompt processing
    ./main 2>>"$LOG" \
-        --batch_size 8 \
+        --batch_size 64 \
        "${OPTS[@]}" \
        --prompt-cache "$PROMPT_CACHE_FILE" \
        --file "$CUR_PROMPT_FILE" \
@ -132,7 +132,7 @@ while read -e line; do
    # HACK get num tokens from debug message
    # TODO get both messages in one go
    if  ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
-        ! sample_time_msg="$( tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
+        ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
        echo >&2 "Couldn't get number of tokens from ./main output!"
        exit 1
    fi
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@ -128,21 +128,22 @@ bool eval_string(struct MyModel * mymodel,const char* str){
 llama_token sampling_id(struct MyModel* mymodel) {
    llama_context* ctx = mymodel->ctx;
    gpt_params params = mymodel->params;
    llama_sampling_params & sparams = params.sampling_params;
    // int n_ctx = llama_n_ctx(ctx);
    // out of user input, sample next token
-    const float   temp            = params.temp;
+    const float   temp            = sparams.temp;
-    const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k;
+    const int32_t top_k           = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
-    const float   top_p           = params.top_p;
+    const float   top_p           = sparams.top_p;
-    const float   tfs_z           = params.tfs_z;
+    const float   tfs_z           = sparams.tfs_z;
-    const float   typical_p       = params.typical_p;
+    const float   typical_p       = sparams.typical_p;
    // const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
    // const float   repeat_penalty  = params.repeat_penalty;
    // const float   alpha_presence  = params.presence_penalty;
    // const float   alpha_frequency = params.frequency_penalty;
-    const int     mirostat        = params.mirostat;
+    const int     mirostat        = sparams.mirostat;
-    const float   mirostat_tau    = params.mirostat_tau;
+    const float   mirostat_tau    = sparams.mirostat_tau;
-    const float   mirostat_eta    = params.mirostat_eta;
+    const float   mirostat_eta    = sparams.mirostat_eta;
    // const bool    penalize_nl     = params.penalize_nl;
    llama_token id = 0;
@ -151,7 +152,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
        auto n_vocab = llama_n_vocab(llama_get_model(ctx));
        // Apply params.logit_bias map
-        for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
            logits[it->first] += it->second;
        }
--- a/examples/finetune/README.md
+++ b/examples/finetune/README.md
@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
 ```
-The scale numbers don't need to add up to one, and you can also use numbers creater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
+The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
 Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
 If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
--- a/examples/finetune/convert-finetune-checkpoint-to-gguf.py
+++ b/examples/finetune/convert-finetune-checkpoint-to-gguf.py
@ -313,7 +313,7 @@ class ModelParams:
        gguf_writer.add_feed_forward_length(self.get_n_ff())
 def tensor_name(key, bid=None, suffix=".weight"):
-    return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + suffix
+    return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
 class Layer:
    def __init__(self, params, lora_params, bid):
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@ -332,8 +332,8 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
        assert_shape_1d(layer.attention_norm, hparams.n_embd);
        assert_shape_2d(layer.wq,             hparams.n_embd, hparams.n_embd);
-        assert_shape_2d(layer.wk,             hparams.n_embd, hparams.n_embd);
+        assert_shape_2d(layer.wk,             hparams.n_embd, hparams.n_embd_gqa());
-        assert_shape_2d(layer.wv,             hparams.n_embd, hparams.n_embd);
+        assert_shape_2d(layer.wv,             hparams.n_embd, hparams.n_embd_gqa());
        assert_shape_2d(layer.wo,             hparams.n_embd, hparams.n_embd);
        assert_shape_1d(layer.ffn_norm,       hparams.n_embd);
        assert_shape_2d(layer.w1,             hparams.n_embd, hparams.n_ff);
@ -529,13 +529,14 @@ static void init_lora(const struct my_llama_model * model, struct my_llama_lora
    set_param_lora(lora);
    // measure data size
-    struct ggml_allocr * alloc = NULL;
+    size_t size = 0;
-    alloc = ggml_allocr_new_measure(tensor_alignment);
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-    alloc_lora(alloc, lora);
+        size += GGML_PAD(ggml_nbytes(t), tensor_alignment);
    }
    // allocate data
-    lora->data.resize(ggml_allocr_max_size(alloc) + tensor_alignment);
+    struct ggml_allocr * alloc = NULL;
-    ggml_allocr_free(alloc);
+    lora->data.resize(size + tensor_alignment);
    alloc = ggml_allocr_new(lora->data.data(), lora->data.size(), tensor_alignment);
    alloc_lora(alloc, lora);
    ggml_allocr_free(alloc);
@ -626,7 +627,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    // KQ_pos - contains the positions
    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    {
+    ggml_allocr_alloc(alloc, KQ_pos);
    if (!ggml_allocr_is_measure(alloc)) {
        int * data = (int *) KQ_pos->data;
        for (int i = 0; i < N; ++i) {
            data[i] = n_past + i;
@ -786,6 +788,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
    GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
    ggml_allocr_alloc(alloc, t36->grad);
    // KQ_pos
    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));
    // make sure base model tensors data cannot be used in viewable operations
    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one));
@ -1711,11 +1715,9 @@ int main(int argc, char ** argv) {
    struct ggml_tensor * target_probs  = ggml_new_tensor_3d(ctx_input, GGML_TYPE_F32, n_vocab,  n_tokens, n_batch);
    // measure required memory for input tensors
-    alloc = ggml_allocr_new_measure(tensor_alignment);
+    size_t max_input_size = GGML_PAD(ggml_nbytes(tokens_input), tensor_alignment) +
-    ggml_allocr_alloc(alloc, tokens_input);
+                            GGML_PAD(ggml_nbytes(target_probs), tensor_alignment) +
-    ggml_allocr_alloc(alloc, target_probs);
+                            tensor_alignment;
    size_t max_input_size = ggml_allocr_max_size(alloc) + tensor_alignment;
    ggml_allocr_free(alloc);
    printf("%s: input_size = %zu bytes (%.1f MB)\n", __func__, max_input_size, (float) max_input_size / (1024.0f*1024.0f));
    // allocate input tensors
--- a/examples/infill/CMakeLists.txt
+++ b/examples/infill/CMakeLists.txt
@ -0,0 +1,8 @@
 set(TARGET infill)
 add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
 endif()
--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@ -0,0 +1,41 @@
 # llama.cpp/example/infill
 This example shows how to use the infill mode with Code Llama models supporting infill mode.
 Currently the 7B and 13B models support infill mode.
 Infill supports most of the options available in the main example.
 For further information have a look at the main README.md in llama.cpp/example/main/README.md
 ## Common Options
 In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:
 -   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
 -   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
 ## Input Prompts
 The `infill` program provides several ways to interact with the LLaMA models using input prompts:
 -   `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
 -   `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
 -   `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
 ## Interaction
 The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive`, and `--interactive-first`
 ### Interaction Options
 -   `-i, --interactive`: Run the program in interactive mode, allowing users to get real time code suggestions from model.
 -   `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
 -   `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
 ### Example
 ```bash
 ./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n    print(\"hell" --in-suffix "\n   print(\"goodbye world\")\n    "
 ```
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@ -0,0 +1,800 @@
 #include "common.h"
 #include "console.h"
 #include "llama.h"
 #include "build-info.h"
 #include "grammar-parser.h"
 #include <cassert>
 #include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
 #include <fstream>
 #include <iostream>
 #include <sstream>
 #include <string>
 #include <vector>
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
 #include <unistd.h>
 #elif defined (_WIN32)
 #define WIN32_LEAN_AND_MEAN
 #ifndef NOMINMAX
 #define NOMINMAX
 #endif
 #include <windows.h>
 #include <signal.h>
 #endif
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
 static gpt_params               * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream       * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 static void write_logfile(
    const llama_context * ctx, const gpt_params & params, const llama_model * model,
    const std::vector<llama_token> & input_tokens, const std::string & output,
    const std::vector<llama_token> & output_tokens
 ) {
    if (params.logdir.empty()) {
        return;
    }
    const std::string timestamp = get_sortable_timestamp();
    const bool success = create_directory_with_parents(params.logdir);
    if (!success) {
        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
                __func__, params.logdir.c_str());
        return;
    }
    const std::string logfile_path = params.logdir + timestamp + ".yml";
    FILE * logfile = fopen(logfile_path.c_str(), "w");
    if (logfile == NULL) {
        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }
    fprintf(logfile, "binary: infill\n");
    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
    dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
    fprintf(logfile, "\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "# Generation Results #\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "\n");
    dump_string_yaml_multiline(logfile, "output", output.c_str());
    dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
 }
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
    if (signo == SIGINT) {
        if (!is_interacting) {
            is_interacting = true;
        } else {
            console::cleanup();
            printf("\n");
            llama_print_timings(*g_ctx);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
        }
    }
 }
 #endif
 int main(int argc, char ** argv) {
    gpt_params params;
    llama_sampling_params & sparams = params.sampling_params;
    g_params = &params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }
 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("infill", "log"));
    LOG_TEE("Log start\n");
    log_dump_cmdline(argc, argv);
 #endif // LOG_DISABLE_LOGS
    console::init(params.simple_io, params.use_color);
    atexit([]() { console::cleanup(); });
    if (params.logits_all) {
        printf("\n************\n");
        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
        printf("************\n\n");
        return 0;
    }
    if (params.embedding) {
        printf("\n************\n");
        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
        printf("************\n\n");
        return 0;
    }
    if (params.n_ctx != 0 && params.n_ctx < 8) {
        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
        params.n_ctx = 8;
    }
    if (params.instruct) {
        printf("\n************\n");
        printf("%s: please use the 'main' tool for instruct mode\n", __func__);
        printf("************\n\n");
        return 0;
    }
    if (!params.antiprompt.empty()) {
        printf("\n************\n");
        printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
        printf("************\n\n");
        return 0;
    }
    if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
        printf("\n************\n");
        printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
        printf("************\n\n");
        return 0;
    }
    if (params.random_prompt) {
        printf("\n************\n");
        printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
        printf("************\n\n");
        return 0;
    }
    if (!params.path_prompt_cache.empty()) {
        printf("\n************\n");
        printf("%s: infill does not support prompt caching\n", __func__);
        printf("************\n\n");
        return 0;
    }
    if (params.rope_freq_base != 0.0) {
        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
    }
    if (params.rope_freq_scale != 0.0) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }
    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
    LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }
    LOG_TEE("%s: seed  = %u\n", __func__, params.seed);
    std::mt19937 rng(params.seed);
    LOG("%s: llama backend init\n", __func__);
    llama_backend_init(params.numa);
    llama_model * model;
    llama_context * ctx;
    llama_context * ctx_guidance = NULL;
    g_model = &model;
    g_ctx = &ctx;
    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (sparams.cfg_scale > 1.f) {
        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
        ctx_guidance = llama_new_context_with_model(model, lparams);
    }
    if (model == NULL) {
        LOG_TEE("%s: error: unable to load model\n", __func__);
        return 1;
    }
    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
    LOG("n_ctx: %d\n", n_ctx);
    if (n_ctx > n_ctx_train) {
        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
    }
    // print system information
    {
        LOG_TEE("\n");
        LOG_TEE("%s\n", get_system_info(params).c_str());
    }
    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
    LOG("add_bos: %d\n", add_bos);
    bool suff_rm_leading_spc = params.escape;
    if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
        params.input_suffix.erase(0, 1);
        suff_rm_leading_spc = false;
    }
    std::vector<llama_token> embd_inp;
    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
    const int space_token = 29871;
    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
        inp_sfx.erase(inp_sfx.begin());
    }
    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
    if (add_bos) {
        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
    }
    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
    embd_inp = inp_pfx;
    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
    embd_inp.push_back(llama_token_middle(ctx));
    LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
    LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
    // Should not run without any tokens
    if (embd_inp.empty()) {
        embd_inp.push_back(llama_token_bos(ctx));
        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
    }
    // Tokenize negative prompt
    std::vector<llama_token> guidance_inp;
    int guidance_offset = 0;
    int original_prompt_len = 0;
    if (ctx_guidance) {
        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
        original_prompt_len = original_inp.size();
        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
        LOG("guidance_offset:     %s", log_tostr(guidance_offset));
    }
    if ((int) embd_inp.size() > n_ctx - 4) {
        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
    }
    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
        params.n_keep = (int)embd_inp.size();
    }
    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
    // enable interactive mode if interactive start is specified
    if (params.interactive_first) {
        params.interactive = true;
    }
    if (params.verbose_prompt) {
        LOG_TEE("\n");
        LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }
        if (ctx_guidance) {
            LOG_TEE("\n");
            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
            for (int i = 0; i < (int) guidance_inp.size(); i++) {
                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
            }
        }
        if (params.n_keep > 0) {
        LOG_TEE("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
            }
            LOG_TEE("'\n");
        }
        LOG_TEE("\n");
    }
    if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
        sigint_action.sa_handler = sigint_handler;
        sigemptyset (&sigint_action.sa_mask);
        sigint_action.sa_flags = 0;
        sigaction(SIGINT, &sigint_action, NULL);
 #elif defined (_WIN32)
        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
        };
        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
        LOG_TEE("%s: interactive mode on.\n", __func__);
        if (params.input_prefix_bos) {
            LOG_TEE("Input prefix with BOS\n");
        }
        if (!params.input_prefix.empty()) {
            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
        }
        if (!params.input_suffix.empty()) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
        }
    }
    LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
            sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");
    struct llama_grammar * grammar = NULL;
    grammar_parser::parse_state parsed_grammar;
    if (!params.grammar.empty()) {
        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
        // will be empty (default) if there are parse errors
        if (parsed_grammar.rules.empty()) {
            return 1;
        }
        LOG_TEE("%s: grammar:\n", __func__);
        grammar_parser::print_grammar(stderr, parsed_grammar);
        LOG_TEE("\n");
        {
            auto it = sparams.logit_bias.find(llama_token_eos(ctx));
            if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
                LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
            }
        }
        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
        grammar = llama_grammar_init(
            grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
    }
    // TODO: replace with ring-buffer
    std::vector<llama_token> last_tokens(n_ctx);
    std::fill(last_tokens.begin(), last_tokens.end(), 0);
    LOG_TEE("\n#####  Infill mode  #####\n\n");
    if (params.infill) {
        printf("\n************\n");
        printf("no need to specify '--infill', always running infill\n");
        printf("************\n\n");
    }
    if (params.interactive) {
        const char *control_message;
        if (params.multiline_input) {
            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
                              " - To return control without starting a new line, end your input with '/'.\n";
        } else {
            control_message = " - Press Return to return control to LLaMa.\n"
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }
        LOG_TEE("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
        LOG_TEE(       " - Press Ctrl+C to interject at any time.\n");
 #endif
        LOG_TEE(       "%s\n", control_message);
        is_interacting = params.interactive_first;
    }
    bool input_echo           = true;
    int n_past             = 0;
    int n_remain           = params.n_predict;
    int n_consumed         = 0;
    int n_past_guidance    = 0;
    std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
    std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
    std::ostringstream output_ss;     g_output_ss     = &output_ss;
    // the first thing we will do is to output the prompt, so set color accordingly
    console::set_display(console::prompt);
    std::vector<llama_token> embd;
    std::vector<llama_token> embd_guidance;
    const int n_vocab = llama_n_vocab(model);
    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    while (n_remain != 0 || params.interactive) {
        // predict
        if (!embd.empty()) {
            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
            // --prompt or --file which uses the same value.
            int max_embd_size = n_ctx - 4;
            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
            if ((int) embd.size() > max_embd_size) {
                const int skipped_tokens = (int) embd.size() - max_embd_size;
                embd.resize(max_embd_size);
                console::set_display(console::error);
                printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                console::set_display(console::reset);
                fflush(stdout);
            }
            // infinite text generation via context swapping
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
                if (params.n_predict == -2) {
                    LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                    break;
                }
                const int n_left    = n_past - params.n_keep - 1;
                const int n_discard = n_left/2;
                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                    n_past, n_left, n_ctx, params.n_keep, n_discard);
                llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
                llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
                n_past -= n_discard;
                if (ctx_guidance) {
                    n_past_guidance -= n_discard;
                }
                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
            }
            // evaluate tokens in batches
            // embd is typically prepared beforehand to fit within a batch, but not always
            if (ctx_guidance) {
                int input_size = 0;
                llama_token * input_buf = NULL;
                if (n_past_guidance < (int) guidance_inp.size()) {
                    // Guidance context should have the same data with these modifications:
                    //
                    // * Replace the initial prompt
                    // * Shift everything by guidance_offset
                    embd_guidance = guidance_inp;
                    if (embd.begin() + original_prompt_len < embd.end()) {
                        embd_guidance.insert(
                            embd_guidance.end(),
                            embd.begin() + original_prompt_len,
                            embd.end()
                        );
                    }
                    input_buf  = embd_guidance.data();
                    input_size = embd_guidance.size();
                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
                } else {
                    input_buf  = embd.data();
                    input_size = embd.size();
                }
                for (int i = 0; i < input_size; i += params.n_batch) {
                    int n_eval = std::min(input_size - i, params.n_batch);
                    if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
                        LOG_TEE("%s : failed to eval\n", __func__);
                        return 1;
                    }
                    n_past_guidance += n_eval;
                }
            }
            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                int n_eval = (int) embd.size() - i;
                if (n_eval > params.n_batch) {
                    n_eval = params.n_batch;
                }
                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
                    LOG_TEE("%s : failed to eval\n", __func__);
                    return 1;
                }
                n_past += n_eval;
                LOG("n_past = %d\n", n_past);
            }
        }
        embd.clear();
        embd_guidance.clear();
        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
            const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
            last_tokens.erase(last_tokens.begin());
            last_tokens.push_back(id);
            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
            embd.push_back(id);
            // echo this to console
            input_echo = true;
            // decrement remaining sampling budget
            --n_remain;
            LOG("n_remain: %d\n", n_remain);
        } else {
            // some user input remains from prompt or interaction, forward it to processing
            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);
                last_tokens.erase(last_tokens.begin());
                last_tokens.push_back(embd_inp[n_consumed]);
                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
                    break;
                }
            }
        }
        // display text
        if (input_echo) {
            for (auto id : embd) {
                const std::string token_str = llama_token_to_piece(ctx, id);
                printf("%s", token_str.c_str());
                if (embd.size() > 1) {
                    input_tokens.push_back(id);
                } else {
                    output_tokens.push_back(id);
                    output_ss << token_str;
                }
            }
            fflush(stdout);
        }
        // reset color to default if we there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
            console::set_display(console::reset);
        }
        // if not currently processing queued inputs;
        if ((int) embd_inp.size() <= n_consumed) {
            // deal with eot token in infill mode
            if ((last_tokens.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
                if(is_interacting && !params.interactive_first) {
                    // print an eot token
                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
                }
                fflush(stdout);
                printf("\n");
                console::set_display(console::user_input);
                std::string buffer;
                std::string line;
                bool another_line=true;
                // set a new prefix via stdin
                do {
                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);
                // check if we got an empty line, if so we use the old input
                if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_prefix = buffer;
                }
                buffer.clear();
                // set a new suffix via stdin
                do {
                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);
                // check if we got an empty line
                if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_suffix = buffer;
                }
                buffer.clear();
                // done taking input, reset color
                console::set_display(console::reset);
                if (params.escape) {
                    //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
                    process_escapes(params.input_prefix);
                    process_escapes(params.input_suffix);
                }
                suff_rm_leading_spc = params.escape;
                if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
                    params.input_suffix.erase(0, 1);
                    suff_rm_leading_spc = false;
                }
                // tokenize new prefix and suffix
                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
                if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
                    inp_sfx.erase(inp_sfx.begin());
                }
                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
                if (add_bos) {
                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
                }
                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
                embd_inp = inp_pfx;
                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                embd_inp.push_back(llama_token_middle(ctx));
                embd.clear();
                embd_guidance.clear();
                n_remain = params.n_predict;
                n_past = 0;
                n_consumed = 0;
                // LOG_TEE("took new input\n");
                is_interacting = false;
            }
            // deal with end of text token in interactive mode
            else if (last_tokens.back() == llama_token_eos(ctx)) {
                LOG("found EOS token\n");
                if (params.interactive) {
                    is_interacting = true;
                    printf("\n");
                    console::set_display(console::user_input);
                    fflush(stdout);
               }
            }
            if (n_past > 0 && is_interacting && !params.interactive) {
                LOG("waiting for user input\n");
                if (params.input_prefix_bos) {
                    LOG("adding input prefix BOS token\n");
                    embd_inp.push_back(llama_token_bos(ctx));
                }
                std::string buffer;
                if (!params.input_prefix.empty()) {
                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                    buffer += params.input_prefix;
                    printf("%s", buffer.c_str());
                }
                std::string line;
                bool another_line = true;
                do {
                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);
                // done taking input, reset color
                console::set_display(console::reset);
                // Add tokens to embd only if the input buffer is non-empty
                // Entering a empty line lets the user pass control back
                if (buffer.length() > 1) {
                    // append input suffix if any
                    if (!params.input_suffix.empty()) {
                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                        buffer += params.input_suffix;
                        printf("%s", params.input_suffix.c_str());
                    }
                    LOG("buffer: '%s'\n", buffer.c_str());
                    const size_t original_size = embd_inp.size();
                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
                    for (size_t i = original_size; i < embd_inp.size(); ++i) {
                        const llama_token token = embd_inp[i];
                        output_tokens.push_back(token);
                        output_ss << llama_token_to_piece(ctx, token);
                    }
                    n_remain -= line_inp.size();
                    LOG("n_remain: %d\n", n_remain);
                } else {
                    LOG("empty line, passing control back\n");
                }
                input_echo = false; // do not echo this again
            }
            if (n_past > 0) {
                if (is_interacting) {
                    // reset grammar state if we're restarting generation
                    if (grammar != NULL) {
                        llama_grammar_free(grammar);
                        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
                        grammar = llama_grammar_init(
                            grammar_rules.data(), grammar_rules.size(),
                            parsed_grammar.symbol_ids.at("root"));
                    }
                }
                is_interacting = false;
            }
        }
        // end of text token
        if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) {
            break;
        }
        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
        // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
        if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
            n_remain = params.n_predict;
            is_interacting = true;
        }
    }
    if (!params.interactive && n_remain <= 0) {
        printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
        fflush(stdout);
    }
    llama_print_timings(ctx);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
    if (ctx_guidance) { llama_free(ctx_guidance); }
    llama_free(ctx);
    llama_free_model(model);
    if (grammar != NULL) {
        llama_grammar_free(grammar);
    }
    llama_backend_free();
 #ifndef LOG_DISABLE_LOGS
    LOG_TEE("Log end\n");
 #endif // LOG_DISABLE_LOGS
    return 0;
 }
--- a/examples/jeopardy/README.md
+++ b/examples/jeopardy/README.md
@ -2,7 +2,7 @@
 This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
-The jeopardy test can be used to compare the fact knowledge of different models and compare them to eachother. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
+The jeopardy test can be used to compare the fact knowledge of different models and compare them to each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
 Step 1: Open jeopardy.sh and modify the following:
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@ -0,0 +1,20 @@
 set(TARGET clip)
 add_library(${TARGET} clip.cpp clip.h)
 install(TARGETS ${TARGET} LIBRARY)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
    endif()
 if(TARGET BUILD_INFO)
    add_dependencies(${TARGET} BUILD_INFO)
 endif()
 set(TARGET llava)
 add_executable(${TARGET} llava.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
    add_dependencies(${TARGET} BUILD_INFO)
 endif()
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@ -0,0 +1,57 @@
 # LLaVA
 Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants.
 The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
 and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
 models are available.
 After API is confirmed, more models will be supported / uploaded.
 ## Usage
 Build with cmake or run `make llava` to build it.
 After building, run: `./llava` to see the usage. For example:
 ```sh
 ./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```
 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
 ## Model conversion
 - Clone `llava-v15-7b`` and `clip-vit-large-patch14-336`` locally:
 ```sh
 git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
 git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 ```
 2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
 ```sh
 python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 ```
 3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```
 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
 ```sh
 python ./convert.py ../llava-v1.5-7b
 ```
 Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
 ## TODO
 - [ ] Support server mode.
 - [ ] Support non-CPU backend for the image encoding part.
 - [ ] Support different sampling methods.
 - [ ] Support more model variants.
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@ -0,0 +1,73 @@
 #ifndef CLIP_H
 #define CLIP_H
 #include "ggml.h"
 struct clip_ctx;
 #ifdef __cplusplus
 extern "C" {
 #endif
 struct clip_vision_hparams {
    int32_t image_size;
    int32_t patch_size;
    int32_t hidden_size;
    int32_t n_intermediate;
    int32_t projection_dim;
    int32_t n_head;
    int32_t n_layer;
    float eps;
 };
 struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
 void clip_free(struct clip_ctx * ctx);
 size_t clip_embd_nbytes(struct clip_ctx * ctx);
 int clip_n_patches(struct clip_ctx * ctx);
 int clip_n_mmproj_embd(struct clip_ctx * ctx);
 // RGB uint8 image
 struct clip_image_u8 {
    int nx;
    int ny;
    uint8_t * data;
    size_t size;
 };
 // RGB float32 image (NHWC)
 // Memory layout: RGBRGBRGB...
 struct clip_image_f32 {
    int nx;
    int ny;
    float * data;
    size_t size;
 };
 struct clip_image_u8_batch {
    struct clip_image_u8 * data;
    size_t size;
 };
 struct clip_image_f32_batch {
    struct clip_image_f32 * data;
    size_t size;
 };
 struct clip_image_u8 * make_clip_image_u8();
 struct clip_image_f32 * make_clip_image_f32();
 bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
 bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
 bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
 bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
                             float * vec);
 bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
 #ifdef __cplusplus
 }
 #endif
 #endif // CLIP_H
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@ -0,0 +1,250 @@
 import argparse
 import os
 import json
 import torch
 import numpy as np
 from gguf import *
 from transformers import CLIPModel, CLIPProcessor
 TEXT = "clip.text"
 VISION = "clip.vision"
 def k(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)
 def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
    if name in (
        "logit_scale",
        "text_model.embeddings.position_ids",
        "vision_model.embeddings.position_ids",
    ):
        return True
    if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
        return True
    if name.startswith("v") and not has_vision:
        return True
    if name.startswith("t") and not has_text:
        return True
    return False
 def get_tensor_name(name: str) -> str:
    if "projection" in name:
        return name
    if "mm_projector" in name:
        return name.replace("model.mm_projector", "mm")
    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
 def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a signficant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
 ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py")
 ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
 ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
 ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
 ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 args = ap.parse_args()
 if args.text_only and args.vision_only:
    print("--text-only and --image-only arguments cannot be specified at the same time.")
    exit(1)
 if args.use_f32:
    print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir
 with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)
    tokens = [key for key in vocab]
 with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
    v_hparams = config["vision_config"]
    t_hparams = config["text_config"]
 # possible data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
 #
 # map from ftype to string
 ftype_str = ["f32", "f16"]
 ftype = 1
 if args.use_f32:
    ftype = 0
 model = CLIPModel.from_pretrained(dir_model)
 processor = CLIPProcessor.from_pretrained(dir_model)
 fname_middle = None
 has_text_encoder = True
 has_vision_encoder = True
 has_llava_projector = False
 if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
 elif args.vision_only:
    fname_middle = "vision-"
    has_text_encoder = False
 elif args.llava_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_llava_projector = True
 else:
    fname_middle = ""
 output_dir = args.output_dir if args.output_dir is not None else dir_model
 os.makedirs(output_dir, exist_ok=True)
 output_prefix = os.path.basename(output_dir).replace("ggml_", "")
 fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
 fout = GGUFWriter(path=fname_out, arch="clip")
 fout.add_bool("clip.has_text_encoder", has_text_encoder)
 fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
 fout.add_bool("clip.has_llava_projector", has_llava_projector)
 fout.add_file_type(ftype)
 model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
 fout.add_name(model_name)
 if args.text_only:
    fout.add_description("text-only CLIP model")
 elif args.vision_only and not has_llava_projector:
    fout.add_description("vision-only CLIP model")
 elif has_llava_projector:
    fout.add_description("image encoder for LLaVA")
 else:
    fout.add_description("two-tower CLIP model")
 if has_text_encoder:
    # text_model hparams
    fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
    fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
    fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
    fout.add_token_list(tokens)
 if has_vision_encoder:
    # vision_model hparams
    fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
    fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
    fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"]))
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
    block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
    image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
    image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)
 use_gelu = v_hparams["hidden_act"] == "gelu"
 fout.add_bool("clip.use_gelu", use_gelu)
 if has_llava_projector:
    model.vision_model.encoder.layers.pop(-1)
    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
        if data.ndim == 2:
            data = data.squeeze().numpy().astype(np.float16)
        else:
            data = data.squeeze().numpy().astype(np.float32)
        fout.add_tensor(name, data)
    print("Projector tensors added\n")
 state_dict = model.state_dict()
 for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue
    name = get_tensor_name(name)
    data = data.squeeze().numpy()
    n_dims = len(data.shape)
    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)
 fout.write_header_to_file()
 fout.write_kv_data_to_file()
 fout.write_tensors_to_file()
 fout.close()
 print("Done. Output file: " + fname_out)
--- a/examples/llava/llava-surgery.py
+++ b/examples/llava/llava-surgery.py
@ -0,0 +1,30 @@
 import argparse
 import glob
 import os
 import torch
 ap = argparse.ArgumentParser()
 ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model")
 args = ap.parse_args()
 # find the model part that includes the the multimodal projector weights
 path = sorted(glob.glob(f"{args.model}/pytorch_model*.bin"))[-1]
 checkpoint = torch.load(path)
 # get a list of mm tensor names
 mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]
 # store these tensors in a new dictionary and torch.save them
 projector = {name: checkpoint[name] for name in mm_tensors}
 torch.save(projector, f"{args.model}/llava.projector")
 # remove these tensors from the checkpoint and save it again
 for name in mm_tensors:
    del checkpoint[name]
 torch.save(checkpoint, path)
 print("Done!")
 print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
 print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
--- a/examples/llava/llava-utils.h
+++ b/examples/llava/llava-utils.h
@ -0,0 +1,145 @@
 #pragma once
 // this one and clip lib will be eventually merged to a single lib, let's keep it this way for now
 #include "common.h"
 #include "llama.h"
 #include <cstdio>
 #include <cstdlib>
 #include <vector>
 inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) {
    int n_embd  = llama_n_embd(llama_get_model(ctx_llama));
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = N - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
 }
 inline bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
    int N = (int) tokens.size();
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
 }
 inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    return eval_tokens(ctx_llama, tokens, 1, n_past);
 }
 inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past){
    std::string              str2     = str;
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, true);
    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
    return true;
 }
 // TODO: use common/sampling.h
 inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
      // out of user input, sample next token
    const float   temp      = params.sampling_params.temp;
    const int32_t top_k     = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k;
    const float   top_p     = params.sampling_params.top_p;
    const float   tfs_z     = params.sampling_params.tfs_z;
    const float   typical_p = params.sampling_params.typical_p;
      // const int32_t repeat_last_n   = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n;
      // const float   repeat_penalty  = params.sampling_params.repeat_penalty;
      // const float   alpha_presence  = params.sampling_params.presence_penalty;
      // const float   alpha_frequency = params.sampling_params.frequency_penalty;
    const int     mirostat     = params.sampling_params.mirostat;
    const float   mirostat_tau = params.sampling_params.mirostat_tau;
    const float   mirostat_eta = params.sampling_params.mirostat_eta;
      // const bool    penalize_nl     = params.sampling_params.penalize_nl;
    llama_token id = 0;
    {
        auto logits  = llama_get_logits(ctx_llama);
        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
          // Apply params.logit_bias map
        for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) {
            logits[it->first] += it->second;
        }
        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
          // TODO: Apply penalties
          // float nl_logit = logits[llama_token_nl(ctx)];
          // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
          // llama_sample_repetition_penalty(ctx, &candidates_p,
          //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
          //      last_n_repeat, repeat_penalty);
          // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
          // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
          // last_n_repeat, alpha_frequency, alpha_presence);
          // if (!penalize_nl) {
          //     logits[llama_token_nl(ctx)] = nl_logit;
          // }
        if (temp <= 0) {
              // Greedy sampling
            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
        } else {
            if (mirostat == 1) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                const  int mirostat_m    = 100;
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
            } else if (mirostat == 2) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
            } else {
                  // Temperature sampling
                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
                llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
                llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token(ctx_llama, &candidates_p);
            }
        }
    }
    return id;
 }
 inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
    int id = sample_id(ctx_llama, params);
    static std::string ret;
    if (id == llama_token_eos(ctx_llama)) {
        ret = "</s>";
    } else {
        ret = llama_token_to_piece(ctx_llama, id);
    }
    eval_id(ctx_llama, id, n_past);
    return ret.c_str();
 }
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@ -0,0 +1,156 @@
 #include "clip.h"
 #include "llava-utils.h"
 #include "common.h"
 #include "llama.h"
 #include <cstdio>
 #include <cstdlib>
 #include <vector>
 static void show_additional_info(int /*argc*/, char ** argv) {
    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 int main(int argc, char ** argv) {
    ggml_time_init();
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        show_additional_info(argc, argv);
        return 1;
    }
    if (params.mmproj.empty() || params.image.empty()) {
        gpt_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
        return 1;
    }
    const char * clip_path = params.mmproj.c_str();
    const char * img_path = params.image.c_str();
    if (params.prompt.empty()) {
        params.prompt = "describe the image in detail.";
    }
    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
    // load and preprocess the image
    clip_image_u8 img;
    clip_image_f32 img_res;
    if (!clip_image_load_from_file(img_path, &img)) {
        fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path);
        clip_free(ctx_clip);
        return 1;
    }
    if (!clip_image_preprocess(ctx_clip, &img, &img_res, /*pad2square =*/ true)) {
        fprintf(stderr, "%s: unable to preprocess %s\n", __func__, img_path);
        clip_free(ctx_clip);
        return 1;
    }
    int n_img_pos  = clip_n_patches(ctx_clip);
    int n_img_embd = clip_n_mmproj_embd(ctx_clip);
    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
    if (!image_embd) {
        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
        return 1;
    }
    const int64_t t_img_enc_start_us = ggml_time_us();
    if (!clip_image_encode(ctx_clip, params.n_threads, &img_res, image_embd)) {
        fprintf(stderr, "Unable to encode image\n");
        return 1;
    }
    const int64_t t_img_enc_end_us = ggml_time_us();
    // we get the embeddings, free up the memory required for CLIP
    clip_free(ctx_clip);
    llama_backend_init(params.numa);
    llama_model_params model_params = llama_model_default_params();
    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }
    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx           = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
    if (ctx_llama == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }
    // make sure that the correct mmproj was used, i.e., compare apples to apples
    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
    if (n_img_embd != n_llama_embd) {
        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);
        llama_free(ctx_llama);
        llama_free_model(model);
        llama_backend_free();
        free(image_embd);
        return 1;
    }
    // process the prompt
    // llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"
    int n_past = 0;
    const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
    // GG: are we sure that the should be a trailing whitespace at the end of this string?
    eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant.  The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params.n_batch, &n_past);
    eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
    eval_string(ctx_llama, params.prompt.c_str(), params.n_batch, &n_past);
    eval_string(ctx_llama, "\nASSISTANT:",        params.n_batch, &n_past);
    // generate the response
    printf("\n");
    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(ctx_llama, params, &n_past);
        if (strcmp(tmp, "</s>") == 0) break;
        printf("%s", tmp);
        fflush(stdout);
    }
    printf("\n");
    {
        const float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
        printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos);
    }
    llama_print_timings(ctx_llama);
    llama_free(ctx_llama);
    llama_free_model(model);
    llama_backend_free();
    free(image_embd);
    return 0;
 }
--- a/examples/main-cmake-pkg/CMakeLists.txt
+++ b/examples/main-cmake-pkg/CMakeLists.txt
@ -28,6 +28,16 @@ configure_file(${_common_path}/../build-info.h
 target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
    ${CMAKE_CURRENT_BINARY_DIR})
 # If the common project was part of "main-cmake-pkg" the transient
 # defines would automatically be attached. Because the common func-
 # tionality is separate, but dependent upon the defines, it must be
 # explicitly extracted from the "llama" target.
 #
 get_target_property(_llama_transient_defines llama
    INTERFACE_COMPILE_DEFINITIONS)
 target_compile_definitions(common PRIVATE "${_llama_transient_defines}")
 add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
 target_include_directories(${TARGET} PRIVATE ${_common_path})
 install(TARGETS ${TARGET} RUNTIME)
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -109,6 +109,7 @@ int main(int argc, char ** argv) {
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }
    llama_sampling_params & sparams = params.sampling_params;
 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("main", "log"));
@ -179,7 +180,7 @@ int main(int argc, char ** argv) {
    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (params.cfg_scale > 1.f) {
+    if (sparams.cfg_scale > 1.f) {
        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
        ctx_guidance = llama_new_context_with_model(model, lparams);
    }
@ -257,9 +258,9 @@ int main(int argc, char ** argv) {
    int guidance_offset = 0;
    int original_prompt_len = 0;
    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
+        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
@ -296,6 +297,9 @@ int main(int argc, char ** argv) {
            LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
                __func__, n_matching_session_tokens, embd_inp.size());
        }
        // remove any "future" tokens that we might have inherited from the previous session
        llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
    }
    LOGLN(
@ -343,7 +347,7 @@ int main(int argc, char ** argv) {
        if (ctx_guidance) {
            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
+            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
            for (int i = 0; i < (int) guidance_inp.size(); i++) {
                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@ -395,7 +399,7 @@ int main(int argc, char ** argv) {
        }
    }
    LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
-            params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
+            sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");
@ -413,8 +417,8 @@ int main(int argc, char ** argv) {
        LOG_TEE("\n");
        {
-            auto it = params.logit_bias.find(llama_token_eos(ctx));
+            auto it = sparams.logit_bias.find(llama_token_eos(ctx));
-            if (it != params.logit_bias.end() && it->second == -INFINITY) {
+            if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
                LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
            }
        }
@ -469,6 +473,7 @@ int main(int argc, char ** argv) {
    const int n_vocab = llama_n_vocab(model);
    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
@ -622,7 +627,7 @@ int main(int argc, char ** argv) {
                LOG("saved session to %s\n", path_session.c_str());
            }
-            const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
+            const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);
            last_tokens.erase(last_tokens.begin());
            last_tokens.push_back(id);
@ -667,7 +672,7 @@ int main(int argc, char ** argv) {
            }
            fflush(stdout);
        }
-        // reset color to default if we there is no pending user input
+        // reset color to default if there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
            console::set_display(console::reset);
        }
@ -694,10 +699,8 @@ int main(int argc, char ** argv) {
                    if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
                        if (params.interactive) {
                            is_interacting = true;
                            console::set_display(console::user_input);
                        }
                        is_antiprompt = true;
                        fflush(stdout);
                        break;
                    }
                }
@ -721,8 +724,6 @@ int main(int argc, char ** argv) {
                    is_interacting = true;
                    printf("\n");
                    console::set_display(console::user_input);
                    fflush(stdout);
                } else if (params.instruct) {
                    is_interacting = true;
                }
@ -747,6 +748,9 @@ int main(int argc, char ** argv) {
                    printf("%s", buffer.c_str());
                }
                // color user input only
                console::set_display(console::user_input);
                std::string line;
                bool another_line = true;
                do {
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@ -10,6 +10,7 @@
 #include <cstdio>
 #include <string>
 #include <vector>
 #include <ctime>
 // trim whitespace from the beginning and end of a string
 static std::string trim(const std::string & str) {
@ -70,6 +71,26 @@ struct client {
    std::vector<llama_token> tokens_prev;
 };
 static void print_date_time() {
    std::time_t current_time = std::time(nullptr);
    std::tm* local_time = std::localtime(&current_time);
    char buffer[80];
    strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
    printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
 }
 // Define a split string function to ...
 static std::vector<std::string> split_string(const std::string& input, char delimiter) {
    std::vector<std::string> tokens;
    std::istringstream stream(input);
    std::string token;
    while (std::getline(stream, token, delimiter)) {
        tokens.push_back(token);
    }
    return tokens;
 }
 int main(int argc, char ** argv) {
    srand(1234);
@ -104,6 +125,25 @@ int main(int argc, char ** argv) {
    params.logits_all = true;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
    // load the prompts from an external file if there are any
    if (params.prompt.empty()) {
        printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
    } else {
        // Output each line of the input params.prompts vector and copy to k_prompts
        int index = 0;
        printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
        std::vector<std::string> prompts = split_string(params.prompt, '\n');
        for (const auto& prompt : prompts) {
            k_prompts.resize(index + 1);
            k_prompts[index] = prompt;
            index++;
            printf("%3d prompt: %s\n", index, prompt.c_str());
        }
    }
    fprintf(stderr, "\n\n");
    fflush(stderr);
@ -129,7 +169,7 @@ int main(int argc, char ** argv) {
    // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
    // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
-    llama_batch batch = llama_batch_init(params.n_ctx, 0);
+    llama_batch batch = llama_batch_init(n_ctx, 0);
    int32_t n_total_prompt = 0;
    int32_t n_total_gen    = 0;
@ -233,7 +273,7 @@ int main(int argc, char ** argv) {
                    client.n_decoded = 0;
                    client.i_batch   = batch.n_tokens - 1;
-                    LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+                    LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
                    g_seq_id += 1;
@ -301,7 +341,7 @@ int main(int argc, char ** argv) {
                //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
                //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
-                const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);
+                const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);
                if (client.n_decoded == 1) {
                    // start measuring generation time after the first token to make sure all concurrent clients
@ -332,12 +372,12 @@ int main(int argc, char ** argv) {
                    }
                    // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-                    llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, n_ctx);
+                    llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);
                    const auto t_main_end = ggml_time_us();
-                    LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput:    %s\nResponse: %s\n\n",
+                    LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput:    %s\n\033[35mResponse: %s\033[0m\n\n",
-                            client.id, client.seq_id, client.n_prompt, client.n_decoded,
+                            client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
                            (t_main_end - client.t_start_prompt) / 1e6,
                            (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
                            n_cache_miss,
@ -346,7 +386,7 @@ int main(int argc, char ** argv) {
                    n_total_prompt += client.n_prompt;
                    n_total_gen    += client.n_decoded;
-
+                    llama_sampling_context_reset(ctx_sampling, client.seq_id);
                    client.seq_id = -1;
                }
@ -357,13 +397,21 @@ int main(int argc, char ** argv) {
    const auto t_main_end = ggml_time_us();
-    LOG_TEE("\n\n");
+    print_date_time();
    LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
    if (params.prompt_file.empty()) {
        params.prompt_file = "used built-in defaults";
    }
    LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
    LOG_TEE("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());
    LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
    LOG_TEE("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
    LOG_TEE("Total speed (AVG):   %6s  speed: %5.2f t/s\n", "",             (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
    LOG_TEE("Cache misses:        %6d\n", n_cache_miss);
-    LOG_TEE("\n\n");
+    LOG_TEE("\n");
    llama_print_timings(ctx);
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -8,9 +8,10 @@
 int main(int argc, char ** argv) {
    gpt_params params;
    llama_sampling_params & sparams = params.sampling_params;
    params.seed = 42;
    params.n_threads = 4;
-    params.repeat_last_n = 64;
+    sparams.repeat_last_n = 64;
    params.prompt = "The quick brown fox";
    if (!gpt_params_parse(argc, argv, params)) {
@ -24,7 +25,7 @@ int main(int argc, char ** argv) {
    }
    auto n_past = 0;
-    auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
+    auto last_n_tokens_data = std::vector<llama_token>(sparams.repeat_last_n, 0);
    // init
    llama_model * model;
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -114,9 +114,9 @@ node index.js
    `top_k`: Limit the next token selection to the K most probable tokens (default: 40).
-    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
+    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
-    `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: 128, -1 = infinity).
+    `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
    `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
    By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
@ -156,6 +156,8 @@ node index.js
    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
 -   **POST** `/tokenize`: Tokenize a given text.
    *Options:*
@ -176,6 +178,16 @@ node index.js
    `content`: Set the text to process.
    **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
    *Options:*
    `input_prefix`: Set the prefix of the code to infill.
    `input_suffix`: Set the suffix of the code to infill.
    It also accepts all the options of `/completion` except `stream` and `prompt`.
 ## More examples
 ### Interactive mode
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@ -27,10 +27,10 @@ def is_present(json, key):
        buf = json[key]
    except KeyError:
        return False
    if json[key] == None:
        return False
    return True
 #convert chat to prompt
 def convert_chat(messages):
    prompt = "" + args.chat_prompt.replace("\\n", "\n")
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -136,6 +136,11 @@
      display: block;
    }
    fieldset label.slim {
      margin: 0 0.5em;
      display: inline;
    }
    header, footer {
      text-align: center;
    }
@ -145,6 +150,14 @@
      color: #888;
    }
    .mode-chat textarea[name=prompt] {
      height: 4.5em;
    }
    .mode-completion textarea[name=prompt] {
      height: 10em;
    }
    @keyframes loading-bg-wipe {
      0% {
@ -187,7 +200,7 @@
      template: "{{prompt}}\n\n{{history}}\n{{char}}:",
      historyTemplate: "{{name}}: {{message}}",
      transcript: [],
-      type: "chat",
+      type: "chat",  // "chat" | "completion"
      char: "Llama",
      user: "User",
    })
@ -365,13 +378,44 @@
      return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
    }
    async function runLlama(prompt, llamaParams, char) {
      const currentMessages = [];
      const history = session.value.transcript;
      if (controller.value) {
        throw new Error("already running");
      }
      controller.value = new AbortController();
      for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
        const data = chunk.data;
        if (data.stop) {
          while (
            currentMessages.length > 0 &&
            currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
            ) {
            currentMessages.pop();
          }
          transcriptUpdate([...history, [char, currentMessages]])
          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
        } else {
          currentMessages.push(data);
          transcriptUpdate([...history, [char, currentMessages]])
        }
        if (data.timings) {
          llamaStats.value = data.timings;
        }
      }
      controller.value = null;
    }
    // send message to server
    const chat = async (msg) => {
      if (controller.value) {
        console.log('already running...');
        return;
      }
      controller.value = new AbortController();
      transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
@ -391,55 +435,41 @@
        ).join("\n"),
      });
-      const currentMessages = [];
+      await runLlama(prompt, {
      const history = session.value.transcript
      const llamaParams = {
        ...params.value,
        stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
      }, "{{char}}");
    }
    const runCompletion = async () => {
      if (controller.value) {
        console.log('already running...');
        return;
      }
      const {prompt} = session.value;
      transcriptUpdate([...session.value.transcript, ["", prompt]]);
      await runLlama(prompt, {
        ...params.value,
        stop: [],
      }, "");
    }
-      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
+    const stop = (e) => {
-        const data = chunk.data;
+      e.preventDefault();
-
+      if (controller.value) {
-        if (data.stop) {
+        controller.value.abort();
-          while (
+        controller.value = null;
            currentMessages.length > 0 &&
            currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
          ) {
            currentMessages.pop();
          }
          transcriptUpdate([...history, ["{{char}}", currentMessages]])
          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
        } else {
          currentMessages.push(data);
          transcriptUpdate([...history, ["{{char}}", currentMessages]])
        }
        if (data.timings) {
          llamaStats.value = data.timings;
        }
      }
    }
-      controller.value = null;
+    const reset = (e) => {
      stop(e);
      transcriptUpdate([]);
    }
    function MessageInput() {
      const message = useSignal("")
      const stop = (e) => {
        e.preventDefault();
        if (controller.value) {
          controller.value.abort();
          controller.value = null;
        }
      }
      const reset = (e) => {
        stop(e);
        transcriptUpdate([]);
      }
      const submit = (e) => {
        stop(e);
        chat(message.value);
@ -474,6 +504,19 @@
      `
    }
    function CompletionControls() {
      const submit = (e) => {
        stop(e);
        runCompletion();
      }
      return html`
        <div>
          <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
          <button onclick=${stop} disabled=${!generating.value}>Stop</button>
          <button onclick=${reset}>Reset</button>
        </div>`;
    }
    const ChatLog = (props) => {
      const messages = session.value.transcript;
      const container = useRef(null)
@ -497,7 +540,11 @@
            data;
          message = html`<${Markdownish} text=${template(text)} />`
        }
-        return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
+        if(user) {
          return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
        } else {
          return html`<p key=${index}>${message}</p>`
        }
      };
      return html`
@ -574,18 +621,31 @@
        userTemplateAutosave()
      }, [session.value, params.value])
-      return html`
+      const GrammarControl = () => (
-        <form>
+        html`
-          <fieldset>
+          <div>
-            <${UserTemplateResetButton}/>
+            <label for="template">Grammar</label>
-          </fieldset>
+            <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
            <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
            <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
          </div>
          `
      );
-          <fieldset>
+      const PromptControlFieldSet = () => (
-            <div>
+        html`
-              <label for="prompt">Prompt</label>
+        <fieldset>
-              <textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
+          <div>
-            </div>
+            <label htmlFor="prompt">Prompt</label>
-          </fieldset>
+            <textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
          </div>
        </fieldset>
        `
      );
      const ChatConfigForm = () => (
        html`
          ${PromptControlFieldSet()}
          <fieldset class="two">
            <div>
@ -609,15 +669,30 @@
              <label for="template">Chat history template</label>
              <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
            </div>
            ${GrammarControl()}
          </fieldset>
      `
    );
      const CompletionConfigForm = () => (
        html`
          ${PromptControlFieldSet()}
          <fieldset>${GrammarControl()}</fieldset>
        `
      );
      return html`
        <form>
          <fieldset class="two">
            <${UserTemplateResetButton}/>
            <div>
-              <label for="template">Grammar</label>
+              <label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
-              <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+              <label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
              <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
              <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
            </div>
          </fieldset>
          ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
          <fieldset class="two">
            ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
            ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
@ -851,7 +926,7 @@
    function App(props) {
      return html`
-        <div>
+        <div class="mode-${session.value.type}">
          <header>
            <h1>llama.cpp</h1>
          </header>
@ -861,7 +936,7 @@
          </main>
          <section id="write">
-            <${MessageInput} />
+            <${session.value.type === 'chat' ? MessageInput : CompletionControls} />
          </section>
          <footer>
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -200,6 +200,7 @@ struct llama_server_context
    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
    gpt_params params;
    llama_sampling_context ctx_sampling;
    int n_ctx;
    grammar_parser::parse_state parsed_grammar;
@ -254,6 +255,7 @@ struct llama_server_context
        if (grammar != nullptr) {
            llama_grammar_free(grammar);
            grammar = nullptr;
            ctx_sampling = llama_sampling_context_init(params, NULL);
        }
    }
@ -329,8 +331,8 @@ struct llama_server_context
            grammar_parser::print_grammar(stderr, parsed_grammar);
            {
-                auto it = params.logit_bias.find(llama_token_eos(ctx));
+                auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx));
-                if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                if (it != params.sampling_params.logit_bias.end() && it->second == -INFINITY) {
                    LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
                }
            }
@ -339,9 +341,89 @@ struct llama_server_context
            grammar = llama_grammar_init(
                grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
        }
        ctx_sampling = llama_sampling_context_init(params, grammar);
        return true;
    }
    void loadInfill()
    {
        bool suff_rm_leading_spc = true;
        if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
            params.input_suffix.erase(0, 1);
            suff_rm_leading_spc = false;
        }
        auto prefix_tokens = tokenize(params.input_prefix, false);
        auto suffix_tokens = tokenize(params.input_suffix, false);
        const int space_token = 29871;
        if (suff_rm_leading_spc  && suffix_tokens[0] == space_token) {
            suffix_tokens.erase(suffix_tokens.begin());
        }
        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
        prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
        prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
        prefix_tokens.push_back(llama_token_middle(ctx));
        auto prompt_tokens = prefix_tokens;
        num_prompt_tokens = prompt_tokens.size();
        if (params.n_keep < 0)
        {
            params.n_keep = (int)num_prompt_tokens;
        }
        params.n_keep = std::min(params.n_ctx - 4, params.n_keep);
        // if input prompt is too big, truncate like normal
        if (num_prompt_tokens >= (size_t)params.n_ctx)
        {
            printf("Input prompt is too big, truncating. Can only take %d tokens but got %zu\n", params.n_ctx, num_prompt_tokens);
            // todo we probably want to cut from both sides
            const int n_left = (params.n_ctx - params.n_keep) / 2;
            std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
            const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
            new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
            std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
            LOG_VERBOSE("input truncated", {
                                               {"n_ctx", params.n_ctx},
                                               {"n_keep", params.n_keep},
                                               {"n_left", n_left},
                                               {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
                                           });
            truncated = true;
            prompt_tokens = new_tokens;
        }
        else
        {
            const size_t ps = num_prompt_tokens;
            std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
            std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
        }
        // compare the evaluated prompt with the new prompt
        n_past = common_part(embd, prompt_tokens);
        embd = prompt_tokens;
        if (n_past == num_prompt_tokens)
        {
            // we have to evaluate at least 1 token to generate logits.
            printf("we have to evaluate at least 1 token to generate logits\n");
            n_past--;
        }
        // since #3228 we now have to manually manage the KV cache
        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
        LOG_VERBOSE("prompt ingested", {
                                           {"n_past", n_past},
                                           {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
                                           {"to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())},
                                       });
        has_next_token = true;
    }
    void loadPrompt()
    {
        auto prompt_tokens = tokenize(prompt, true);  // always add BOS
@ -383,9 +465,6 @@ struct llama_server_context
        // compare the evaluated prompt with the new prompt
        n_past = common_part(embd, prompt_tokens);
        // since #3228 we now have to manually manage the KV cache
        llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx);
        embd = prompt_tokens;
        if (n_past == num_prompt_tokens)
        {
@ -393,6 +472,9 @@ struct llama_server_context
            n_past--;
        }
        // since #3228 we now have to manually manage the KV cache
        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
        LOG_VERBOSE("prompt ingested", {
                                           {"n_past", n_past},
                                           {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@ -440,9 +522,11 @@ struct llama_server_context
                                           });
        }
        bool tg = true;
        while (n_past < embd.size())
        {
            int n_eval = (int)embd.size() - n_past;
            tg = n_eval == 1;
            if (n_eval > params.n_batch)
            {
                n_eval = params.n_batch;
@ -468,98 +552,20 @@ struct llama_server_context
            return result;
        }
        // out of user input, sample next token
        const float temp = params.temp;
        const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(model) : params.top_k;
        const float top_p = params.top_p;
        const float tfs_z = params.tfs_z;
        const float typical_p = params.typical_p;
        const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
        const float repeat_penalty = params.repeat_penalty;
        const float alpha_presence = params.presence_penalty;
        const float alpha_frequency = params.frequency_penalty;
        const int mirostat = params.mirostat;
        const float mirostat_tau = params.mirostat_tau;
        const float mirostat_eta = params.mirostat_eta;
        const bool penalize_nl = params.penalize_nl;
        const int32_t n_probs = params.n_probs;
        {
-            auto *logits = llama_get_logits(ctx);
+            // out of user input, sample next token
            auto n_vocab = llama_n_vocab(model);
            // Apply params.logit_bias map
            for (const auto &it : params.logit_bias)
            {
                logits[it.first] += it.second;
            }
            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
+            candidates.reserve(llama_n_vocab(model));
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++)
+
            result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates);
            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
            const int32_t n_probs = params.sampling_params.n_probs;
            if (params.sampling_params.temp <= 0 && n_probs > 0)
            {
-                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+                // For llama_sample_token_greedy we need to sort candidates
-            }
+                llama_sample_softmax(ctx, &candidates_p);
            llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
            // Apply penalties
            float nl_logit = logits[llama_token_nl(ctx)];
            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
            llama_sample_repetition_penalty(ctx, &candidates_p,
                                            last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                                            last_n_repeat, repeat_penalty);
            llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
                                                          last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                                                          last_n_repeat, alpha_frequency, alpha_presence);
            if (!penalize_nl)
            {
                logits[llama_token_nl(ctx)] = nl_logit;
            }
            if (grammar != nullptr) {
                llama_sample_grammar(ctx, &candidates_p, grammar);
            }
            if (temp <= 0)
            {
                // Greedy sampling
                result.tok = llama_sample_token_greedy(ctx, &candidates_p);
                if (n_probs > 0)
                {
                    llama_sample_softmax(ctx, &candidates_p);
                }
            }
            else
            {
                if (mirostat == 1)
                {
                    static float mirostat_mu = 2.0f * mirostat_tau;
                    const int mirostat_m = 100;
                    llama_sample_temp(ctx, &candidates_p, temp);
                    result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
                }
                else if (mirostat == 2)
                {
                    static float mirostat_mu = 2.0f * mirostat_tau;
                    llama_sample_temp(ctx, &candidates_p, temp);
                    result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
                }
                else
                {
                    // Temperature sampling
                    size_t min_keep = std::max(1, n_probs);
                    llama_sample_top_k(ctx, &candidates_p, top_k, min_keep);
                    llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
                    llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
                    llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
                    llama_sample_temp(ctx, &candidates_p, temp);
                    result.tok = llama_sample_token(ctx, &candidates_p);
                }
            }
            if (grammar != nullptr) {
                llama_grammar_accept_token(ctx, grammar, result.tok);
            }
            for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
@ -569,7 +575,9 @@ struct llama_server_context
            last_n_tokens.erase(last_n_tokens.begin());
            last_n_tokens.push_back(result.tok);
-            num_tokens_predicted++;
+            if (tg) {
                num_tokens_predicted++;
            }
        }
        // add it to the context
@ -629,7 +637,7 @@ struct llama_server_context
        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
        generated_text += token_text;
-        if (params.n_probs > 0)
+        if (params.sampling_params.n_probs > 0)
        {
            generated_token_probs.push_back(token_with_probs);
        }
@ -710,15 +718,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("usage: %s [options]\n", argv0);
    printf("\n");
    printf("options:\n");
-    printf("  -h, --help            show this help message and exit\n");
+    printf("  -h, --help                show this help message and exit\n");
-    printf("  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
+    printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -t N,  --threads N        number of threads to use during computation (default: %d)\n", params.n_threads);
-    printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
-    printf("  --rope-freq-base N    RoPE base frequency (default: loaded from model)\n");
+    printf("  -c N,  --ctx-size N       size of the prompt context (default: %d)\n", params.n_ctx);
-    printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: loaded from model)\n");
+    printf("  --rope-freq-base N        RoPE base frequency (default: loaded from model)\n");
-    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf("  --rope-freq-scale N       RoPE frequency scaling factor (default: loaded from model)\n");
-    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    printf("  -b N,  --batch-size N     batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    printf("  --memory-f32              use f32 instead of f16 for memory key+value (default: disabled)\n");
    printf("                            not recommended: doubles context memory required and no measurable increase in quality\n");
    if (llama_mlock_supported())
    {
        printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
@ -863,6 +872,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.n_threads = std::stoi(argv[i]);
        }
        else if (arg == "--threads-batch" || arg == "-tb")
        {
            if (++i >= argc)
            {
                invalid_param = true;
                break;
            }
            params.n_threads_batch = std::stoi(argv[i]);
        }
        else if (arg == "-b" || arg == "--batch-size")
        {
            if (++i >= argc)
@ -947,7 +965,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                invalid_param = true;
                break;
            }
-            params.lora_adapter.push_back({argv[i], 1.0f});
+            params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
            params.use_mmap = false;
        }
        else if (arg == "--lora-scaled")
@ -963,7 +981,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                invalid_param = true;
                break;
            }
-            params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
+            params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
            params.use_mmap = false;
        }
        else if (arg == "--lora-base")
@ -1017,34 +1035,35 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 static json format_generation_settings(llama_server_context &llama)
 {
-    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
+    const auto & sparams = llama.params.sampling_params;
-    const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
+    const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
    const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
                            eos_bias->second < 0.0f && std::isinf(eos_bias->second);
    return json{
        {"n_ctx", llama.n_ctx},
        {"model", llama.params.model_alias},
        {"seed", llama.params.seed},
-        {"temp", llama.params.temp},
+        {"temp", sparams.temp},
-        {"top_k", llama.params.top_k},
+        {"top_k", sparams.top_k},
-        {"top_p", llama.params.top_p},
+        {"top_p", sparams.top_p},
-        {"tfs_z", llama.params.tfs_z},
+        {"tfs_z", sparams.tfs_z},
-        {"typical_p", llama.params.typical_p},
+        {"typical_p", sparams.typical_p},
-        {"repeat_last_n", llama.params.repeat_last_n},
+        {"repeat_last_n", sparams.repeat_last_n},
-        {"repeat_penalty", llama.params.repeat_penalty},
+        {"repeat_penalty", sparams.repeat_penalty},
-        {"presence_penalty", llama.params.presence_penalty},
+        {"presence_penalty", sparams.presence_penalty},
-        {"frequency_penalty", llama.params.frequency_penalty},
+        {"frequency_penalty", sparams.frequency_penalty},
-        {"mirostat", llama.params.mirostat},
+        {"mirostat", sparams.mirostat},
-        {"mirostat_tau", llama.params.mirostat_tau},
+        {"mirostat_tau", sparams.mirostat_tau},
-        {"mirostat_eta", llama.params.mirostat_eta},
+        {"mirostat_eta", sparams.mirostat_eta},
-        {"penalize_nl", llama.params.penalize_nl},
+        {"penalize_nl", sparams.penalize_nl},
        {"stop", llama.params.antiprompt},
        {"n_predict", llama.params.n_predict},
        {"n_keep", llama.params.n_keep},
        {"ignore_eos", ignore_eos},
        {"stream", llama.stream},
-        {"logit_bias", llama.params.logit_bias},
+        {"logit_bias", sparams.logit_bias},
-        {"n_probs", llama.params.n_probs},
+        {"n_probs", sparams.n_probs},
        {"grammar", llama.params.grammar},
    };
 }
@ -1060,8 +1079,6 @@ static json format_timings(llama_server_context &llama)
 {
    const auto timings = llama_get_timings(llama.ctx);
    assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
    return json{
        {"prompt_n", timings.n_p_eval},
        {"prompt_ms", timings.t_p_eval_ms},
@ -1095,7 +1112,7 @@ static json format_final_response(llama_server_context &llama, const std::string
        {"timings", format_timings(llama)},
    };
-    if (llama.params.n_probs > 0)
+    if (llama.params.sampling_params.n_probs > 0)
    {
        res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
    }
@ -1111,7 +1128,7 @@ static json format_partial_response(
        {"stop", false},
    };
-    if (llama.params.n_probs > 0)
+    if (llama.params.sampling_params.n_probs > 0)
    {
        res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
    }
@ -1143,26 +1160,28 @@ static T json_value(const json &body, const std::string &key, const T &default_v
 static void parse_options_completion(const json &body, llama_server_context &llama)
 {
    gpt_params default_params;
    const auto & default_sparams = default_params.sampling_params;
    auto & sparams = llama.params.sampling_params;
    llama.stream = json_value(body, "stream", false);
    llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
-    llama.params.top_k = json_value(body, "top_k", default_params.top_k);
+    sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
-    llama.params.top_p = json_value(body, "top_p", default_params.top_p);
+    sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
-    llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
+    sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
-    llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
+    sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p);
-    llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
+    sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n);
-    llama.params.temp = json_value(body, "temperature", default_params.temp);
+    sparams.temp = json_value(body, "temperature", default_sparams.temp);
-    llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
+    sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty);
-    llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
+    sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty);
-    llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
+    sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty);
-    llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
+    sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat);
-    llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
+    sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
-    llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
+    sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
-    llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
+    sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
    llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
    llama.params.seed = json_value(body, "seed", default_params.seed);
    llama.params.grammar = json_value(body, "grammar", default_params.grammar);
-    llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
+    sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);
    if (body.count("prompt") != 0)
    {
@ -1173,10 +1192,10 @@ static void parse_options_completion(const json &body, llama_server_context &lla
        llama.prompt = "";
    }
-    llama.params.logit_bias.clear();
+    sparams.logit_bias.clear();
    if (json_value(body, "ignore_eos", false))
    {
-        llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
+        sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
    }
    const auto &logit_bias = body.find("logit_bias");
@ -1192,11 +1211,11 @@ static void parse_options_completion(const json &body, llama_server_context &lla
                {
                    if (el[1].is_number())
                    {
-                        llama.params.logit_bias[tok] = el[1].get<float>();
+                        sparams.logit_bias[tok] = el[1].get<float>();
                    }
                    else if (el[1].is_boolean() && !el[1].get<bool>())
                    {
-                        llama.params.logit_bias[tok] = -INFINITY;
+                        sparams.logit_bias[tok] = -INFINITY;
                    }
                }
            }
@ -1216,9 +1235,32 @@ static void parse_options_completion(const json &body, llama_server_context &lla
        }
    }
    llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar);
    LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
 }
 static void parse_options_infill(const json &body, llama_server_context &llama)
 {
    if (body.count("input_prefix") != 0)
    {
        llama.params.input_prefix = body["input_prefix"];
    }
    else
    {
        llama.params.input_prefix = "";
    }
    if (body.count("input_suffix") != 0)
    {
        llama.params.input_suffix = body["input_suffix"];
    }
    else
    {
        llama.params.input_suffix = "";
    }
    parse_options_completion(body, llama);
 }
 static void log_server_request(const Request &req, const Response &res)
 {
    LOG_INFO("request", {
@ -1403,7 +1445,7 @@ int main(int argc, char **argv)
            }
            auto probs = llama.generated_token_probs;
-            if (llama.params.n_probs > 0 && llama.stopped_word) {
+            if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
                const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
                probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
            }
@ -1455,7 +1497,7 @@ int main(int argc, char **argv)
                        std::vector<completion_token_output> probs_output = {};
-                        if (llama.params.n_probs > 0) {
+                        if (llama.params.sampling_params.n_probs > 0) {
                            const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
                            size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
                            size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
@ -1519,6 +1561,127 @@ int main(int argc, char **argv)
            res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
        } });
    svr.Post("/infill", [&llama](const Request &req, Response &res)
             {
        auto lock = llama.lock();
        llama.rewind();
        llama_reset_timings(llama.ctx);
        parse_options_infill(json::parse(req.body), llama);
        if (!llama.loadGrammar())
        {
            res.status = 400;
            return;
        }
        llama.loadInfill();
        llama.beginCompletion();
        const auto chunked_content_provider = [&](size_t, DataSink & sink) {
            size_t sent_count = 0;
            size_t sent_token_probs_index = 0;
            while (llama.has_next_token) {
                const completion_token_output token_with_probs = llama.doCompletion();
                if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
                    continue;
                }
                const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);
                size_t pos = std::min(sent_count, llama.generated_text.size());
                const std::string str_test = llama.generated_text.substr(pos);
                bool is_stop_full = false;
                size_t stop_pos =
                    llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
                if (stop_pos != std::string::npos) {
                    is_stop_full = true;
                    llama.generated_text.erase(
                        llama.generated_text.begin() + pos + stop_pos,
                        llama.generated_text.end());
                    pos = std::min(sent_count, llama.generated_text.size());
                } else {
                    is_stop_full = false;
                    stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
                        STOP_PARTIAL);
                }
                if (
                    stop_pos == std::string::npos ||
                    // Send rest of the text if we are at the end of the generation
                    (!llama.has_next_token && !is_stop_full && stop_pos > 0)
                ) {
                    const std::string to_send = llama.generated_text.substr(pos, std::string::npos);
                    sent_count += to_send.size();
                    std::vector<completion_token_output> probs_output = {};
                    if (llama.params.sampling_params.n_probs > 0) {
                        const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
                        size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
                        size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
                        if (probs_pos < probs_stop_pos) {
                            probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
                        }
                        sent_token_probs_index = probs_stop_pos;
                    }
                    const json data = format_partial_response(llama, to_send, probs_output);
                    const std::string str =
                        "data: " +
                        data.dump(-1, ' ', false, json::error_handler_t::replace) +
                        "\n\n";
                    LOG_VERBOSE("data stream", {
                        { "to_send", str }
                    });
                    if (!sink.write(str.data(), str.size())) {
                        LOG_VERBOSE("stream closed", {});
                        llama_print_timings(llama.ctx);
                        return false;
                    }
                }
                if (!llama.has_next_token) {
                    // Generation is done, send extra information.
                    const json data = format_final_response(
                        llama,
                        "",
                        std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index)
                    );
                    const std::string str =
                        "data: " +
                        data.dump(-1, ' ', false, json::error_handler_t::replace) +
                        "\n\n";
                    LOG_VERBOSE("data stream", {
                        { "to_send", str }
                    });
                    if (!sink.write(str.data(), str.size())) {
                        LOG_VERBOSE("stream closed", {});
                        llama_print_timings(llama.ctx);
                        return false;
                    }
                }
            }
            llama_print_timings(llama.ctx);
            sink.done();
            return true;
        };
        const auto on_complete = [&](bool) {
            llama.mutex.unlock();
        };
        lock.release();
        res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
        });
    svr.Get("/model.json", [&llama](const Request &, Response &res)
            {
        const json data = format_generation_settings(llama);
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -125,6 +125,8 @@ int main(int argc, char ** argv) {
        grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
    }
    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt);
    const auto t_dec_start = ggml_time_us();
    while (true) {
@ -134,7 +136,7 @@ int main(int argc, char ** argv) {
        while (true) {
            // sample from the target model
-            llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
+            llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft);
            // remember which tokens were sampled - used for repetition penalties during sampling
            last_tokens.erase(last_tokens.begin());
@ -172,7 +174,7 @@ int main(int argc, char ** argv) {
                LOG("out of drafted tokens\n");
            }
-            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, n_ctx);
+            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
            llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
            ++n_past_dft;
@ -211,7 +213,13 @@ int main(int argc, char ** argv) {
            if (grammar_dft) {
                llama_grammar_free(grammar_dft);
            }
-            grammar_dft = llama_grammar_copy(grammar_tgt);
+            // Note: Hardcoded to sequence id 0, if this ever supports parallel generation
            //       that will need to change.
            auto it = ctx_sampling.sequence_contexts.find(0);
            GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
            // This is necessary because each sequence id in sequence_contexts
            // uses a copy of the original grammar.
            grammar_dft = llama_grammar_copy(it->second.grammar);
            LOG("copied target grammar to draft grammar\n");
        }
@ -257,7 +265,7 @@ int main(int argc, char ** argv) {
            }
            // evaluate the drafted token on the draft model
-            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, n_ctx);
+            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, -1);
            llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0));
            ++n_past_cur;
@ -267,7 +275,7 @@ int main(int argc, char ** argv) {
        }
        // evaluate the target model on the drafted tokens
-        llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_ctx);
+        llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1);
        llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
        ++n_past_tgt;
--- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
+++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py
@ -364,7 +364,7 @@ class ModelParams:
        gguf_writer.add_feed_forward_length(self.get_n_ff())
 def tensor_name(key, bid=None):
-    return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + ".weight"
+    return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
 class Layer:
    def __init__(self, params, bid):
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@ -334,7 +334,8 @@ static struct ggml_tensor * llama_build_train_graphs(
    // KQ_pos - contains the positions
    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
-    {
+    ggml_allocr_alloc(alloc, KQ_pos);
    if (!ggml_allocr_is_measure(alloc)) {
        int * data = (int *) KQ_pos->data;
        for (int i = 0; i < N; ++i) {
            data[i] = n_past + i;
--- a/flake.nix
+++ b/flake.nix
@ -62,7 +62,7 @@
          mkdir -p $out/include
          cp ${src}/llama.h $out/include/
        '';
-        cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
+        cmakeFlags = [ "-DLLAMA_NATIVE=OFF" "-DLLAMA_BUILD_SERVER=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
      in
      {
        packages.default = pkgs.stdenv.mkDerivation {
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@ -1,4 +1,5 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "ggml.h"
 #include <assert.h>
 #include <stdarg.h>
@ -6,25 +7,6 @@
 #include <stdlib.h>
 #include <string.h>
 #ifdef __has_include
    #if __has_include(<unistd.h>)
        #include <unistd.h>
        #if defined(_POSIX_MAPPED_FILES)
            #include <sys/types.h>
            #include <sys/mman.h>
        #endif
    #endif
 #endif
 #if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
    #ifndef NOMINMAX
        #define NOMINMAX
    #endif
    #include <windows.h>
    #include <memoryapi.h>
 #endif
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
@ -80,8 +62,9 @@ struct free_block {
 #define MAX_FREE_BLOCKS 256
 struct ggml_allocr {
    struct ggml_backend_buffer * buffer;
    bool buffer_owned;
    void * data;
    size_t size;
    size_t alignment;
    int n_free_blocks;
    struct free_block free_blocks[MAX_FREE_BLOCKS];
@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    return ggml_nbytes(tensor);
    UNUSED(alloc);
 }
 // check if a tensor is allocated by this buffer
 static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
-    void * ptr = tensor->data;
+    return tensor->buffer == alloc->buffer;
    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
 }
 static bool ggml_is_view(struct ggml_tensor * t) {
@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) {
 }
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
 #ifdef GGML_ALLOCATOR_DEBUG
    GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
    GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-#endif
+
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
    size = aligned_offset(NULL, size, alloc->alignment);
    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
    tensor->data = addr;
    AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
    tensor->buffer = alloc->buffer;
    ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
 #ifdef GGML_ALLOCATOR_DEBUG
    add_allocated_tensor(alloc, tensor);
@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 // this is a very naive implementation, but for our case the number of free blocks should be very small
 static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    void * ptr = tensor->data;
    if (ggml_allocr_is_own(alloc, tensor) == false) {
        // the tensor was not allocated in this buffer
        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
        // the easiest way to deal with this is just to ignore it
        AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
        return;
    }
-    size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+    void * ptr = tensor->data;
    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
    size = aligned_offset(NULL, size, alloc->alignment);
    AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
-    AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
+
    ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
 #ifdef GGML_ALLOCATOR_DEBUG
    remove_allocated_tensor(alloc, tensor);
@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
    alloc->n_free_blocks = 1;
    size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
    alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
-    alloc->free_blocks[0].size = alloc->size - align_offset;
+    alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
 }
 struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
    *alloc = (struct ggml_allocr){
-        /*.data          = */ data,
+        /*.buffer        = */ buffer,
-        /*.size          = */ size,
+        /*.buffer_owned  = */ true,
        /*.base          = */ ggml_backend_buffer_get_base(buffer),
        /*.alignment     = */ alignment,
        /*.n_free_blocks = */ 0,
        /*.free_blocks   = */ {{0}},
@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
    return alloc;
 }
 // OS specific functions to allocate and free uncommitted virtual memory
 static void * alloc_vmem(size_t size) {
 #if defined(_WIN32)
    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
 #elif defined(_POSIX_MAPPED_FILES)
    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
    if (ptr == MAP_FAILED) {
        return NULL;
    }
    return ptr;
 #else
    // use a fixed address for other platforms
    uintptr_t base_addr = (uintptr_t)-size - 0x100;
    return (void *)base_addr;
 #endif
 }
 static void free_vmem(void * base_addr, size_t size) {
 #if defined(_WIN32)
    VirtualFree(base_addr, 0, MEM_RELEASE);
    UNUSED(size);
 #elif defined(_POSIX_MAPPED_FILES)
    munmap(base_addr, size);
 #else
    // nothing to do
    UNUSED(base_addr);
    UNUSED(size);
 #endif
 }
 // allocate uncommitted virtual memory to measure the size of the graph
 static void alloc_measure_vmem(void ** base_addr, size_t * size) {
    // 128GB for 64-bit, 1GB for 32-bit
    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
    do {
        *base_addr = alloc_vmem(*size);
        if (*base_addr != NULL) {
            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
            return;
        }
        // try again with half the size
        *size /= 2;
    } while (*size > 0);
    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
 }
 static void free_measure_vmem(void * base_addr, size_t size) {
    free_vmem(base_addr, size);
 }
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
-    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+    struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
    alloc->measure = true;
-    void * base_addr;
+    return alloc;
-    size_t size;
+}
-    alloc_measure_vmem(&base_addr, &size);
+struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
    *alloc = (struct ggml_allocr){
-        /*.data          = */ base_addr,
+        /*.buffer        = */ buffer,
-        /*.size          = */ size,
+        /*.buffer_owned  = */ false,
-        /*.alignment     = */ alignment,
+        /*.base          = */ ggml_backend_buffer_get_base(buffer),
        /*.alignment     = */ ggml_backend_buffer_get_alignment(buffer),
        /*.n_free_blocks = */ 0,
        /*.free_blocks   = */ {{0}},
        /*.hash_table    = */ {{0}},
        /*.max_size      = */ 0,
-        /*.measure       = */ true,
+        /*.measure       = */ false,
        /*.parse_seq     = */ {0},
        /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 void ggml_allocr_free(struct ggml_allocr * alloc) {
-    if (alloc->measure) {
+    if (alloc->buffer_owned) {
-        free_measure_vmem(alloc->data, alloc->size);
+        ggml_backend_buffer_free(alloc->buffer);
    }
    free(alloc);
 }
@ -437,7 +371,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
        case GGML_OP_ROPE:
        case GGML_OP_RMS_NORM:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_CONT:
            return true;
        default:
@ -445,12 +378,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
    }
 }
 static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
    assert(view->view_src != NULL && view->view_src->data != NULL);
    view->backend = view->view_src->backend;
    view->buffer  = view->view_src->buffer;
    view->data    = (char *)view->view_src->data + view->view_offs;
    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
    assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
    ggml_backend_buffer_init_tensor(alloc->buffer, view);
 }
 static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
    struct hash_node * ht = alloc->hash_table;
    if (node->data == NULL) {
        if (ggml_is_view(node)) {
-            assert(node->view_src->data != NULL);
+            init_view(alloc, node);
            node->data = (char *)node->view_src->data + node->view_offs;
        } else {
            // see if we can reuse a parent's buffer (inplace)
            if (ggml_op_can_inplace(node->op)) {
@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                                // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
                                // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
                                AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                                node->data = parent->data;
+                                node->view_src = view_src;
                                view_src_hn->n_views += 1;
                                init_view(alloc, node);
                                return;
                            }
                        }
                        else {
                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                            node->data = parent->data;
+                            node->view_src = parent;
                            p_hn->n_views += 1;
                            init_view(alloc, node);
                            return;
                        }
                    }
@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
    }
 }
-static size_t ggml_allocr_alloc_graph_tensors_n(
+size_t ggml_allocr_alloc_graph_n(
    struct ggml_allocr * alloc,
    struct ggml_cgraph ** graphs, int n_graphs,
    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
            if (ggml_is_view(node)) {
                struct ggml_tensor * view_src = node->view_src;
                hash_get(ht, view_src)->n_views += 1;
                if (node->buffer == NULL && node->data != NULL) {
                    // view of a pre-allocated tensor, didn't call init_view() yet
                    init_view(alloc, node);
                }
            }
            for (int j = 0; j < GGML_MAX_SRC; j++) {
@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
                    break;
                }
                hash_get(ht, parent)->n_children += 1;
                if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
                    init_view(alloc, parent);
                }
            }
        }
    }
@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 }
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+    return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
 }
 size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@ -6,21 +6,27 @@
 extern "C" {
 #endif
 struct ggml_backend_buffer;
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
 GGML_API void   ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
-GGML_API void   ggml_allocr_free(struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_free       (struct ggml_allocr * alloc);
-GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);
+GGML_API bool   ggml_allocr_is_measure (struct ggml_allocr * alloc);
-GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_reset      (struct ggml_allocr * alloc);
-GGML_API void   ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+GGML_API void   ggml_allocr_alloc      (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
 GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
-GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
+GGML_API size_t ggml_allocr_max_size   (struct ggml_allocr * alloc);
 GGML_API size_t ggml_allocr_alloc_graph_n(
                    struct ggml_allocr * alloc,
                    struct ggml_cgraph ** graphs, int n_graphs,
                    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
 #ifdef  __cplusplus
 }
--- a/ggml-backend.c
+++ b/ggml-backend.c
@ -0,0 +1,385 @@
 #include "ggml-backend.h"
 #include "ggml-alloc.h"
 #include <assert.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #define UNUSED GGML_UNUSED
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 // backend buffer
 ggml_backend_buffer_t ggml_backend_buffer_init(
        struct ggml_backend                  * backend,
        struct ggml_backend_buffer_i           iface,
               ggml_backend_buffer_context_t   context,
               size_t                          size) {
    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
    GGML_ASSERT(iface.get_base != NULL);
    (*buffer) = (struct ggml_backend_buffer) {
        /* .interface = */ iface,
        /* .backend   = */ backend,
        /* .context   = */ context,
        /* .size      = */ size,
    };
    return buffer;
 }
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
    if (buffer->iface.free_buffer != NULL) {
        buffer->iface.free_buffer(buffer);
    }
    free(buffer);
 }
 size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
    return ggml_backend_get_alignment(buffer->backend);
 }
 void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
    return buffer->iface.get_base(buffer);
 }
 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
    return buffer->size;
 }
 size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (buffer->iface.get_alloc_size) {
        return buffer->iface.get_alloc_size(buffer, tensor);
    }
    return ggml_nbytes(tensor);
 }
 void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (buffer->iface.init_tensor) {
        buffer->iface.init_tensor(buffer, tensor);
    }
 }
 void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    if (buffer->iface.free_tensor) {
        buffer->iface.free_tensor(buffer, tensor);
    }
 }
 // backend
 ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
    return tensor->buffer->backend;
 }
 const char * ggml_backend_name(ggml_backend_t backend) {
    return backend->iface.get_name(backend);
 }
 void ggml_backend_free(ggml_backend_t backend) {
    backend->iface.free(backend);
 }
 ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
    return backend->iface.alloc_buffer(backend, size);
 }
 size_t ggml_backend_get_alignment(ggml_backend_t backend) {
    return backend->iface.get_alignment(backend);
 }
 void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
 }
 void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
 }
 void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
 }
 void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
 }
 void ggml_backend_synchronize(ggml_backend_t backend) {
    backend->iface.synchronize(backend);
 }
 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    return backend->iface.graph_plan_create(backend, cgraph);
 }
 void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    backend->iface.graph_plan_free(backend, plan);
 }
 void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    backend->iface.graph_plan_compute(backend, plan);
 }
 void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    backend->iface.graph_compute(backend, cgraph);
 }
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    return backend->iface.supports_op(backend, op);
 }
 // backend copy
 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
    if (a->type != b->type) {
        return false;
    }
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        if (a->ne[i] != b->ne[i]) {
            return false;
        }
        if (a->nb[i] != b->nb[i]) {
            return false;
        }
    }
    return true;
 }
 void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
    //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
    //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
    // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
    if (src == dst) {
        return;
    }
    // TODO: allow backends to support copy to/from same backend
    if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
        ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
    } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
        ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
    } else {
        // shouldn't be hit when copying from/to CPU
        #ifndef NDEBUG
        fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend));
        #endif
        size_t nbytes = ggml_nbytes(src);
        void * data = malloc(nbytes);
        ggml_backend_tensor_get(src, data, 0, nbytes);
        ggml_backend_tensor_set(dst, data, 0, nbytes);
        free(data);
    }
 }
 // backend CPU
 struct ggml_backend_cpu_context {
    int n_threads;
    void * work_data;
    size_t work_size;
 };
 static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
    return "CPU";
    UNUSED(backend);
 }
 static void ggml_backend_cpu_free(ggml_backend_t backend) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
    free(cpu_ctx->work_data);
    free(cpu_ctx);
    free(backend);
 }
 static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
    return (void *)buffer->context;
 }
 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context);
    UNUSED(buffer);
 }
 static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
    /* .free_buffer    = */ ggml_backend_cpu_buffer_free_buffer,
    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
    /* .init_tensor    = */ NULL, // no initialization required
    /* .free_tensor    = */ NULL, // no cleanup required
 };
 // for buffers from ptr, free is not called
 static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
    /* .free_buffer    = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
    /* .init_tensor    = */ NULL,
    /* .free_tensor    = */ NULL,
 };
 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
 static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
    return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
 }
 static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) {
    return TENSOR_ALIGNMENT;
    UNUSED(backend);
 }
 static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    memcpy((char *)tensor->data + offset, data, size);
    UNUSED(backend);
 }
 static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    memcpy(data, (const char *)tensor->data + offset, size);
    UNUSED(backend);
 }
 static void ggml_backend_cpu_synchronize(ggml_backend_t backend) {
    UNUSED(backend);
 }
 static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
    UNUSED(backend);
 }
 static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
    UNUSED(backend);
 }
 struct ggml_backend_plan_cpu {
    struct ggml_cplan cplan;
    struct ggml_cgraph cgraph;
 };
 static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
    cpu_plan->cgraph = *cgraph;
    if (cpu_plan->cplan.work_size > 0) {
        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
    }
    return cpu_plan;
 }
 static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
    free(cpu_plan->cplan.work_data);
    free(cpu_plan);
    UNUSED(backend);
 }
 static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
    UNUSED(backend);
 }
 static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
    if (cpu_ctx->work_size < cplan.work_size) {
        // TODO: may be faster to free and use malloc to avoid the copy
        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
        cpu_ctx->work_size = cplan.work_size;
    }
    cplan.work_data = cpu_ctx->work_data;
    ggml_graph_compute(cgraph, &cplan);
 }
 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    return true;
    UNUSED(backend);
    UNUSED(op);
 }
 static struct ggml_backend_i cpu_backend_i = {
    /* .get_name            = */ ggml_backend_cpu_name,
    /* .free                = */ ggml_backend_cpu_free,
    /* .alloc_buffer        = */ ggml_backend_cpu_alloc_buffer,
    /* .get_alignment       = */ ggml_backend_cpu_get_alignment,
    /* .set_tensor_async    = */ ggml_backend_cpu_set_tensor_async,
    /* .get_tensor_async    = */ ggml_backend_cpu_get_tensor_async,
    /* .synchronize         = */ ggml_backend_cpu_synchronize,
    /* .cpy_tensor_from     = */ ggml_backend_cpu_cpy_tensor_from,
    /* .cpy_tensor_to       = */ ggml_backend_cpu_cpy_tensor_to,
    /* .graph_plan_create   = */ ggml_backend_cpu_graph_plan_create,
    /* .graph_plan_free     = */ ggml_backend_cpu_graph_plan_free,
    /* .graph_plan_compute  = */ ggml_backend_cpu_graph_plan_compute,
    /* .graph_compute       = */ ggml_backend_cpu_graph_compute,
    /* .supports_op         = */ ggml_backend_cpu_supports_op,
 };
 ggml_backend_t ggml_backend_cpu_init(void) {
    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
    ctx->n_threads = GGML_DEFAULT_N_THREADS;
    ctx->work_data = NULL;
    ctx->work_size = 0;
    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
    *cpu_backend = (struct ggml_backend) {
        /* .interface = */ cpu_backend_i,
        /* .context   = */ ctx
    };
    return cpu_backend;
 }
 bool ggml_backend_is_cpu(ggml_backend_t backend) {
    return backend->iface.get_name == ggml_backend_cpu_name;
 }
 void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
    ctx->n_threads = n_threads;
 }
 ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
    return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
 }
--- a/ggml-backend.h
+++ b/ggml-backend.h
@ -0,0 +1,143 @@
 #pragma once
 #include "ggml.h"
 #ifdef  __cplusplus
 extern "C" {
 #endif
    struct ggml_backend;
    struct ggml_backend_buffer;
    // type-erased backend-specific types / wrappers
    typedef void * ggml_backend_context_t;
    typedef void * ggml_backend_graph_plan_t;
    typedef void * ggml_backend_buffer_context_t;
    // avoid accessing internals of these types
    typedef struct ggml_backend        * ggml_backend_t;
    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
    //
    // backend buffer
    //
    struct ggml_backend_buffer_i {
        void   (*free_buffer)   (ggml_backend_buffer_t buffer);
        void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
        size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
        void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
        void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
    };
    // TODO: hide behind API
    struct ggml_backend_buffer {
        struct ggml_backend_buffer_i iface;
        ggml_backend_t                backend;
        ggml_backend_buffer_context_t context;
        size_t size;
    };
    // backend buffer functions
    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
            struct ggml_backend                  * backend,
            struct ggml_backend_buffer_i           iface,
                   ggml_backend_buffer_context_t   context,
                   size_t                          size);
    GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
    GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
    GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    //
    // backend
    //
    struct ggml_backend_i {
        const char * (*get_name)(ggml_backend_t backend);
        void (*free)(ggml_backend_t backend);
        // buffer allocation
        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
        // get buffer alignment
        size_t (*get_alignment)(ggml_backend_t backend);
        // tensor data access
        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
        void (*synchronize)     (ggml_backend_t backend);
        // (optional) copy tensor between different backends, allow for single-copy tranfers
        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
        // compute graph with a plan
        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
        // compute graph without a plan
        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
        // check if the backend supports an operation
        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
    };
    // TODO: hide behind API
    struct ggml_backend {
        struct ggml_backend_i iface;
        ggml_backend_context_t context;
    };
    // backend helper functions
    GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
    GGML_API void         ggml_backend_free(ggml_backend_t backend);
    GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
    GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
    GGML_API void ggml_backend_tensor_set_async(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);
    // tensor copy between different backends
    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
    //
    // CPU backend
    //
    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -1,6 +1,7 @@
 #pragma once
 #include "ggml.h"
 #include "ggml-backend.h"
 #ifdef GGML_USE_HIPBLAS
 #define GGML_CUDA_NAME "ROCm"
@ -42,6 +43,9 @@ GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, s
 GGML_API int    ggml_cuda_get_device_count(void);
 GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
 // backend API
 GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml-metal.h
+++ b/ggml-metal.h
@ -20,6 +20,7 @@
 #pragma once
 #include "ggml.h"
 #include "ggml-backend.h"
 #include <stddef.h>
 #include <stdbool.h>
@ -35,10 +36,15 @@ struct ggml_cgraph;
 extern "C" {
 #endif
-void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+//
 // internal API
 // temporary exposed to user-code
 //
 struct ggml_metal_context;
 void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
 // number of command buffers to use
 struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
@ -83,6 +89,17 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
 // creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 //
 // backend API
 // user-code should use only these functions
 //
 GGML_API ggml_backend_t ggml_backend_metal_init(void);
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
 #ifdef __cplusplus
 }
 #endif
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -81,18 +81,18 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(get_rows_q6_K);
    GGML_METAL_DECL_KERNEL(rms_norm);
    GGML_METAL_DECL_KERNEL(norm);
-    GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
+    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row);
-    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
+    GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
-    GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
@ -109,6 +109,8 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(cpy_f32_f16);
    GGML_METAL_DECL_KERNEL(cpy_f32_f32);
    GGML_METAL_DECL_KERNEL(cpy_f16_f16);
    GGML_METAL_DECL_KERNEL(concat);
    GGML_METAL_DECL_KERNEL(sqr);
 #undef GGML_METAL_DECL_KERNEL
 };
@ -183,56 +185,44 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
-#ifdef GGML_SWIFT
+    // load library
    // load the default.metallib file
    {
-        NSError * error = nil;
+        NSBundle * bundle = nil;
-
+#ifdef SWIFT_PACKAGE
-        NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
+        bundle = SWIFTPM_MODULE_BUNDLE;
        NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
        NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
        NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
        NSURL * libURL = [NSURL fileURLWithPath:libPath];
        // Load the metallib file into a Metal library
        ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
        if (error) {
            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
            return NULL;
        }
    }
 #else
-    UNUSED(msl_library_source);
+        bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
-
+#endif
    // read the source from "ggml-metal.metal" into a string and use newLibraryWithSource
    {
        NSError * error = nil;
        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
        if (libPath != nil) {
            NSURL * libURL = [NSURL fileURLWithPath:libPath];
            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
            ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
        } else {
            GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
-        //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
+            NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
-        NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
+            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
-        NSString * path   = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+            NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
-        GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]);
+            if (error) {
-
+                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
-        NSString * src  = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
+                return NULL;
-        if (error) {
+            }
            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
            return NULL;
        }
            MTLCompileOptions* options = nil;
 #ifdef GGML_QKK_64
-        MTLCompileOptions* options = [MTLCompileOptions new];
+            options = [MTLCompileOptions new];
-        options.preprocessorMacros = @{ @"QK_K" : @(64) };
+            options.preprocessorMacros = @{ @"QK_K" : @(64) };
        ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
 #else
        ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
 #endif
            ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
        }
        if (error) {
            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
            return NULL;
        }
    }
 #endif
    // load kernels
    {
@ -272,40 +262,57 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(get_rows_q6_K);
        GGML_METAL_ADD_KERNEL(rms_norm);
        GGML_METAL_ADD_KERNEL(norm);
-        GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_f16_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
+        GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row);
-        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
+        GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
-        GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
+        if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
-        GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
-        GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
+            GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
        }
        GGML_METAL_ADD_KERNEL(rope_f32);
        GGML_METAL_ADD_KERNEL(rope_f16);
        GGML_METAL_ADD_KERNEL(alibi_f32);
        GGML_METAL_ADD_KERNEL(cpy_f32_f16);
        GGML_METAL_ADD_KERNEL(cpy_f32_f32);
        GGML_METAL_ADD_KERNEL(cpy_f16_f16);
        GGML_METAL_ADD_KERNEL(concat);
        GGML_METAL_ADD_KERNEL(sqr);
 #undef GGML_METAL_ADD_KERNEL
    }
    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
 #if TARGET_OS_OSX
    // print MTL GPU family:
    GGML_METAL_LOG_INFO("%s: GPU name:   %s\n", __func__, [[ctx->device name] UTF8String]);
    // determine max supported GPU family
    // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
    // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
    for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
        if ([ctx->device supportsFamily:i]) {
            GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
            break;
        }
    }
    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
    if (ctx->device.maxTransferRate != 0) {
        GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
@ -347,34 +354,38 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(get_rows_q6_K);
    GGML_METAL_DEL_KERNEL(rms_norm);
    GGML_METAL_DEL_KERNEL(norm);
-    GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_f16_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
+    GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row);
-    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
+    GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
-    GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
+    if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
-    GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
-    GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
+        GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
    }
    GGML_METAL_DEL_KERNEL(rope_f32);
    GGML_METAL_DEL_KERNEL(rope_f16);
    GGML_METAL_DEL_KERNEL(alibi_f32);
    GGML_METAL_DEL_KERNEL(cpy_f32_f16);
    GGML_METAL_DEL_KERNEL(cpy_f32_f32);
    GGML_METAL_DEL_KERNEL(cpy_f16_f16);
    GGML_METAL_DEL_KERNEL(concat);
    GGML_METAL_DEL_KERNEL(sqr);
 #undef GGML_METAL_DEL_KERNEL
@ -431,7 +442,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
    for (int i = 0; i < ctx->n_buffers; ++i) {
        const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
-        //metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
+        //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
            *offs = (size_t) ioffs;
@ -766,6 +777,44 @@ void ggml_metal_graph_compute(
                        {
                            // noop
                        } break;
                    case GGML_OP_CONCAT:
                        {
                            const int64_t nb = ne00;
                            [encoder setComputePipelineState:ctx->pipeline_concat];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
                            [encoder setBytes:&nb   length:sizeof(nb)   atIndex:27];
                            const int nth = MIN(1024, ne0);
                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_ADD:
                        {
                            GGML_ASSERT(ggml_is_contiguous(src0));
@ -861,9 +910,10 @@ void ggml_metal_graph_compute(
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                            [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
-                            const int64_t n = ggml_nelements(dst)/4;
+                            const int64_t n = ggml_nelements(dst);
                            GGML_ASSERT(n % 4 == 0);
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_UNARY:
                        switch (ggml_get_unary_op(gf->nodes[i])) {
@ -873,9 +923,10 @@ void ggml_metal_graph_compute(
                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                                    const int64_t n = ggml_nelements(dst)/4;
+                                    const int64_t n = ggml_nelements(dst);
                                    GGML_ASSERT(n % 4 == 0);
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                } break;
                            case GGML_UNARY_OP_RELU:
                                {
@ -893,9 +944,10 @@ void ggml_metal_graph_compute(
                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                                    const int64_t n = ggml_nelements(dst)/4;
+                                    const int64_t n = ggml_nelements(dst);
                                    GGML_ASSERT(n % 4 == 0);
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                } break;
                            default:
                                {
@ -903,6 +955,17 @@ void ggml_metal_graph_compute(
                                    GGML_ASSERT(false);
                                }
                        } break;
                    case GGML_OP_SQR:
                        {
                            GGML_ASSERT(ggml_is_contiguous(src0));
                            [encoder setComputePipelineState:ctx->pipeline_sqr];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
                            const int64_t n = ggml_nelements(dst);
                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_SOFT_MAX:
                        {
                            const int nth = MIN(32, ne00);
@ -944,21 +1007,46 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_MUL_MAT:
                        {
                            // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
                            GGML_ASSERT(ne00 == ne10);
                            // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
                            uint gqa = ne12/ne02;
                            GGML_ASSERT(ne03 == ne13);
                            const uint gqa = ne12/ne02;
                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                            // to the matrix-vector kernel
                            int ne11_mm_min = 1;
 #if 0
                            // the numbers below are measured on M2 Ultra for 7B and 13B models
                            // these numbers do not translate to other devices or model sizes
                            // TODO: need to find a better approach
                            if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
                                switch (src0t) {
                                    case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
                                    case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
                                    case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
                                    case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
                                    case GGML_TYPE_Q4_0:
                                    case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
                                    case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
                                    case GGML_TYPE_Q5_0:                          // not tested yet
                                    case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
                                    case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
                                    case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
                                    default:             ne11_mm_min = 1;  break;
                                }
                            }
 #endif
                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                            if (!ggml_is_transposed(src0) &&
+                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
                                !ggml_is_transposed(src0) &&
                                !ggml_is_transposed(src1) &&
                                src1t == GGML_TYPE_F32 &&
-                                [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                                ne00 % 32 == 0 && ne00 >= 64 &&
-                                ne00%32 == 0 &&
+                                ne11 > ne11_mm_min) {
-                                ne11 > 2) {
+                                //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
                                switch (src0->type) {
                                    case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32];  break;
                                    case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32];  break;
@ -987,17 +1075,18 @@ void ggml_metal_graph_compute(
                                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:12];
                                [encoder setBytes:&gqa     length:sizeof(gqa)  atIndex:13];
                                [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                            } else {
                                int nth0 = 32;
                                int nth1 = 1;
                                int nrows = 1;
                                //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
                                // use custom matrix x vector kernel
                                switch (src0t) {
                                    case GGML_TYPE_F32:
                                        {
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32];
                                            nrows = 4;
                                        } break;
                                    case GGML_TYPE_F16:
@ -1005,12 +1094,12 @@ void ggml_metal_graph_compute(
                                            nth0 = 32;
                                            nth1 = 1;
                                            if (ne11 * ne12 < 4) {
-                                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
+                                                [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row];
                                            } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
+                                                [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4];
                                                nrows = ne11;
                                            } else {
-                                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                                                [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32];
                                                nrows = 4;
                                            }
                                        } break;
@ -1021,7 +1110,7 @@ void ggml_metal_graph_compute(
                                            nth0 = 8;
                                            nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32];
                                        } break;
                                    case GGML_TYPE_Q4_1:
                                        {
@ -1030,7 +1119,7 @@ void ggml_metal_graph_compute(
                                            nth0 = 8;
                                            nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32];
                                        } break;
                                    case GGML_TYPE_Q8_0:
                                        {
@ -1039,7 +1128,7 @@ void ggml_metal_graph_compute(
                                            nth0 = 8;
                                            nth1 = 8;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32];
                                        } break;
                                    case GGML_TYPE_Q2_K:
                                        {
@ -1048,7 +1137,7 @@ void ggml_metal_graph_compute(
                                            nth0 = 2;
                                            nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32];
                                        } break;
                                    case GGML_TYPE_Q3_K:
                                        {
@ -1057,7 +1146,7 @@ void ggml_metal_graph_compute(
                                            nth0 = 2;
                                            nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32];
                                        } break;
                                    case GGML_TYPE_Q4_K:
                                        {
@ -1066,7 +1155,7 @@ void ggml_metal_graph_compute(
                                            nth0 = 4; //1;
                                            nth1 = 8; //32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32];
                                        } break;
                                    case GGML_TYPE_Q5_K:
                                        {
@ -1075,7 +1164,7 @@ void ggml_metal_graph_compute(
                                            nth0 = 2;
                                            nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32];
                                        } break;
                                    case GGML_TYPE_Q6_K:
                                        {
@ -1084,7 +1173,7 @@ void ggml_metal_graph_compute(
                                            nth0 = 2;
                                            nth1 = 32;
-                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32];
                                        } break;
                                    default:
                                        {
@ -1113,7 +1202,7 @@ void ggml_metal_graph_compute(
                                [encoder setBytes:&gqa  length:sizeof(gqa)  atIndex:17];
                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
-                                    src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
+                                    src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                                else if (src0t == GGML_TYPE_Q4_K) {
@ -1166,6 +1255,8 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_RMS_NORM:
                        {
                            GGML_ASSERT(ne00 % 4 == 0);
                            float eps;
                            memcpy(&eps, dst->op_params, sizeof(float));
@ -1208,17 +1299,14 @@ void ggml_metal_graph_compute(
                            const int nth = MIN(1024, ne00);
-                            const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
+                            //const int n_past = ((int32_t *) dst->op_params)[0];
                            const int n_head = ((int32_t *) dst->op_params)[1];
                            float max_bias;
                            memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
                            if (__builtin_popcount(n_head) != 1) {
                                GGML_ASSERT(false && "only power-of-two n_head implemented");
                            }
                            const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
                            const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
                            const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
                            [encoder setComputePipelineState:ctx->pipeline_alibi_f32];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@ -1239,7 +1327,9 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
                            [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&m0  length:sizeof(    float) atIndex:18];
+                            [encoder setBytes:&m0   length:sizeof(   float) atIndex:18];
                            [encoder setBytes:&m1   length:sizeof(   float) atIndex:19];
                            [encoder setBytes:&n_heads_log2_floor   length:sizeof(int) atIndex:20];
                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
@ -1372,3 +1462,140 @@ void ggml_metal_graph_compute(
    }
 }
 ////////////////////////////////////////////////////////////////////////////////
 // backend interface
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
    return "Metal";
    UNUSED(backend);
 }
 static void ggml_backend_metal_free(ggml_backend_t backend) {
    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
    ggml_metal_free(ctx);
    free(backend);
 }
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
    return (void *)buffer->context;
 }
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context);
    UNUSED(buffer);
 }
 static struct ggml_backend_buffer_i metal_backend_buffer_i = {
    /* .free_buffer    = */ ggml_backend_metal_buffer_free_buffer,
    /* .get_base       = */ ggml_backend_metal_buffer_get_base,
    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
    /* .init_tensor    = */ NULL, // no initialization required
    /* .free_tensor    = */ NULL, // no cleanup required
 };
 static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
    void * data = ggml_metal_host_malloc(size);
    // TODO: set proper name of the buffers
    ggml_metal_add_buffer(ctx, "backend", data, size, 0);
    return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
 }
 static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
    return 32;
    UNUSED(backend);
 }
 static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    memcpy((char *)tensor->data + offset, data, size);
    UNUSED(backend);
 }
 static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    memcpy(data, (const char *)tensor->data + offset, size);
    UNUSED(backend);
 }
 static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
    UNUSED(backend);
 }
 static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
    UNUSED(backend);
 }
 static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
    UNUSED(backend);
 }
 static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
    ggml_metal_graph_compute(metal_ctx, cgraph);
 }
 static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    return true;
    UNUSED(backend);
    UNUSED(op);
 }
 static struct ggml_backend_i metal_backend_i = {
    /* .get_name            = */ ggml_backend_metal_name,
    /* .free                = */ ggml_backend_metal_free,
    /* .alloc_buffer        = */ ggml_backend_metal_alloc_buffer,
    /* .get_alignment       = */ ggml_backend_metal_get_alignment,
    /* .set_tensor_async    = */ ggml_backend_metal_set_tensor_async,
    /* .get_tensor_async    = */ ggml_backend_metal_get_tensor_async,
    /* .synchronize         = */ ggml_backend_metal_synchronize,
    /* .cpy_tensor_from     = */ ggml_backend_metal_cpy_tensor_from,
    /* .cpy_tensor_to       = */ ggml_backend_metal_cpy_tensor_to,
    /* .graph_plan_create   = */ NULL, // the metal implementation does not require creating graph plans atm
    /* .graph_plan_free     = */ NULL,
    /* .graph_plan_compute  = */ NULL,
    /* .graph_compute       = */ ggml_backend_metal_graph_compute,
    /* .supports_op         = */ ggml_backend_metal_supports_op,
 };
 ggml_backend_t ggml_backend_metal_init(void) {
    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
    ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
    ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
    *metal_backend = (struct ggml_backend) {
        /* .interface = */ metal_backend_i,
        /* .context   = */ ctx,
    };
    return metal_backend;
 }
 bool ggml_backend_is_metal(ggml_backend_t backend) {
    return backend->iface.get_name == ggml_backend_metal_name;
 }
 void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
    ggml_metal_set_n_cb(ctx, n_cb);
 }
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@ -13,8 +13,8 @@ typedef struct {
 #define QK4_1 32
 typedef struct {
-    half d;          // delta
+    half d;                 // delta
-    half m;          // min
+    half m;                 // min
    uint8_t qs[QK4_1 / 2];  // nibbles / quants
 } block_q4_1;
@ -132,6 +132,13 @@ kernel void kernel_relu(
    dst[tpig] = max(0.0f, src0[tpig]);
 }
 kernel void kernel_sqr(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] * src0[tpig];
 }
 constant float GELU_COEF_A    = 0.044715f;
 constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
@ -338,10 +345,11 @@ kernel void kernel_rms_norm(
        uint sgitg[[simdgroup_index_in_threadgroup]],
        uint tiisg[[thread_index_in_simdgroup]],
        uint   ntg[[threads_per_threadgroup]]) {
-    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
+    device const float4 * x        = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
-    device const float * x_scalar = (device const float *) x;
+    device const float  * x_scalar = (device const float  *) x;
-    float4 sumf=0;
+
-    float all_sum=0;
+    float4 sumf = 0;
    float all_sum = 0;
    // parallel sum
    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
@ -354,6 +362,7 @@ kernel void kernel_rms_norm(
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    // broadcast, simd group number is ntg / 32
    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
       if (tpitg < i) {
@ -361,7 +370,9 @@ kernel void kernel_rms_norm(
       }
    }
    if (tpitg == 0) {
-        for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
            sum[0] += x_scalar[i];
        }
        sum[0] /= ne00;
    }
@ -376,7 +387,9 @@ kernel void kernel_rms_norm(
        y[i00] = x[i00] * scale;
    }
    if (tpitg == 0) {
-        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
            y_scalar[i00] = x_scalar[i00] * scale;
        }
    }
 }
@ -416,8 +429,8 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
 }
 // putting them in the kernel cause a significant performance penalty
-#define N_DST 4 // each SIMD group works on 4 rows
+#define N_DST 4        // each SIMD group works on 4 rows
-#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
+#define N_SIMDGROUP 2  // number of SIMD groups in a thread group
 #define N_SIMDWIDTH 32 // assuming SIMD group size is 32
 //Note: This is a template, but strictly speaking it only applies to
 //      quantizations where the block size is 32. It also does not
@ -428,18 +441,23 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
                    int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa,
                    uint3 tgpig, uint tiisg, uint sgitg) {
    const int nb = ne00/QK4_0;
    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;
    const int first_row = (r0 * nsg + sgitg) * nr;
    const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
    device const block_q_type * x = (device const block_q_type *) src0 + offset0;
    device const float        * y = (device const float        *) src1 + r1*ne10 + im*ne00*ne1;
    float yl[16];       // src1 vector cache
    float sumf[nr]={0.f};
-    const int ix = tiisg/2;
+    float yl[16]; // src1 vector cache
-    const int il = 8*(tiisg%2);
+    float sumf[nr] = {0.f};
    const int ix = (tiisg/2);
    const int il = (tiisg%2)*8;
    device const float * yb = y + ix * QK4_0 + il;
@ -450,6 +468,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
            sumy += yb[i] + yb[i+1];
            yl[i+0] = yb[i+ 0];
            yl[i+1] = yb[i+ 1]/256.f;
            sumy += yb[i+16] + yb[i+17];
            yl[i+8] = yb[i+16]/16.f;
            yl[i+9] = yb[i+17]/4096.f;
@ -465,12 +484,12 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
    for (int row = 0; row < nr; ++row) {
        const float tot = simd_sum(sumf[row]);
        if (tiisg == 0 && first_row + row < ne01) {
-            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
+            dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
        }
    }
 }
-kernel void kernel_mul_mat_q4_0_f32(
+kernel void kernel_mul_mv_q4_0_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@ -483,12 +502,12 @@ kernel void kernel_mul_mat_q4_0_f32(
        constant   int64_t & ne1[[buffer(16)]],
        constant   uint    & gqa[[buffer(17)]],
        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]],
+        uint  tiisg[[thread_index_in_simdgroup]],
-        uint sgitg[[simdgroup_index_in_threadgroup]]) {
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
    mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
 }
-kernel void kernel_mul_mat_q4_1_f32(
+kernel void kernel_mul_mv_q4_1_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@ -508,7 +527,7 @@ kernel void kernel_mul_mat_q4_1_f32(
 #define NB_Q8_0 8
-kernel void kernel_mul_mat_q8_0_f32(
+kernel void kernel_mul_mv_q8_0_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@ -572,7 +591,7 @@ kernel void kernel_mul_mat_q8_0_f32(
 #define N_F32_F32 4
-kernel void kernel_mul_mat_f32_f32(
+kernel void kernel_mul_mv_f32_f32(
        device const  char * src0,
        device const  char * src1,
        device       float * dst,
@ -643,7 +662,7 @@ kernel void kernel_mul_mat_f32_f32(
    }
 }
-kernel void kernel_mul_mat_f16_f32_1row(
+kernel void kernel_mul_mv_f16_f32_1row(
        device const  char * src0,
        device const  char * src1,
        device       float * dst,
@ -662,7 +681,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint tiisg[[thread_index_in_simdgroup]]) {
+        uint  tiisg[[thread_index_in_simdgroup]]) {
    const int64_t r0 = tgpig.x;
    const int64_t r1 = tgpig.y;
@ -697,7 +716,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
 #define N_F16_F32 4
-kernel void kernel_mul_mat_f16_f32(
+kernel void kernel_mul_mv_f16_f32(
        device const  char * src0,
        device const  char * src1,
        device       float * dst,
@ -769,7 +788,7 @@ kernel void kernel_mul_mat_f16_f32(
 }
 // Assumes row size (ne00) is a multiple of 4
-kernel void kernel_mul_mat_f16_f32_l4(
+kernel void kernel_mul_mv_f16_f32_l4(
        device const  char * src0,
        device const  char * src1,
        device       float * dst,
@ -830,7 +849,9 @@ kernel void kernel_alibi_f32(
        constant  uint64_t & nb1,
        constant  uint64_t & nb2,
        constant  uint64_t & nb3,
-        constant      float & m0,
+        constant     float & m0,
        constant     float & m1,
        constant       int & n_heads_log2_floor,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
@ -846,7 +867,12 @@ kernel void kernel_alibi_f32(
    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
    device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-    float m_k = pow(m0, i2 + 1);
+    float m_k;
    if (i2 < n_heads_log2_floor) {
        m_k = pow(m0, i2 + 1);
    } else {
        m_k = pow(m1, 2 * (i2 - n_heads_log2_floor) + 1);
    }
    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
        dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1);
@ -1091,6 +1117,62 @@ kernel void kernel_cpy_f32_f32(
    }
 }
 kernel void kernel_concat(
    device const char * src0,
    device const char * src1,
    device       char * dst,
    constant   int64_t & ne00,
    constant   int64_t & ne01,
    constant   int64_t & ne02,
    constant   int64_t & ne03,
    constant  uint64_t & nb00,
    constant  uint64_t & nb01,
    constant  uint64_t & nb02,
    constant  uint64_t & nb03,
    constant   int64_t & ne10,
    constant   int64_t & ne11,
    constant   int64_t & ne12,
    constant   int64_t & ne13,
    constant  uint64_t & nb10,
    constant  uint64_t & nb11,
    constant  uint64_t & nb12,
    constant  uint64_t & nb13,
    constant   int64_t & ne0,
    constant   int64_t & ne1,
    constant   int64_t & ne2,
    constant   int64_t & ne3,
    constant  uint64_t & nb0,
    constant  uint64_t & nb1,
    constant  uint64_t & nb2,
    constant  uint64_t & nb3,
    uint3 tgpig[[threadgroup_position_in_grid]],
    uint3 tpitg[[thread_position_in_threadgroup]],
    uint3   ntg[[threads_per_threadgroup]]) {
    const int64_t i03 = tgpig.z;
    const int64_t i02 = tgpig.y;
    const int64_t i01 = tgpig.x;
    const int64_t i13 = i03 % ne13;
    const int64_t i12 = i02 % ne12;
    const int64_t i11 = i01 % ne11;
    device const char * src0_ptr = src0 + i03 * nb03 + i02 * nb02 + i01 * nb01 + tpitg.x*nb00;
    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + tpitg.x*nb0;
    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
        if (i02 < ne02) {
            ((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
            src0_ptr += ntg.x*nb00;
        } else {
            ((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
            src1_ptr += ntg.x*nb10;
        }
        dst_ptr += ntg.x*nb0;
    }
 }
 //============================================ k-quants ======================================================
 #ifndef QK_K
@ -1183,7 +1265,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
 //====================================== dot products =========================
-kernel void kernel_mul_mat_q2_K_f32(
+kernel void kernel_mul_mv_q2_K_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@ -1327,7 +1409,7 @@ kernel void kernel_mul_mat_q2_K_f32(
 }
 #if QK_K == 256
-kernel void kernel_mul_mat_q3_K_f32(
+kernel void kernel_mul_mv_q3_K_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@ -1479,7 +1561,7 @@ kernel void kernel_mul_mat_q3_K_f32(
    }
 }
 #else
-kernel void kernel_mul_mat_q3_K_f32(
+kernel void kernel_mul_mv_q3_K_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@ -1550,7 +1632,7 @@ kernel void kernel_mul_mat_q3_K_f32(
 #endif
 #if QK_K == 256
-kernel void kernel_mul_mat_q4_K_f32(
+kernel void kernel_mul_mv_q4_K_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@ -1656,7 +1738,7 @@ kernel void kernel_mul_mat_q4_K_f32(
    }
 }
 #else
-kernel void kernel_mul_mat_q4_K_f32(
+kernel void kernel_mul_mv_q4_K_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@ -1745,7 +1827,7 @@ kernel void kernel_mul_mat_q4_K_f32(
 }
 #endif
-kernel void kernel_mul_mat_q5_K_f32(
+kernel void kernel_mul_mv_q5_K_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@ -1918,7 +2000,7 @@ kernel void kernel_mul_mat_q5_K_f32(
 }
-kernel void kernel_mul_mat_q6_K_f32(
+kernel void kernel_mul_mv_q6_K_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@ -2256,7 +2338,7 @@ kernel void kernel_get_rows(
 }
 #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
-#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
+#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
 #define BLOCK_SIZE_K 32
 #define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
 #define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
@ -2293,9 +2375,11 @@ kernel void kernel_mul_mm(device const  uchar * src0,
    const uint r0 = tgpig.y;
    const uint r1 = tgpig.x;
    const uint im = tgpig.z;
    // if this block is of 64x32 shape or smaller
    short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
    short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
    // a thread shouldn't load data outside of the matrix
    short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
    short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
@ -2319,26 +2403,30 @@ kernel void kernel_mul_mm(device const  uchar * src0,
        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
    for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
-        //load data and store to threadgroup memory
+        // load data and store to threadgroup memory
        half4x4 temp_a;
        dequantize_func(x, il, temp_a);
        threadgroup_barrier(mem_flags::mem_threadgroup);
        #pragma unroll(16)
        for (int i = 0; i < 16; i++) {
            *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-            + 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \
+            +                     (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
-            + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
+            +                     (tiitg / THREAD_PER_ROW) % 8  + (i & 7) * 8) = temp_a[i/4][i%4];
        }
-        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \
+
-                = *((device float2x4 *)y);
+        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
        il = (il + 2 < nl) ? il + 2 : il % 2;
        x  = (il < 2) ? x + (2+nl-1)/nl : x;
        y += BLOCK_SIZE_K;
        threadgroup_barrier(mem_flags::mem_threadgroup);
-        //load matrices from threadgroup memory and conduct outer products
+
        // load matrices from threadgroup memory and conduct outer products
        threadgroup half  * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
        threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
        #pragma unroll(4)
        for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
            #pragma unroll(4)
@ -2353,6 +2441,7 @@ kernel void kernel_mul_mm(device const  uchar * src0,
            lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
            lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
            #pragma unroll(8)
            for (int i = 0; i < 8; i++){
                simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
@ -2361,25 +2450,26 @@ kernel void kernel_mul_mm(device const  uchar * src0,
    }
    if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
-        device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \
+        device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg &  1)) \
-                          + (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0;
+                               + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
        for (int i = 0; i < 8; i++) {
            simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
        }
    } else {
        // block is smaller than 64x32, we should avoid writing data outside of the matrix
        threadgroup_barrier(mem_flags::mem_threadgroup);
-        threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+        threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
                                      + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
        for (int i = 0; i < 8; i++) {
            simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
        }
        threadgroup_barrier(mem_flags::mem_threadgroup);
-        device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
+
-        if (sgitg==0) {
+        device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
        if (sgitg == 0) {
            for (int i = 0; i < n_rows; i++) {
-                for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) {
+                for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
                    *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
                }
            }
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@ -19,7 +19,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
-#define CL_DMMV_BLOCK_SIZE 32
+#define CL_DMMV_LOCAL_SIZE 32
 #ifndef K_QUANTS_PER_ITERATION
 #define K_QUANTS_PER_ITERATION 1
@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8
 __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
    const int tid = get_local_id(0);
    const int n = tid / 32;
    const int l = tid - 32 * n;
    const int is = 8 * n + l / 16;
    const uint8_t q = x[i].qs[32 * n + l];
-    __global float *y = yy + i * QK_K + 128 * n;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n;
    const float dall = vload_half(0, &x[i].d);
    const float dmin = vload_half(0, &x[i].dmin);
@ -223,7 +223,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
 __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
 {
    int r = get_local_id(0) / 4;
-    int i = get_group_id(0);
+    int i = get_group_id(0) + get_global_offset(0);
    int tid = r / 2;
    int is0 = r % 2;
    int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
    float d_all = vload_half(0, &x[i].d);
    float dl = d_all * (us - 32);
-    __global float *y = yy + i * QK_K + 128 * n + 32 * j;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
    const __global uint8_t *q = x[i].qs + 32 * n;
    const __global uint8_t *hm = x[i].hmask;
@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
 __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
    const int tid = get_local_id(0);
    const int il = tid / 8;
    const int ir = tid % 8;
    const int is = 2 * il;
    const int n = 4;
-    __global float *y = yy + i * QK_K + 64 * il + n * ir;
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;
    const float dall = vload_half(0, &x[i].d);
    const float dmin = vload_half(0, &x[i].dmin);
@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
 __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
    const int tid = get_local_id(0);
    const int il = tid / 16;
    const int ir = tid % 16;
    const int is = 2 * il;
-    __global float *y = yy + i * QK_K + 64 * il + 2 * ir;
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;
    const float dall = vload_half(0, &x[i].d);
    const float dmin = vload_half(0, &x[i].dmin);
@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
 __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
    const int tid = get_local_id(0);
    const int ip = tid / 32;
    const int il = tid - 32 * ip;
    const int is = 8 * ip + il / 16;
-    __global float *y = yy + i * QK_K + 128 * ip + il;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;
    const float d = vload_half(0, &x[i].d);
@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
    const int row = get_group_id(0);
    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
    __global const struct block_q2_K * x = xx + ib0;
@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
    const int row = get_group_id(0);
    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
    __global const struct block_q3_K * x = xx + ib0;
@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,
    const int row = get_group_id(0);
    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
    const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION;  // 0...15
    const int ix  = get_local_id(0)%K_QUANTS_PER_ITERATION;
@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,
    const int row = get_group_id(0);
    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
    const int tid = get_local_id(0)/2;  // 0...15
    const int ix  = get_local_id(0)%2;
@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
    const int row = get_group_id(0);
    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
+    const int ib0 = row*num_blocks_per_row + get_global_offset(0);
    __global const struct block_q6_K * x = xx + ib0;
@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
    const uint qk = QUANT_K;
    const uint qr = QUANT_R;
-    const int ib = i/qk; // block index
+    const int ib = i/qk + get_global_offset(0); // block index
    const int iqs = (i%qk)/qr; // quant index
    const int iybs = i - i%qk; // y block start index
    const int y_offset = qr == 1 ? 1 : qk/2;
@ -745,19 +745,21 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
 std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
-    const int block_size = get_local_size(0);
+    const int local_size = get_local_size(0);
    const int row = get_group_id(0);
    const int tid = get_local_id(0);
    const uint qk = QUANT_K;
    const uint qr = QUANT_R;
    const int col_step = local_size * 2;
    const int y_offset = qr == 1 ? 1 : qk/2;
    x += get_global_offset(0);
    tmp[tid] = 0;
-    for (int i = 0; i < ncols/block_size; i += 2) {
+    for (int col = tid*2; col < ncols; col += col_step) {
        const int col = i*block_size + 2*tid;
        const int ib = (row*ncols + col)/qk; // block index
        const int iqs = (col%qk)/qr; // quant index
        const int iybs = col - col%qk; // y block start index
@ -773,7 +775,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
    // sum up partial sums and write back result
    barrier(CLK_LOCAL_MEM_FENCE);
-    for (int s=block_size/2; s>0; s>>=1) {
+    for (int s=local_size/2; s>0; s>>=1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
@ -1349,30 +1351,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
    const enum ggml_type type = src->type;
    const size_t ts = ggml_type_size(type);
    const size_t bs = ggml_blck_size(type);
    const uint64_t row_size = ts*ne0/bs;
-    const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
+    const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
-    if (nb0 == ts && nb1 == ts*ne0/bs) {
+    if (nb0 == ts && nb1 == row_size) {
-        err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
+        return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
        return err;
    }
    if (nb0 == ts) {
        const size_t buffer_origin[3] = { offset, 0, 0 };
        const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts*ne0/bs, ne1, 1 };
+        const size_t region[3] = { row_size, ne1, 1 };
-        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
+        return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
        return err;
    }
    std::vector<cl_event> events;
    if (ev && ne1>1) events.reserve(ne1-1);
    for (uint64_t i1 = 0; i1 < ne1; i1++) {
        // pretend the row is a matrix with cols=1
-        const size_t buffer_origin[3] = { offset, i1*nb1, 0 };
+        const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
-        const size_t host_origin[3] = { 0, i1*ts*ne0/bs, 0 };
+        const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts/bs, ne0, 1 };
+        const size_t region[3] = { ts, ne0/bs, 1 };
-        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
+        // if an event is requested, make the last write wait for all previous writes to complete
        if (ev && i1) {
            events.push_back(*ev);
        }
        cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
        if (err != CL_SUCCESS) {
-            break;
+            for (auto event : events) {
                clReleaseEvent(event);
            }
            return err;
        }
    }
-    return err;
+    for (auto event : events) {
        CL_CHECK(clReleaseEvent(event));
    }
    return CL_SUCCESS;
 }
 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -1476,10 +1490,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];
    const int64_t ne13 = src1->ne[3];
    const int nb2  = dst->nb[2];
    const int nb3  = dst->nb[3];
    const int64_t r2 = ne12 / ne02;
    const int64_t r3 = ne13 / ne03;
    const float alpha = 1.0f;
    const float beta = 0.0f;
    const int x_ne = ne01 * ne00;
@ -1498,13 +1517,25 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
+    size_t x_offset = 0;
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
    int64_t pi03 = -1;
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        int64_t i03 = i13 / r3;
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            int64_t i02 = i12 / r2;
            // copy data to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend == GGML_BACKEND_GPU) {
                x_offset = (i03 * ne02 + i02) * x_ne;
            } else if (i02 != pi02 || i03 != pi03) {
                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                pi02 = i02;
                pi03 = i03;
            }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
            CL_CHECK(clFinish(queue));
@ -1514,7 +1545,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                                                       clblast::Transpose::kYes, clblast::Transpose::kNo,
                                                       ne01, ne11, ne10,
                                                       alpha,
-                                                       d_X, 0, ne00,
+                                                       d_X, x_offset, ne00,
                                                       d_Y, 0, ne10,
                                                       beta,
                                                       d_D, 0, ne01,
@ -1525,7 +1556,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
            }
            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
        }
    }
@ -1547,6 +1578,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];
    const int64_t ne13 = src1->ne[3];
    const int nb10 = src1->nb[0];
    const int nb11 = src1->nb[1];
@ -1556,6 +1589,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
    const int nb2  = dst->nb[2];
    const int nb3  = dst->nb[3];
    const int64_t r2 = ne12 / ne02;
    const int64_t r3 = ne13 / ne03;
    const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
    const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
    const int x_ne = ne01 * ne00;
@ -1577,32 +1613,44 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
    bool src1_cont_rows = nb10 == sizeof(float);
    bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
+    size_t x_offset = 0;
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
    int64_t pi03 = -1;
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        int64_t i03 = i13 / r3;
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            int64_t i02 = i12 / r2;
            // copy src0 to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend == GGML_BACKEND_GPU) {
                x_offset = (i03 * ne02 + i02) * x_ne;
            } else if (i02 != pi02 || i03 != pi03) {
                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                pi02 = i02;
                pi03 = i03;
            }
            // convert src1 to fp16
            // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
-            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
            if (src1_cont_rows) {
                if (src1_cont_cols) {
                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                }
                else {
-                    for (int64_t i01 = 0; i01 < ne11; i01++) {
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                    }
                }
            }
            else {
-                for (int64_t i01 = 0; i01 < ne11; i01++) {
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
-                    for (int64_t i00 = 0; i00 < ne10; i00++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
                        // very slow due to no inlining
-                        tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
                    }
                }
            }
@ -1618,7 +1666,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                                                       clblast::Transpose::kYes, clblast::Transpose::kNo,
                                                       ne01, ne11, ne10,
                                                       alpha,
-                                                       d_X, 0, ne00,
+                                                       d_X, x_offset, ne00,
                                                       d_Y, 0, ne10,
                                                       beta,
                                                       d_D, 0, ne01,
@ -1631,7 +1679,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
            // copy dst to host, then convert to float
            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
            ggml_fp16_to_fp32_row(tmp, d, d_ne);
        }
@ -1652,18 +1700,24 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    const int64_t ne12 = src1->ne[2];
    const int64_t ne13 = src1->ne[3];
    const int nb2  = dst->nb[2];
    const int nb3  = dst->nb[3];
    const ggml_type type = src0->type;
-    const bool mul_mat_vec = ne11 == 1;
+    const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
    const int64_t r2 = ne12 / ne02;
    const int64_t r3 = ne13 / ne03;
    const float alpha = 1.0f;
    const float beta = 0.0f;
    const int x_ne = ne01 * ne00;
    const int y_ne = ne11 * ne10;
    const int d_ne = ne11 * ne01;
-    const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
+    const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
    const size_t q_sz = ggml_type_size(type) * x_bps;
    size_t x_size;
    size_t y_size;
@ -1685,17 +1739,28 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
    GGML_ASSERT(to_fp32_cl != nullptr);
    const size_t global_denom = ggml_cl_global_denom(type);
-    const size_t local = ggml_cl_local_size(type);
+    const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
    size_t ev_idx = 0;
    std::vector<cl_event> events;
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
+    int64_t pi02 = -1;
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi03 = -1;
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        int64_t i03 = i13 / r3;
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            int64_t i02 = i12 / r2;
            // copy src0 to device if necessary
            if (src0->backend == GGML_BACKEND_CPU) {
-                events.emplace_back();
+                if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    events.emplace_back();
                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
                    pi02 = i02;
                    pi03 = i03;
                }
            } else if (src0->backend == GGML_BACKEND_GPU) {
                d_Q = (cl_mem) src0->extra;
            } else {
@ -1704,11 +1769,11 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                // copy src1 to device
                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
                // compute
-                const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
+                const size_t global = ne01 * local;
-                const size_t local = CL_DMMV_BLOCK_SIZE;
+                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                const cl_int ncols = ne00;
                events.emplace_back();
                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
@ -1716,16 +1781,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                // convert src0 to fp32 on device
                const size_t global = x_ne / global_denom;
                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
                // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
                events.emplace_back();
@ -1749,7 +1815,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
            }
            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
            for (auto *event : events) {
                clReleaseEvent(event);
@ -1844,17 +1910,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
    const int64_t ne3 = tensor->ne[3];
    const ggml_type type = tensor->type;
-    const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+    const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
    const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
    size_t q_size;
    cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
    tensor->data = data;
    // copy tensor to device
    size_t offset = 0;
    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            int i = i3*ne2 + i2;
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
+            offset += s_sz;
        }
    }
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -326,7 +326,7 @@ extern "C" {
        GGML_TYPE_COUNT,
    };
-    enum ggml_backend {
+    enum ggml_backend_type {
        GGML_BACKEND_CPU = 0,
        GGML_BACKEND_GPU = 10,
        GGML_BACKEND_GPU_SPLIT = 20,
@ -401,10 +401,14 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_1D,
        GGML_OP_CONV_2D,
        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
        GGML_OP_CONV_1D_STAGE_0,  // internal
        GGML_OP_CONV_1D_STAGE_1,  // internal
        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_FLASH_ATTN,
@ -475,8 +479,10 @@ extern "C" {
    // n-dimensional tensor
    struct ggml_tensor {
-        enum ggml_type    type;
+        enum ggml_type         type;
-        enum ggml_backend backend;
+        enum ggml_backend_type backend;
        struct ggml_backend_buffer * buffer;
        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
@ -510,7 +516,7 @@ extern "C" {
        void * extra; // extra things e.g. for ggml-cuda.cu
-        char padding[4];
+        char padding[12];
    };
    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -698,6 +704,9 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
    // Context tensor enumeration and lookup
    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@ -1354,7 +1363,7 @@ extern "C" {
    // alibi position embedding
    // in-place, returns view(a)
-    struct ggml_tensor * ggml_alibi(
+    GGML_API struct ggml_tensor * ggml_alibi(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
@ -1363,7 +1372,7 @@ extern "C" {
    // clamp
    // in-place, returns view(a)
-    struct ggml_tensor * ggml_clamp(
+    GGML_API struct ggml_tensor * ggml_clamp(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            float                 min,
@ -1386,6 +1395,14 @@ extern "C" {
            int                   s,
            int                   d);
    GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            int                   s0,
            int                   p0,
            int                   d0);
    GGML_API struct ggml_tensor * ggml_conv_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -1759,6 +1776,7 @@ extern "C" {
        GGML_OPT_NO_CONTEXT,
        GGML_OPT_INVALID_WOLFE,
        GGML_OPT_FAIL,
        GGML_OPT_CANCEL,
        GGML_LINESEARCH_FAIL = -128,
        GGML_LINESEARCH_MINIMUM_STEP,
@ -2089,7 +2107,7 @@ extern "C" {
        enum ggml_type    vec_dot_type;
    } ggml_type_traits_t;
-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 #ifdef  __cplusplus
 }
--- a/gguf-py/README.md
+++ b/gguf-py/README.md
@ -69,4 +69,3 @@ python -m twine upload dist/*
 ## TODO
 - [ ] Add tests
 - [ ] Include conversion scripts as command line entry points in this package.
 - Add CI workflow for releasing the package.
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@ -85,26 +85,34 @@ class MODEL_ARCH(IntEnum):
    GPTNEOX       : int = auto()
    MPT           : int = auto()
    STARCODER     : int = auto()
    PERSIMMON     : int = auto()
    REFACT        : int = auto()
    BERT          : int = auto()
    BLOOM         : int = auto()
 class MODEL_TENSOR(IntEnum):
-    TOKEN_EMBD   : int = auto()
+    TOKEN_EMBD      : int = auto()
-    POS_EMBD     : int = auto()
+    TOKEN_EMBD_NORM : int = auto()
-    OUTPUT       : int = auto()
+    TOKEN_TYPES     : int = auto()
-    OUTPUT_NORM  : int = auto()
+    POS_EMBD        : int = auto()
-    ROPE_FREQS   : int = auto()
+    OUTPUT          : int = auto()
-    ATTN_Q       : int = auto()
+    OUTPUT_NORM     : int = auto()
-    ATTN_K       : int = auto()
+    ROPE_FREQS      : int = auto()
-    ATTN_V       : int = auto()
+    ATTN_Q          : int = auto()
-    ATTN_QKV     : int = auto()
+    ATTN_K          : int = auto()
-    ATTN_OUT     : int = auto()
+    ATTN_V          : int = auto()
-    ATTN_NORM    : int = auto()
+    ATTN_QKV        : int = auto()
-    ATTN_NORM_2  : int = auto()
+    ATTN_OUT        : int = auto()
-    ATTN_ROT_EMBD: int = auto()
+    ATTN_NORM       : int = auto()
-    FFN_GATE     : int = auto()
+    ATTN_NORM_2     : int = auto()
-    FFN_DOWN     : int = auto()
+    ATTN_ROT_EMBD   : int = auto()
-    FFN_UP       : int = auto()
+    FFN_GATE        : int = auto()
-    FFN_NORM     : int = auto()
+    FFN_DOWN        : int = auto()
    FFN_UP          : int = auto()
    FFN_NORM        : int = auto()
    ATTN_Q_NORM     : int = auto()
    ATTN_K_NORM     : int = auto()
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@ -116,78 +124,183 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.GPTNEOX:        "gptneox",
    MODEL_ARCH.MPT:            "mpt",
    MODEL_ARCH.STARCODER:      "starcoder",
    MODEL_ARCH.PERSIMMON:      "persimmon",
    MODEL_ARCH.REFACT:         "refact",
    MODEL_ARCH.BERT:           "bert",
    MODEL_ARCH.BLOOM:          "bloom",
 }
-MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
+TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
-    MODEL_ARCH.LLAMA: {
+    MODEL_TENSOR.TOKEN_EMBD:      "token_embd",
-        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
-        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+    MODEL_TENSOR.TOKEN_TYPES:     "token_types",
-        MODEL_TENSOR.OUTPUT:        "output",
+    MODEL_TENSOR.POS_EMBD:        "position_embd",
-        MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
+    MODEL_TENSOR.OUTPUT_NORM:     "output_norm",
-        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+    MODEL_TENSOR.OUTPUT:          "output",
-        MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
+    MODEL_TENSOR.ROPE_FREQS:      "rope_freqs",
-        MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
+    MODEL_TENSOR.ATTN_NORM:       "blk.{bid}.attn_norm",
-        MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
+    MODEL_TENSOR.ATTN_NORM_2:     "blk.{bid}.attn_norm_2",
-        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+    MODEL_TENSOR.ATTN_QKV:        "blk.{bid}.attn_qkv",
-        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+    MODEL_TENSOR.ATTN_Q:          "blk.{bid}.attn_q",
-        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.ATTN_K:          "blk.{bid}.attn_k",
-        MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
+    MODEL_TENSOR.ATTN_V:          "blk.{bid}.attn_v",
-        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+    MODEL_TENSOR.ATTN_OUT:        "blk.{bid}.attn_output",
-        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    MODEL_TENSOR.ATTN_ROT_EMBD:   "blk.{bid}.attn_rot_embd",
-    },
+    MODEL_TENSOR.ATTN_Q_NORM:     "blk.{bid}.attn_q_norm",
-    MODEL_ARCH.GPTNEOX: {
+    MODEL_TENSOR.ATTN_K_NORM:     "blk.{bid}.attn_k_norm",
-        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+    MODEL_TENSOR.FFN_NORM:        "blk.{bid}.ffn_norm",
-        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+    MODEL_TENSOR.FFN_GATE:        "blk.{bid}.ffn_gate",
-        MODEL_TENSOR.OUTPUT:        "output",
+    MODEL_TENSOR.FFN_DOWN:        "blk.{bid}.ffn_down",
-        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+    MODEL_TENSOR.FFN_UP:          "blk.{bid}.ffn_up",
-        MODEL_TENSOR.ATTN_QKV:      "blk.{bid}.attn_qkv",
+}
-        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+
-        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
-        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+    MODEL_ARCH.LLAMA: [
-        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+        MODEL_TENSOR.TOKEN_EMBD,
-    },
+        MODEL_TENSOR.OUTPUT_NORM,
-    MODEL_ARCH.FALCON: {
+        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
+        MODEL_TENSOR.ROPE_FREQS,
-        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
+        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.OUTPUT:      "output",
+        MODEL_TENSOR.ATTN_Q,
-        MODEL_TENSOR.ATTN_NORM:   "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
+        MODEL_TENSOR.ATTN_V,
-        MODEL_TENSOR.ATTN_QKV:    "blk.{bid}.attn_qkv",
+        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.ATTN_OUT:    "blk.{bid}.attn_output",
+        MODEL_TENSOR.ATTN_ROT_EMBD,
-        MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
+        MODEL_TENSOR.FFN_GATE,
-    },
+        MODEL_TENSOR.FFN_DOWN,
-    MODEL_ARCH.BAICHUAN: {
+        MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+    ],
-        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+    MODEL_ARCH.GPTNEOX: [
-        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
+        MODEL_TENSOR.OUTPUT_NORM,
-        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
+        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
+        MODEL_TENSOR.ATTN_QKV,
-        MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
+        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
+    ],
-        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+    MODEL_ARCH.FALCON: [
-        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+        MODEL_TENSOR.TOKEN_EMBD,
-    },
+        MODEL_TENSOR.OUTPUT_NORM,
-    MODEL_ARCH.STARCODER: {
+        MODEL_TENSOR.OUTPUT,
-        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.ATTN_NORM,
-        MODEL_TENSOR.POS_EMBD:      "position_embd",
+        MODEL_TENSOR.ATTN_NORM_2,
-        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.ATTN_QKV,
-        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.ATTN_QKV:      "blk.{bid}.attn_qkv",
+        MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+    ],
-        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+    MODEL_ARCH.BAICHUAN: [
-        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.TOKEN_EMBD,
-        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+        MODEL_TENSOR.OUTPUT_NORM,
-    },
+        MODEL_TENSOR.OUTPUT,
-    MODEL_ARCH.GPT2: {
+        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.STARCODER: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.POS_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.BERT: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.TOKEN_TYPES,
        MODEL_TENSOR.POS_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.MPT: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.GPTJ: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.PERSIMMON: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.ATTN_Q_NORM,
        MODEL_TENSOR.ATTN_K_NORM,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.REFACT: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.BLOOM: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.TOKEN_EMBD_NORM,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
    MODEL_ARCH.GPT2: [
        # TODO
-    },
+    ],
    # TODO
 }
@ -201,6 +314,9 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.PERSIMMON: [
        MODEL_TENSOR.ROPE_FREQS,
    ]
 }
@ -208,31 +324,50 @@ class TensorNameMap:
    mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
        # Token embeddings
        MODEL_TENSOR.TOKEN_EMBD: (
-            "gpt_neox.embed_in",           # gptneox
+            "gpt_neox.embed_in",                        # gptneox
-            "transformer.wte",             # gpt2 mpt
+            "transformer.wte",                          # gpt2 gpt-j mpt refact
-            "transformer.word_embeddings", # falcon
+            "transformer.word_embeddings",              # falcon
-            "model.embed_tokens",          # llama-hf
+            "word_embeddings",                          # bloom
-            "tok_embeddings",              # llama-pth
+            "model.embed_tokens",                       # llama-hf
            "tok_embeddings",                           # llama-pth
            "embeddings.word_embeddings",               # bert
            "language_model.embedding.word_embeddings", # persimmon
        ),
        # Token type embeddings
        MODEL_TENSOR.TOKEN_TYPES: (
            "embeddings.token_type_embeddings",  # bert
        ),
        # Normalization of token embeddings
        MODEL_TENSOR.TOKEN_EMBD_NORM: (
            "word_embeddings_layernorm",  # bloom
        ),
        # Position embeddings
        MODEL_TENSOR.POS_EMBD: (
-            "transformer.wpe", # gpt2
+            "transformer.wpe",                 # gpt2
            "embeddings.position_embeddings",  # bert
        ),
        # Output
        MODEL_TENSOR.OUTPUT: (
-            "embed_out", # gptneox
+            "embed_out",                # gptneox
-            "lm_head",   # gpt2 mpt falcon llama-hf baichuan
+            "lm_head",                  # gpt2 mpt falcon llama-hf baichuan
-            "output",    # llama-pth
+            "output",                   # llama-pth bloom
            "word_embeddings_for_head", # persimmon
        ),
        # Output norm
        MODEL_TENSOR.OUTPUT_NORM: (
-            "gpt_neox.final_layer_norm", # gptneox
+            "gpt_neox.final_layer_norm",              # gptneox
-            "transformer.ln_f",          # gpt2 falcon
+            "transformer.ln_f",                       # gpt2 gpt-j falcon
-            "model.norm",                # llama-hf baichuan
+            "model.norm",                             # llama-hf baichuan
-            "norm",                      # llama-pth
+            "norm",                                   # llama-pth
            "embeddings.LayerNorm",                   # bert
            "transformer.norm_f",                     # mpt
            "ln_f",                                   # refact bloom
            "language_model.encoder.final_layernorm", # persimmon
        ),
        # Rope frequencies
@ -244,13 +379,16 @@ class TensorNameMap:
    block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
        # Attention norm
        MODEL_TENSOR.ATTN_NORM: (
-            "gpt_neox.layers.{bid}.input_layernorm", # gptneox
+            "gpt_neox.layers.{bid}.input_layernorm",               # gptneox
-            "transformer.h.{bid}.ln_1",              # gpt2
+            "transformer.h.{bid}.ln_1",                            # gpt2 gpt-j refact
-            "transformer.blocks.{bid}.norm_1",       # mpt
+            "transformer.blocks.{bid}.norm_1",                     # mpt
-            "transformer.h.{bid}.input_layernorm",   # falcon7b
+            "transformer.h.{bid}.input_layernorm",                 # falcon7b
-            "transformer.h.{bid}.ln_mlp",            # falcon40b
+            "h.{bid}.input_layernorm",                             # bloom
-            "model.layers.{bid}.input_layernorm",    # llama-hf
+            "transformer.h.{bid}.ln_mlp",                          # falcon40b
-            "layers.{bid}.attention_norm",           # llama-pth
+            "model.layers.{bid}.input_layernorm",                  # llama-hf
            "layers.{bid}.attention_norm",                         # llama-pth
            "encoder.layer.{bid}.attention.output.LayerNorm",      # bert
            "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
        ),
        # Attention norm 2
@ -260,38 +398,50 @@ class TensorNameMap:
        # Attention query-key-value
        MODEL_TENSOR.ATTN_QKV: (
-            "gpt_neox.layers.{bid}.attention.query_key_value",    # gptneox
+            "gpt_neox.layers.{bid}.attention.query_key_value",                    # gptneox
-            "transformer.h.{bid}.attn.c_attn",                    # gpt2
+            "transformer.h.{bid}.attn.c_attn",                                    # gpt2
-            "transformer.blocks.{bid}.attn.Wqkv",                 # mpt
+            "transformer.blocks.{bid}.attn.Wqkv",                                 # mpt
-            "transformer.h.{bid}.self_attention.query_key_value", # falcon
+            "transformer.h.{bid}.self_attention.query_key_value",                 # falcon
            "h.{bid}.self_attention.query_key_value",                             # bloom
            "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
        ),
        # Attention query
        MODEL_TENSOR.ATTN_Q: (
-            "model.layers.{bid}.self_attn.q_proj", # llama-hf
+            "model.layers.{bid}.self_attn.q_proj",       # llama-hf
-            "layers.{bid}.attention.wq",           # llama-pth
+            "layers.{bid}.attention.wq",                 # llama-pth
            "encoder.layer.{bid}.attention.self.query",  # bert
            "transformer.h.{bid}.attn.q_proj",           # gpt-j
        ),
        # Attention key
        MODEL_TENSOR.ATTN_K: (
-            "model.layers.{bid}.self_attn.k_proj", # llama-hf
+            "model.layers.{bid}.self_attn.k_proj",     # llama-hf
-            "layers.{bid}.attention.wk",           # llama-pth
+            "layers.{bid}.attention.wk",               # llama-pth
            "encoder.layer.{bid}.attention.self.key",  # bert
            "transformer.h.{bid}.attn.k_proj",         # gpt-j
        ),
        # Attention value
        MODEL_TENSOR.ATTN_V: (
-            "model.layers.{bid}.self_attn.v_proj", # llama-hf
+            "model.layers.{bid}.self_attn.v_proj",       # llama-hf
-            "layers.{bid}.attention.wv",           # llama-pth
+            "layers.{bid}.attention.wv",                 # llama-pth
            "encoder.layer.{bid}.attention.self.value",  # bert
            "transformer.h.{bid}.attn.v_proj",           # gpt-j
        ),
        # Attention output
        MODEL_TENSOR.ATTN_OUT: (
-            "gpt_neox.layers.{bid}.attention.dense",    # gptneox
+            "gpt_neox.layers.{bid}.attention.dense",                   # gptneox
-            "transformer.h.{bid}.attn.c_proj",          # gpt2
+            "transformer.h.{bid}.attn.c_proj",                         # gpt2 refact
-            "transformer.blocks.{bid}.attn.out_proj",   # mpt
+            "transformer.blocks.{bid}.attn.out_proj",                  # mpt
-            "transformer.h.{bid}.self_attention.dense", # falcon
+            "transformer.h.{bid}.self_attention.dense",                # falcon
-            "model.layers.{bid}.self_attn.o_proj",      # llama-hf
+            "h.{bid}.self_attention.dense",                            # bloom
-            "layers.{bid}.attention.wo",                # llama-pth
+            "model.layers.{bid}.self_attn.o_proj",                     # llama-hf
            "layers.{bid}.attention.wo",                               # llama-pth
            "encoder.layer.{bid}.attention.output.dense",              # bert
            "transformer.h.{bid}.attn.out_proj",                       # gpt-j
            "language_model.encoder.layers.{bid}.self_attention.dense" # persimmon
        ),
        # Rotary embeddings
@ -302,64 +452,83 @@ class TensorNameMap:
        # Feed-forward norm
        MODEL_TENSOR.FFN_NORM: (
-            "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
+            "gpt_neox.layers.{bid}.post_attention_layernorm",               # gptneox
-            "transformer.h.{bid}.ln_2",                       # gpt2
+            "transformer.h.{bid}.ln_2",                                     # gpt2 refact
-            "transformer.blocks.{bid}.norm_2",                # mpt
+            "h.{bid}.post_attention_layernorm",                             # bloom
-            "model.layers.{bid}.post_attention_layernorm",    # llama-hf
+            "transformer.blocks.{bid}.norm_2",                              # mpt
-            "layers.{bid}.ffn_norm",                          # llama-pth
+            "model.layers.{bid}.post_attention_layernorm",                  # llama-hf
            "layers.{bid}.ffn_norm",                                        # llama-pth
            "encoder.layer.{bid}.output.LayerNorm",                         # bert
            "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
        ),
        # Feed-forward up
        MODEL_TENSOR.FFN_UP: (
-            "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
+            "gpt_neox.layers.{bid}.mlp.dense_h_to_4h",               # gptneox
-            "transformer.h.{bid}.mlp.c_fc",            # gpt2
+            "transformer.h.{bid}.mlp.c_fc",                          # gpt2
-            "transformer.blocks.{bid}.ffn.up_proj",    # mpt
+            "transformer.blocks.{bid}.ffn.up_proj",                  # mpt
-            "transformer.h.{bid}.mlp.dense_h_to_4h",   # falcon
+            "transformer.h.{bid}.mlp.dense_h_to_4h",                 # falcon
-            "model.layers.{bid}.mlp.up_proj",          # llama-hf
+            "h.{bid}.mlp.dense_h_to_4h",                             # bloom
-            "layers.{bid}.feed_forward.w3",            # llama-pth
+            "model.layers.{bid}.mlp.up_proj",                        # llama-hf refact
            "layers.{bid}.feed_forward.w3",                          # llama-pth
            "encoder.layer.{bid}.intermediate.dense",                # bert
            "transformer.h.{bid}.mlp.fc_in",                         # gpt-j
            "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
        ),
        # Feed-forward gate
        MODEL_TENSOR.FFN_GATE: (
-            "model.layers.{bid}.mlp.gate_proj", # llama-hf
+            "model.layers.{bid}.mlp.gate_proj", # llama-hf refact
            "layers.{bid}.feed_forward.w1",     # llama-pth
        ),
        # Feed-forward down
        MODEL_TENSOR.FFN_DOWN: (
-            "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
+            "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",               # gptneox
-            "transformer.h.{bid}.mlp.c_proj",          # gpt2
+            "transformer.h.{bid}.mlp.c_proj",                        # gpt2 refact
-            "transformer.blocks.{bid}.ffn.down_proj",  # mpt
+            "transformer.blocks.{bid}.ffn.down_proj",                # mpt
-            "transformer.h.{bid}.mlp.dense_4h_to_h",   # falcon
+            "transformer.h.{bid}.mlp.dense_4h_to_h",                 # falcon
-            "model.layers.{bid}.mlp.down_proj",        # llama-hf
+            "h.{bid}.mlp.dense_4h_to_h",                             # bloom
-            "layers.{bid}.feed_forward.w2",            # llama-pth
+            "model.layers.{bid}.mlp.down_proj",                      # llama-hf
            "layers.{bid}.feed_forward.w2",                          # llama-pth
            "encoder.layer.{bid}.output.dense",                      # bert
            "transformer.h.{bid}.mlp.fc_out",                        # gpt-j
            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
        ),
        MODEL_TENSOR.ATTN_Q_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
        ),
        MODEL_TENSOR.ATTN_K_NORM: (
            "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
        ),
        MODEL_TENSOR.ROPE_FREQS: (
            "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
        )
    }
    mapping: dict[str, tuple[MODEL_TENSOR, str]]
    tensor_names: dict[MODEL_TENSOR, str]
    def __init__(self, arch: MODEL_ARCH, n_blocks: int):
-        mapping = self.mapping = {}
+        self.mapping = {}
        tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch]
        for tensor, keys in self.mappings_cfg.items():
-            tensor_name = tensor_names.get(tensor)
+            if tensor not in MODEL_TENSORS[arch]:
            if tensor_name is None:
                continue
-            mapping[tensor_name] = (tensor, tensor_name)
+            tensor_name = TENSOR_NAMES[tensor]
            self.mapping[tensor_name] = (tensor, tensor_name)
            for key in keys:
-                mapping[key] = (tensor, tensor_name)
+                self.mapping[key] = (tensor, tensor_name)
        for bid in range(n_blocks):
            for tensor, keys in self.block_mappings_cfg.items():
-                tensor_name = tensor_names.get(tensor)
+                if tensor not in MODEL_TENSORS[arch]:
                if tensor_name is None:
                    continue
-                tensor_name = tensor_name.format(bid = bid)
+                tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
-                mapping[tensor_name] = (tensor, tensor_name)
+                self.mapping[tensor_name] = (tensor, tensor_name)
                for key in keys:
                    key = key.format(bid = bid)
-                    mapping[key] = (tensor, tensor_name)
+                    self.mapping[key] = (tensor, tensor_name)
    def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
        result = self.mapping.get(key)
@ -800,22 +969,25 @@ class SpecialVocab:
    special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
    special_token_ids: dict[str, int] = {}
-    def __init__(self, path: Path, load_merges: bool = False, special_token_types: tuple[str, ...] | None = None):
+    def __init__(
        self, path: str | os.PathLike[str], load_merges: bool = False,
        special_token_types: tuple[str, ...] | None = None,
    ):
        self.special_token_ids = {}
        self.load_merges = load_merges
        if special_token_types is not None:
            self.special_token_types = special_token_types
-        self.load(path)
+        self._load(Path(path))
-    def load(self, path: Path):
+    def _load(self, path: Path) -> None:
-        if not self.try_load_from_tokenizer_json(path):
+        if not self._try_load_from_tokenizer_json(path):
-            self.try_load_from_config_json(path)
+            self._try_load_from_config_json(path)
-    def try_load_from_tokenizer_json(self, path: Path) -> bool:
+    def _try_load_from_tokenizer_json(self, path: Path) -> bool:
        tokenizer_file = path / 'tokenizer.json'
        if not tokenizer_file.is_file():
            return False
-        with open(tokenizer_file, 'r', encoding = 'utf-8') as f:
+        with open(tokenizer_file, encoding = 'utf-8') as f:
            tokenizer = json.load(f)
        if self.load_merges:
            merges = tokenizer.get('model', {}).get('merges')
@ -825,7 +997,7 @@ class SpecialVocab:
        added_tokens = tokenizer.get('added_tokens')
        if added_tokens is None or not tokenizer_config_file.is_file():
            return True
-        with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f:
+        with open(tokenizer_config_file, encoding = 'utf-8') as f:
            tokenizer_config = json.load(f)
        for typ in self.special_token_types:
            entry = tokenizer_config.get(f'{typ}_token')
@ -844,11 +1016,11 @@ class SpecialVocab:
                break
        return True
-    def try_load_from_config_json(self, path: Path) -> bool:
+    def _try_load_from_config_json(self, path: Path) -> bool:
        config_file = path / 'config.json'
        if not config_file.is_file():
            return False
-        with open(config_file, 'r', encoding = 'utf-8') as f:
+        with open(config_file, encoding = 'utf-8') as f:
            config = json.load(f)
        for typ in self.special_token_types:
            maybe_token_id = config.get(f'{typ}_token_id')
@ -856,7 +1028,7 @@ class SpecialVocab:
                self.special_token_ids[typ] = maybe_token_id
        return True
-    def add_to_gguf(self, gw: GGUFWriter):
+    def add_to_gguf(self, gw: GGUFWriter) -> None:
        if len(self.merges) > 0:
            print(f'gguf: Adding {len(self.merges)} merge(s).')
            gw.add_token_merges(self.merges)
@ -868,8 +1040,8 @@ class SpecialVocab:
            print(f'gguf: Setting special token type {typ} to {tokid}')
            handler(tokid)
-    def __repr__(self):
+    def __repr__(self) -> str:
-        return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids if self.special_token_ids else "unset"}>'
+        return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids or "unset"}>'
 # Example usage:
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.3.3"
+version = "0.4.4"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
--- a/k_quants.c
+++ b/k_quants.c
@ -54,6 +54,10 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #endif
 #endif
 #ifdef __riscv_v_intrinsic
 #include <riscv_vector.h>
 #endif
 #undef MIN
 #undef MAX
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
@ -65,7 +69,6 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 // 2-6 bit quantization in super-blocks
 //
 //
 // ===================== Helper functions
 //
@ -344,7 +347,6 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
    const float q4scale = 15.f;
    for (int i = 0; i < nb; i++) {
        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min = 0;
        for (int j = 0; j < QK_K/16; ++j) {
@ -1582,6 +1584,90 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc);
 #elif defined __riscv_v_intrinsic
    float sumf = 0;
    uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    for (int i = 0; i < nb; ++i) {
        const uint8_t * q2 = x[i].qs;
        const  int8_t * q8 = y[i].qs;
        const uint8_t * sc = x[i].scales;
        const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d);
        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
        size_t vl = 16;
        vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl);
        vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl);
        vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl);
        vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl);
        vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl);
        vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
        vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl);
        vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
        sumf  += dmin * __riscv_vmv_x_s_i32m1_i32(vsums);
        vl = 32;
        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
        vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl);
        uint8_t is=0;
        int isum=0;
        for (int j = 0; j < QK_K/128; ++j) {
            // load Q2
            vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl);
            vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl);
            vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl);
            vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl);
            vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl);
            // duplicate scale elements for product
            vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl);
            vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl);
            vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl);
            vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl);
            vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl));
            vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl));
            vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl));
            vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl));
            // load Q8
            vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
            vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
            vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8+64, vl);
            vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
            vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
            vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
            vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
            vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
            vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
            vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
            isum += __riscv_vmv_x_s_i32m1_i32(isum1);
            q2+=32;  q8+=128;  is=8;
        }
        sumf += dall * isum;
    }
    *s = sumf;
 #else
    float sumf = 0;
@ -1807,6 +1893,64 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc) + summs;
 #elif defined __riscv_v_intrinsic
    uint32_t aux32[2];
    const uint8_t * scales = (const uint8_t *)aux32;
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const float d = y[i].d * (float)x[i].d;
        const float dmin = -y[i].d * (float)x[i].dmin;
        const uint8_t * restrict q2 = x[i].qs;
        const int8_t  * restrict q8 = y[i].qs;
        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
        aux32[0] = sc[0] & 0x0f0f0f0f;
        aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
        sumf += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]);
        int isum1 = 0;
        int isum2 = 0;
        size_t vl = 16;
        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
        // load Q2
        vuint8mf2_t q2_x = __riscv_vle8_v_u8mf2(q2, vl);
        vint8mf2_t q2_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q2_x, 0x03, vl));
        vint8mf2_t q2_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x2, vl), 0x03 , vl));
        vint8mf2_t q2_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x4, vl), 0x03 , vl));
        vint8mf2_t q2_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x6, vl), 0x03 , vl));
        // load Q8, and take product with Q2
        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q2_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q2_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q2_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q2_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
        vint16m1_t vs_0 = __riscv_vredsum_vs_i16m1_i16m1(p0, vzero, vl);
        vint16m1_t vs_1 = __riscv_vredsum_vs_i16m1_i16m1(p1, vzero, vl);
        vint16m1_t vs_2 = __riscv_vredsum_vs_i16m1_i16m1(p2, vzero, vl);
        vint16m1_t vs_3 = __riscv_vredsum_vs_i16m1_i16m1(p3, vzero, vl);
        isum1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[0];
        isum2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[1];
        isum1 += __riscv_vmv_x_s_i16m1_i16(vs_2) * scales[2];
        isum2 += __riscv_vmv_x_s_i16m1_i16(vs_3) * scales[3];
        sumf += d * (isum1 + isum2);
    }
    *s = sumf;
 #else
    float sumf = 0;
@ -2220,6 +2364,106 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc);
 #elif defined __riscv_v_intrinsic
    uint32_t aux[3];
    uint32_t utmp[4];
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * restrict q3 = x[i].qs;
        const uint8_t * restrict qh = x[i].hmask;
        const  int8_t * restrict q8 = y[i].qs;
        memcpy(aux, x[i].scales, 12);
        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
        int8_t * scale = (int8_t *)utmp;
        for (int j = 0; j < 16; ++j) scale[j] -= 32;
        size_t vl = 32;
        uint8_t m =  1;
        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
        vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
        int sum_t = 0;
        for (int j = 0; j < QK_K; j += 128) {
            vl = 32;
            // load Q3
            vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
            vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
            vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
            vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
            vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
            // compute mask for subtraction
            vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
            vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
            m <<= 1;
            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
            vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
            m <<= 1;
            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
            vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
            m <<= 1;
            vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
            vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
            m <<= 1;
            // load Q8 and take product with Q3
            vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
            vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
            vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
            vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
            vl = 16;
            // retreive lane to multiply with scale
            vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
            vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
            vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
            vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl);
            vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl);
            vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
            vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
            vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
            sum_t +=  __riscv_vmv_x_s_i32m1_i32(isum3);
            q3 += 32;    q8 += 128;   scale += 8;
        }
        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
        sumf += d*sum_t;
    }
    *s = sumf;
 #else
    // scalar version
    // This function is written like this so the compiler can manage to vectorize most of it
@ -2523,6 +2767,79 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc);
 #elif defined __riscv_v_intrinsic
    uint16_t aux16[2];
    int8_t * scales = (int8_t *)aux16;
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * restrict q3 = x[i].qs;
        const int8_t  * restrict q8 = y[i].qs;
        const uint16_t a = *(const uint16_t *)x[i].scales;
        aux16[0] = a & 0x0f0f;
        aux16[1] = (a >> 4) & 0x0f0f;
        for (int j = 0; j < 4; ++j) scales[j] -= 8;
        int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
        const float d = y[i].d * (float)x[i].d;
        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
        // load qh
        vuint8mf4_t qh_x1   = __riscv_vle8_v_u8mf4(x[i].hmask, 8);
        vuint8mf2_t qh_x2   = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
        size_t vl = 16;
        // extend and combine both qh_x1 and qh_x2
        vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
        vuint8mf2_t qh_0 = __riscv_vand_vx_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
        vuint8mf2_t qh_1 = __riscv_vand_vx_u8mf2(qh_x, 0x4, vl);
        vuint8mf2_t qh_2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
        vuint8mf2_t qh_3 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), 0x4, vl);
        // load Q3
        vuint8mf2_t q3_x  = __riscv_vle8_v_u8mf2(q3, vl);
        vuint8mf2_t q3h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q3_x, 0x3, vl), qh_0, vl);
        vuint8mf2_t q3h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 2, vl), 0x3, vl), qh_1, vl);
        vuint8mf2_t q3h_2 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 4, vl), 0x3, vl), qh_2, vl);
        vuint8mf2_t q3h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 0x6, vl), qh_3, vl);
        vint8mf2_t q3_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_0);
        vint8mf2_t q3_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_1);
        vint8mf2_t q3_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_2);
        vint8mf2_t q3_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_3);
        // load Q8 and take product with Q3
        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q3_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q3_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q3_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q3_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
        isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scales[0];
        isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scales[2];
        isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scales[1];
        isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scales[3];
        sumf += d * isum;
    }
    *s = sumf;
 #else
    int8_t  aux8[QK_K];
@ -2823,6 +3140,78 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
 #elif defined __riscv_v_intrinsic
    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins   = (const uint8_t*)&utmp[2];
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        size_t vl = 8;
        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
        const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin);
        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
        vint16mf2_t q8sums   = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;
        vuint8mf4_t mins8  = __riscv_vle8_v_u8mf4(mins, vl);
        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
        vint32m1_t  prod   = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
        const uint8_t * restrict q4 = x[i].qs;
        const int8_t  * restrict q8 = y[i].qs;
        vl = 32;
        int32_t sum_1 = 0;
        int32_t sum_2 = 0;
        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
        for (int j = 0; j < QK_K/64; ++j) {
            // load Q4
            vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
            // load Q8 and multiply it with lower Q4 nibble
            vint8m1_t  q8_0 = __riscv_vle8_v_i8m1(q8, vl);
            vint8m1_t  q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
            vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
            vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
            sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
            // load Q8 and multiply it with upper Q4 nibble
            vint8m1_t  q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
            vint8m1_t  q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
            vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
            vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
            sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
            q4 += 32;    q8 += 64;
        }
        sumf += d*(sum_1 + sum_2);
    }
    *s = sumf;
 #else
@ -3064,6 +3453,50 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc) - summs;
 #elif defined __riscv_v_intrinsic
    uint16_t s16[2];
    const uint8_t * restrict scales = (const uint8_t *)s16;
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const uint8_t * restrict q4 = x[i].qs;
        const  int8_t * restrict q8 = y[i].qs;
        const uint16_t * restrict b = (const uint16_t *)x[i].scales;
        s16[0] = b[0] & 0x0f0f;
        s16[1] = (b[0] >> 4) & 0x0f0f;
        sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]);
        size_t vl = 32;
        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
        // load Q4
        vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
        // load Q8 and multiply it with lower Q4 nibble
        vint8m1_t  q4_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
        vint16m2_t va_0 = __riscv_vwmul_vv_i16m2(q4_a, __riscv_vle8_v_i8m1(q8, vl), vl);
        vint16m1_t aux1 = __riscv_vredsum_vs_i16m2_i16m1(va_0, vzero, vl);
        sumf += d*scales[0]*__riscv_vmv_x_s_i16m1_i16(aux1);
        // load Q8 and multiply it with upper Q4 nibble
        vint8m1_t  q4_s = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
        vint16m2_t va_1 = __riscv_vwmul_vv_i16m2(q4_s, __riscv_vle8_v_i8m1(q8+32, vl), vl);
        vint16m1_t aux2 = __riscv_vredsum_vs_i16m2_i16m1(va_1, vzero, vl);
        sumf += d*scales[1]*__riscv_vmv_x_s_i16m1_i16(aux2);
    }
    *s = sumf;
 #else
    uint8_t aux8[QK_K];
@ -3394,6 +3827,93 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc) + summs;
 #elif defined __riscv_v_intrinsic
    const uint8_t * scales = (const uint8_t*)&utmp[0];
    const uint8_t * mins   = (const uint8_t*)&utmp[2];
    float sumf = 0;
    float sums = 0.0;
    size_t vl;
    for (int i = 0; i < nb; ++i) {
        vl = 8;
        const uint8_t * restrict q5 = x[i].qs;
        const uint8_t * restrict hm = x[i].qh;
        const  int8_t * restrict q8 = y[i].qs;
        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
        const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d;
        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
        vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;
        vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
        vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
        vl = 32;
        int32_t aux32 = 0;
        int is = 0;
        uint8_t m = 1;
        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
        vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl);
        for (int j = 0; j < QK_K/64; ++j) {
            // load Q5 and Q8
            vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl);
            vint8m1_t  q8_y1 = __riscv_vle8_v_i8m1(q8, vl);
            vint8m1_t  q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl);
            // compute mask for addition
            vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
            vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
            m <<= 1;
            vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
            vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
            m <<= 1;
            vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
            vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl);
            vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl);
            vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl);
            vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl);
            vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl);
            aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2);
            q5 += 32;    q8 += 64;
        }
        vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1);
        sums += __riscv_vfmv_f_s_f32m1_f32(vaux);
    }
    *s = sumf+sums;
 #else
    const uint8_t * scales = (const uint8_t*)&utmp[0];
@ -3639,6 +4159,76 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc);
 #elif defined __riscv_v_intrinsic
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const float d = y[i].d * (float)x[i].d;
        const int8_t * sc = x[i].scales;
        const uint8_t * restrict q5 = x[i].qs;
        const uint8_t * restrict qh = x[i].qh;
        const int8_t  * restrict q8 = y[i].qs;
        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
        // load qh
        vuint8mf4_t qh_x1   = __riscv_vle8_v_u8mf4(qh, 8);
        vuint8mf2_t qh_x2   = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
        size_t vl = 16;
        // combine both qh_1 and qh_2
        vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
        vuint8mf2_t qh_h0 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
        vuint8mf2_t qh_h1 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), vl), 16, vl);
        vuint8mf2_t qh_h2 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(qh_x, vl), 16, vl);
        vuint8mf2_t qh_h3 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
        vint8mf2_t qh_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h0);
        vint8mf2_t qh_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h1);
        vint8mf2_t qh_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h2);
        vint8mf2_t qh_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h3);
        // load q5
        vuint8mf2_t q5_x1  = __riscv_vle8_v_u8mf2(q5, vl);
        vuint8mf2_t q5_x2  = __riscv_vle8_v_u8mf2(q5+16, vl);
        vint8mf2_t q5s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x1, 0xF, vl));
        vint8mf2_t q5s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x2, 0xF, vl));
        vint8mf2_t q5s_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x1, 0x4, vl));
        vint8mf2_t q5s_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x2, 0x4, vl));
        vint8mf2_t q5_0 = __riscv_vsub_vv_i8mf2(q5s_0, qh_0, vl);
        vint8mf2_t q5_1 = __riscv_vsub_vv_i8mf2(q5s_1, qh_1, vl);
        vint8mf2_t q5_2 = __riscv_vsub_vv_i8mf2(q5s_2, qh_2, vl);
        vint8mf2_t q5_3 = __riscv_vsub_vv_i8mf2(q5s_3, qh_3, vl);
        // load Q8 and multiply it with Q5
        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q5_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q5_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q5_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q5_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
        int32_t sumi1 = sc[0] * __riscv_vmv_x_s_i32m1_i32(vs_0);
        int32_t sumi2 = sc[1] * __riscv_vmv_x_s_i32m1_i32(vs_1);
        int32_t sumi3 = sc[2] * __riscv_vmv_x_s_i32m1_i32(vs_2);
        int32_t sumi4 = sc[3] * __riscv_vmv_x_s_i32m1_i32(vs_3);
        sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
    }
    *s = sumf;
 #else
    int8_t aux8[QK_K];
@ -4023,6 +4613,91 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc);
 #elif defined __riscv_v_intrinsic
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
        const uint8_t * restrict q6 = x[i].ql;
        const uint8_t * restrict qh = x[i].qh;
        const  int8_t * restrict q8 = y[i].qs;
        const int8_t * restrict scale = x[i].scales;
        size_t vl;
        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
        int sum_t = 0;
        int is = 0;
        for (int j = 0; j < QK_K/128; ++j) {
            vl = 32;
            // load qh
            vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
            // load Q6
            vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
            vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
            vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
            vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
            vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
            vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
            vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
            vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
            vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
            vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
            vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
            vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
            vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
            vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
            vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
            vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
            vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
            vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
            // load Q8 and take product
            vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
            vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
            vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
            vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
            vl = 16;
            vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
            vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
            vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
            vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl);
            vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl);
            vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
            vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
            vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
            sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
            q6 += 64;   qh += 32;   q8 += 128;   is=8;
        }
        sumf += d * sum_t;
    }
    *s = sumf;
 #else
    int8_t  aux8[QK_K];
@ -4276,6 +4951,73 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
    *s = hsum_float_8(acc);
 #elif defined __riscv_v_intrinsic
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const float d_all = (float)x[i].d;
        const uint8_t * restrict q6 = x[i].ql;
        const uint8_t * restrict qh = x[i].qh;
        const int8_t  * restrict q8 = y[i].qs;
        const int8_t * restrict scale = x[i].scales;
        int32_t isum = 0;
        size_t vl = 16;
        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
        // load Q6
        vuint8mf2_t q6_0 = __riscv_vle8_v_u8mf2(q6, vl);
        vuint8mf2_t q6_1 = __riscv_vle8_v_u8mf2(q6+16, vl);
        // load qh
        vuint8mf2_t qh_x = __riscv_vle8_v_u8mf2(qh, vl);
        vuint8mf2_t qh0 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
        vuint8mf2_t qh1 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
        vuint8mf2_t qh2 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
        vuint8mf2_t qh3 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
        vuint8mf2_t q6h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_0, 0xF, vl), qh0, vl);
        vuint8mf2_t q6h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_1, 0xF, vl), qh1, vl);
        vuint8mf2_t q6h_2 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_0, 0x4, vl), qh2, vl);
        vuint8mf2_t q6h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_1, 0x4, vl), qh3, vl);
        vint8mf2_t q6v_0 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_0), 32, vl);
        vint8mf2_t q6v_1 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_1), 32, vl);
        vint8mf2_t q6v_2 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_2), 32, vl);
        vint8mf2_t q6v_3 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_3), 32, vl);
        // load Q8 and take product
        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q6v_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q6v_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q6v_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q6v_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
        isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scale[0];
        isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scale[1];
        isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scale[2];
        isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scale[3];
        sumf += isum * d_all * y[i].d;
    }
    *s = sumf;
 #else
    int8_t  aux8[QK_K];
--- a/k_quants.h
+++ b/k_quants.h
@ -29,7 +29,7 @@
 // 2-bit quantization
 // weight is represented as x = a * q + b
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 2.5625 bits per weight
 typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
@ -41,7 +41,7 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w
 // 3-bit quantization
 // weight is represented as x = a * q
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 3.4375 bits per weight
 #ifdef GGML_QKK_64
 typedef struct {
@ -62,7 +62,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 +
 #endif
 // 4-bit quantization
-// 16 blocks of 32 elements each
+// 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 #ifdef GGML_QKK_64
@ -83,7 +83,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
 #endif
 // 5-bit quantization
-// 16 blocks of 32 elements each
+// 8 blocks of 32 elements each
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 #ifdef GGML_QKK_64
@ -107,7 +107,7 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/
 // 6-bit quantization
 // weight is represented as x = a * q
-// 16 blocks of 16 elemenets each
+// 16 blocks of 16 elements each
 // Effectively 6.5625 bits per weight
 typedef struct {
    uint8_t ql[QK_K/2];      // quants, lower 4 bits
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@ -42,7 +42,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 1
+#define LLAMA_SESSION_VERSION 2
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@ -167,18 +167,18 @@ extern "C" {
    struct llama_context_params {
        uint32_t seed;            // RNG seed, -1 for random
-        uint32_t n_ctx;           // text context
+        uint32_t n_ctx;           // text context, 0 = from model
-        uint32_t n_batch;         // prompt processing batch size
+        uint32_t n_batch;         // prompt processing maximum batch size
        uint32_t n_threads;       // number of threads to use for generation
        uint32_t n_threads_batch; // number of threads to use for batch processing
        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float rope_freq_base;  // RoPE base frequency
+        float rope_freq_base;  // RoPE base frequency, 0 = from model
-        float rope_freq_scale; // RoPE frequency scaling factor
+        float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
-        bool f16_kv;     // use fp16 for KV cache
+        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool embedding;  // embedding mode only
    };
@ -282,6 +282,9 @@ extern "C" {
    LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
    LLAMA_API int llama_n_embd     (const struct llama_model * model);
    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
    // Get a string describing the model type
    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
@ -330,12 +333,16 @@ extern "C" {
            "avoid using this, it will be removed in the future, instead - count the tokens in user code");
    // Remove all tokens data of cells in [c0, c1)
    // c0 < 0 : [0,  c1]
    // c1 < 0 : [c0, inf)
    LLAMA_API void llama_kv_cache_tokens_rm(
            struct llama_context * ctx,
                         int32_t   c0,
                         int32_t   c1);
    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
    LLAMA_API void llama_kv_cache_seq_rm(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
@ -344,6 +351,8 @@ extern "C" {
    // Copy all tokens that belong to the specified sequence to another sequence
    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
    LLAMA_API void llama_kv_cache_seq_cp(
            struct llama_context * ctx,
                    llama_seq_id   seq_id_src,
@ -358,6 +367,8 @@ extern "C" {
    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
    // If the KV cache is RoPEd, the KV data is updated accordingly
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
    LLAMA_API void llama_kv_cache_seq_shift(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
@ -490,6 +501,11 @@ extern "C" {
    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence
    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx);  // end-of-sentence
    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx);  // next-line
    // codellama infill tokens
    LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
    LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
    LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
    LLAMA_API llama_token llama_token_eot   (const struct llama_context * ctx); // End of infill middle
    //
    // Tokenization
--- a/models/ggml-vocab-aquila.gguf
+++ b/models/ggml-vocab-aquila.gguf
--- a/models/ggml-vocab-falcon.gguf
+++ b/models/ggml-vocab-falcon.gguf
--- a/prompts/LLM-questions.txt
+++ b/prompts/LLM-questions.txt
@ -0,0 +1,49 @@
 In the context of LLMs, what is "Attention"?
 In the context of LLMs, what is a completion?
 In the context of LLMs, what is a prompt?
 In the context of LLMs, what is GELU?
 In the context of LLMs, what is RELU?
 In the context of LLMs, what is softmax?
 In the context of LLMs, what is decoding?
 In the context of LLMs, what is encoding?
 In the context of LLMs, what is tokenizing?
 In the context of LLMs, what is an embedding?
 In the context of LLMs, what is quantization?
 In the context of LLMs, what is a tensor?
 In the context of LLMs, what is a sparse tensor?
 In the context of LLMs, what is a vector?
 In the context of LLMs, how is attention implemented?
 In the context of LLMs, why is attention all you need?
 In the context of LLMs, what is "RoPe" and what is it used for?
 In the context of LLMs, what is "LoRA" and what is it used for?
 In the context of LLMs, what are weights?
 In the context of LLMs, what are biases?
 In the context of LLMs, what are checkpoints?
 In the context of LLMs, what is "perplexity"?
 In the context of LLMs, what are models?
 In the context of machine-learning, what is "catastrophic forgetting"?
 In the context of machine-learning, what is "elastic weight consolidation (EWC)"?
 In the context of neural nets, what is a hidden layer?
 In the context of neural nets, what is a convolution?
 In the context of neural nets, what is dropout?
 In the context of neural nets, what is cross-entropy?
 In the context of neural nets, what is over-fitting?
 In the context of neural nets, what is under-fitting?
 What is the difference between an interpreted computer language and a compiled computer language?
 In the context of software development, what is a debugger?
 When processing using a GPU, what is off-loading?
 When processing using a GPU, what is a batch?
 When processing using a GPU, what is a block?
 When processing using a GPU, what is the difference between a batch and a block?
 When processing using a GPU, what is a scratch tensor?
 When processing using a GPU, what is a layer?
 When processing using a GPU, what is a cache?
 When processing using a GPU, what is unified memory?
 When processing using a GPU, what is VRAM?
 When processing using a GPU, what is a kernel?
 When processing using a GPU, what is "metal"?
 In the context of LLMs, what are "Zero-Shot", "One-Shot" and "Few-Shot" learning models?
 In the context of LLMs, what is the "Transformer-model" architecture?
 In the context of LLMs, what is "Multi-Head Attention"?
 In the context of LLMs, what is "Self-Attention"?
 In the context of transformer-model architectures, how do attention mechanisms use masks?
--- a/prompts/mnemonics.txt
+++ b/prompts/mnemonics.txt
@ -0,0 +1,93 @@
 For each kanji character, write a Markdown‐formatted mnemonic that uses its keyword and the keyword of all its components.
 Kanji: 欠 (lack of)
 Components: 𠂊 (hook claw), 人 (person)
 Mnemonic: This **person** is a pirate. He lost his hand to a crocodile many years ago. Nowadays, the ***lack of*** a hand does not bother him too much. In fact, the **hook claw** that replaces it is the mark of a true pirate, so he is quite proud of it!
 Kanji: 類 (kind (of something))
 Components: 米 (rice), 大 (large), 頁 (page)
 Mnemonic: The waiter at a Chinese restaurant hands you a **large** menu. Each **page** has all ***kinds*** of **rice** on offer!
 Kanji: 燃 (burn)
 Components: 火 (fire), 然 (sort of thing)
 Mnemonic: ***Burning*** things up with **fire** is just my **sort of thing**. (Spoken like a true pyromaniac.)
 Kanji: 頂 (top of)
 Components: 丁 (street), 頁 (page)
 Mnemonic: To be at the ***top of*** your game, you need both practical knowledge (**street** smarts) and theoretical knowledge (having read many **pages**).
 Kanji: 険 (risky and steep)
 Components: 阝 (small village), 㑒 (consensus)
 Mnemonic: Everyone agrees (there is **consensus**) that the path to the **small village** is ***risky and steep***.
 Kanji: 困 (distressed)
 Components: 囗 (closed box), 木 (tree)
 Mnemonic: You would feel ***distressed*** too if you were a **tree** trapped in a **closed box**! I have no place to grow!
 Kanji: 頭 (head)
 Components: 豆 (bean), 頁 (page)
 Mnemonic: What do you have in that ***head*** of yours? A **bean** for a brain? Go read more **pages** and become more knowledgeable about the world!
 Kanji: 確 (certain)
 Components: 石 (stone), 冖 (roof without a chimney), 隹 (old bird)
 Mnemonic: An **old bird** has made a nest on your **roof**. What do you do? You call Misaka from a <cite>A ***Certain*** Scientific Railgun</cite> to get rid of it, of course! But she doesn’t really want to vaporize the poor thing, so she just throws a **stone** to scare it away. (What was the point of calling her, then‽)
 Kanji: 魚 (fish)
 Components: 𠂊 (hook claw), 田 (rice field), 灬 (fire sparks)
 Mnemonic: Catch ***fish*** with a **hook**, collect rice from the **rice field**, cook them with **fire**… And my meal is ready!
 Kanji: 警 (to police (something))
 Components: 敬 (respect), 言 (say)
 Mnemonic: ***To police something*** is to make people **respect** what the law **says**.
 Kanji: 筆 (writing brush)
 Components: 竹 (bamboo), 聿 (brush)
 Mnemonic: A traditional ***writing brush*** is a **brush** made of **bamboo**.
 Kanji: 獄 (prison)
 Components: 犭 (animal), 言 (say), 犬 (dog)
 Mnemonic: In ***prison***, like in the **animal** kingdom, only the toughest survive. You have to watch what you **say**. It’s a **dog**‐eat‐dog world.
 Kanji: 新 (new)
 Components: 立 (standing up), 木 (tree), 斤 (axe)
 Mnemonic: In order for a ***new*** construction to be made, an empty lot is needed. If there are any **trees** **standing up**, they must be cut down with an **axe**.
 Kanji: 怪 (suspicious)
 Components: 忄 (weak heart), 圣 (sacred)
 Mnemonic: That painting of the **Sacred** **Heart** of Jesus looks ***suspicious***. I think it might be a forgery.
 Kanji: 温 (warm (to the touch))
 Components: 氵 (water drops), 日 (sun), 皿 (dish)
 Mnemonic: If you leave **water** on a **dish** in the **sun**, it will get ***warm***.
 Kanji: 階 (floor (of a building))
 Components: 阝 (small village), 皆 (all)
 Mnemonic: It might be a **small village**, but, despite that, **all** of its buildings have many ***floors***. It’s a village of skyscrapers!
 Kanji: 多 (many)
 Components: 夕 (evening (before sunset)), 夕 (evening (before sunset))
 Mnemonic: Two **evenings** in a day would be one too ***many***.
 Kanji: 別 (separate)
 Components: 口 (mouth), 万 (ten thousand), 刂 (knife)
 Mnemonic: Tom Six is at it again. For his next flick, he wants to stitch together **ten thousand** people, **mouth**‐to‐anus. One of the most graphic and disturbing scenes will feature one of the victims using a **knife** to ***separate*** perself.
 Kanji: 並 (line up)
 Components: 䒑 (antlers on a wall), 业 (runway)
 Mnemonic: In order to land a plane you have to ***line up*** properly with the **runway**. The things that look like **antlers** at the end of the runway are the control towers; you should follow their instructions.
 Kanji: 姿 (figure)
 Components: 次 (next), 女 (woman)
 Mnemonic: The **next** **woman** that I date will have a perfect **figure**. Because I’m done with 3D women—it will *literally* be an anime figure!
 Kanji: 実 (real)
 Components: 宀 (roof with a chimney), 𡗗 (three people)
 Mnemonic: Living under a **roof with a chimney** with **three people** (a wife and two children)—a happy family life—is not something I could have ever imagined. It does not feel ***real***.
 Kanji: 謝 (apologize)
 Components: 言 (say), 射 (shoot)
 Mnemonic: **Shot** first, ***apologize*** (**say** you are sorry) later.
 Kanji: 提 (propose)
 Components: 扌 (left hand), 是 (go with)
 Mnemonic:
--- a/prompts/parallel-questions.txt
+++ b/prompts/parallel-questions.txt
@ -0,0 +1,43 @@
 What do you know about Hobbits?
 What is quantum field theory?
 Why did the chicken cross the road?
 Who is the president of the United States?
 How do I run CMake on MacOS?
 Do you agree that C++ is a really finicky language compared with Python3?
 Is it a good idea to invest in technology?
 Do you like Wagner's Ring?
 Do you think this file input option is really neat?
 What should we all do about climate change?
 Is time-travel possible within the laws of current physics?
 Is it like anything to be a bat?
 Once the chicken has crossed the road, does it try to go back?
 Who is the greatest of all musical composers?
 What is art?
 Is there life elsewhere in the universe?
 What is intelligence?
 What is the difference between knowledge and intelligence?
 Will religion ever die?
 Do we understand ourselves?
 What is the best way to cook eggs?
 If you cannot see things, on what basis do you evaluate them?
 Explain the role of the np junction in photovoltaic cells?
 Is professional sport a good or bad influence on human behaviour?
 Is capital punishment immoral?
 Should we care about other people?
 Who are you?
 Which sense would you surrender if you could?
 Was Henry Ford a hero or a villain?
 Do we need leaders?
 What is nucleosynthesis?
 Who is the greatest scientist of all time?
 Who first observed what came to be known as the photovoltaic effect?
 What is nuclear fusion and why does it release energy?
 Can you know that you exist?
 What is an exoplanet?
 Do you like cream?
 What is the difference?
 Can I know that I exist while I'm dreaming that I'm Descartes?
 Who said "I didn't know I thought that until I heard myself saying it"?
 Does anything really matter?
 Can you explain the unreasonable effectiveness of mathematics?
--- a/requirements.txt
+++ b/requirements.txt
@ -1,3 +1,3 @@
-numpy==1.24
+numpy==1.24.4
 sentencepiece==0.1.98
 gguf>=0.1.0
--- a/scripts/LlamaConfig.cmake.in
+++ b/scripts/LlamaConfig.cmake.in
@ -56,11 +56,13 @@ find_library(llama_LIBRARY llama
    HINTS ${LLAMA_LIB_DIR})
 set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
 set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
 add_library(llama UNKNOWN IMPORTED)
 set_target_properties(llama
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
        INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
        INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
        IMPORTED_LOCATION "${llama_LIBRARY}"
        INTERFACE_COMPILE_FEATURES cxx_std_11
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@ -1,16 +1,18 @@
 #!/bin/bash
-cp -rpv ../ggml/src/ggml.c                ./ggml.c
+cp -rpv ../ggml/src/ggml.c                  ./ggml.c
-cp -rpv ../ggml/src/ggml-alloc.c          ./ggml-alloc.c
+cp -rpv ../ggml/src/ggml-alloc.c            ./ggml-alloc.c
-cp -rpv ../ggml/src/ggml-cuda.h           ./ggml-cuda.h
+cp -rpv ../ggml/src/ggml-backend.c          ./ggml-backend.c
-cp -rpv ../ggml/src/ggml-cuda.cu          ./ggml-cuda.cu
+cp -rpv ../ggml/src/ggml-cuda.h             ./ggml-cuda.h
-cp -rpv ../ggml/src/ggml-opencl.h         ./ggml-opencl.h
+cp -rpv ../ggml/src/ggml-cuda.cu            ./ggml-cuda.cu
-cp -rpv ../ggml/src/ggml-opencl.cpp       ./ggml-opencl.cpp
+cp -rpv ../ggml/src/ggml-opencl.h           ./ggml-opencl.h
-cp -rpv ../ggml/src/ggml-metal.h          ./ggml-metal.h
+cp -rpv ../ggml/src/ggml-opencl.cpp         ./ggml-opencl.cpp
-cp -rpv ../ggml/src/ggml-metal.m          ./ggml-metal.m
+cp -rpv ../ggml/src/ggml-metal.h            ./ggml-metal.h
-cp -rpv ../ggml/src/ggml-metal.metal      ./ggml-metal.metal
+cp -rpv ../ggml/src/ggml-metal.m            ./ggml-metal.m
-cp -rpv ../ggml/include/ggml/ggml.h       ./ggml.h
+cp -rpv ../ggml/src/ggml-metal.metal        ./ggml-metal.metal
-cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h
+cp -rpv ../ggml/include/ggml/ggml.h         ./ggml.h
 cp -rpv ../ggml/include/ggml/ggml-alloc.h   ./ggml-alloc.h
 cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h
 cp -rpv ../ggml/tests/test-opt.cpp    ./tests/test-opt.cpp
 cp -rpv ../ggml/tests/test-grad0.cpp  ./tests/test-grad0.cpp
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -7,9 +7,6 @@ endfunction()
 function(llama_test_executable name source)
    get_filename_component(TEST_TARGET ${source} NAME_WE)
    # add_executable(${TEST_TARGET} ${source})
    # install(TARGETS ${TEST_TARGET} RUNTIME)
    # target_link_libraries(${TEST_TARGET} PRIVATE llama)
    add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()
@ -28,10 +25,12 @@ llama_build_and_test_executable(test-sampling.cpp)
 llama_build_executable(test-tokenizer-0-llama.cpp)
 llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
 llama_build_executable(test-tokenizer-0-falcon.cpp)
-#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_build_executable(test-tokenizer-1-llama.cpp)
 llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_build_executable(test-tokenizer-1-bpe.cpp)
 llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp) # SLOW
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@ -208,26 +208,6 @@ static struct ggml_tensor * get_random_tensor_i32(
    return result;
 }
 static void print_elements(const char* label, const struct ggml_tensor * t) {
    if (!t) {
        printf("%s: %s = null\n", __func__, label);
        return;
    }
    const int nelements = ggml_nelements(t);
    printf("%s: %s = [", __func__, label);
    for (int k = 0; k < nelements; ++k) {
        if (k > 0) { printf(", "); }
        printf("%.5f", ggml_get_f32_1d(t, k));
    }
    printf("] shape: [");
    for (int k = 0; k < t->n_dims; ++k) {
        if (k > 0) { printf(", "); }
        printf("%d", (int)t->ne[k]);
    }
    printf("]\n");
 }
 static bool check_gradient(
        const char * op_name,
        struct ggml_context * ctx0,
--- a/tests/test-opt.cpp
+++ b/tests/test-opt.cpp
@ -40,27 +40,6 @@ static float frand(void) {
    return (float)rand()/(float)RAND_MAX;
 }
 static int irand(int n) {
    return rand()%n;
 }
 static void get_random_dims(int64_t * dims, int ndims) {
    dims[0] = dims[1] = dims[2] = dims[3] = 1;
    for (int i = 0; i < ndims; i++) {
        dims[i] = 1 + irand(4);
    }
 }
 static void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
    dims[0] = dims[1] = dims[2] = dims[3] = 1;
    for (int i = 0; i < ndims; i++) {
        dims[i] = min + irand(max-min);
    }
 }
 static struct ggml_tensor * get_random_tensor(
    struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
 ) {
@ -106,14 +85,6 @@ static struct ggml_tensor * get_random_tensor(
    return result;
 }
 static float get_element(const struct ggml_tensor * t, int idx) {
    return ((float *)t->data)[idx];
 }
 static void set_element(struct ggml_tensor * t, int idx, float value) {
    ((float *)t->data)[idx] = value;
 }
 int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 1024*1024*1024,
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@ -76,22 +76,21 @@ static void * align_with_offset(void * ptr, int offset) {
    return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
 }
-static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
+static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<float(void)> & func) {
    int64_t min_time_us = INT64_MAX;
    int64_t total_time_us = 0;
    int64_t min_time_cycles = INT64_MAX;
    int64_t total_time_cycles = 0;
    for (int i = 0; i < WARMUP; i++) {
-        function();
+        func();
    }
    for (int i = 0; i < iterations; i++) {
        const int64_t start_time = ggml_time_us();
        const int64_t start_cycles = cpu_cycles();
-        function();
+        func();
        const int64_t end_cycles = cpu_cycles();
        const int64_t end_time = ggml_time_us();
@ -245,15 +244,15 @@ int main(int argc, char * argv[]) {
    std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
    std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
-    std::vector<uint8_t> test_q1_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_q1_v   (largest*4 + MAX_ALIGNMENT*2);
-    std::vector<uint8_t> test_q2_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_q2_v   (largest*4 + MAX_ALIGNMENT*2);
-    std::vector<uint8_t> test_out_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_out_v  (largest*4 + MAX_ALIGNMENT*2);
    float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
    float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
-    float * test_q1 = (float *) align_with_offset(test_q1_v.data(), params.alignment_offset);
+    float * test_q1    = (float *) align_with_offset(test_q1_v.data(),    params.alignment_offset);
-    float * test_q2 = (float *) align_with_offset(test_q2_v.data(), params.alignment_offset);
+    float * test_q2    = (float *) align_with_offset(test_q2_v.data(),    params.alignment_offset);
-    float * test_out = (float *) align_with_offset(test_out_v.data(), params.alignment_offset);
+    float * test_out   = (float *) align_with_offset(test_out_v.data(),   params.alignment_offset);
    generate_data(0, largest, test_data1);
    generate_data(1, largest, test_data2);
@ -283,7 +282,7 @@ int main(int argc, char * argv[]) {
                printf("  quantize_row_q_reference\n");
                for (size_t size : params.test_sizes) {
                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
-                    auto quantize_fn = [&](void ) {
+                    auto quantize_fn = [&](void) -> float {
                        qfns.from_float_reference(test_data1, test_q1, size);
                        return test_q1[0];
                    };
@ -297,7 +296,7 @@ int main(int argc, char * argv[]) {
                printf("  quantize_row_q\n");
                for (size_t size : params.test_sizes) {
                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
-                    auto quantize_fn = [&](void ) {
+                    auto quantize_fn = [&](void) -> float {
                        qfns.from_float(test_data1, test_q1, size);
                        return test_q1[0];
                    };
@ -312,7 +311,7 @@ int main(int argc, char * argv[]) {
                qfns.from_float(test_data1, test_q1, largest);
                for (size_t size : params.test_sizes) {
                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
-                    auto quantize_fn = [&](void ) {
+                    auto quantize_fn = [&](void) -> float {
                        qfns.to_float(test_q1, test_out, size);
                        return test_out[0];
                    };
@ -326,7 +325,7 @@ int main(int argc, char * argv[]) {
                printf("  quantize_row_q_dot\n");
                for (size_t size : params.test_sizes) {
                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
-                    auto quantize_fn = [&](void ) {
+                    auto quantize_fn = [&](void) -> float {
                        auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
                        vdot.from_float(test_data1, test_q1, size);
                        return test_q1[0];
@ -343,7 +342,7 @@ int main(int argc, char * argv[]) {
                qfns.from_float(test_data2, test_q2, largest);
                for (size_t size : params.test_sizes) {
                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
-                    auto quantize_fn = [&](void ) {
+                    auto quantize_fn = [&](void) -> float {
                        float result;
                        qfns.vec_dot(size, &result, test_q1, test_q2);
                        return result;
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@ -1,5 +1,6 @@
 #include "llama.h"
 #include "common.h"
 #include "console.h"
 #include <cstdio>
 #include <string>
@ -35,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
        { "   Hello"              , {     258,  23090, }, },
        { "    Hello"             , {     466,  23090, }, },
        { "    Hello\n    Hello"  , {     466,  23090,    742,  23090, }, },
        { "\n ="                  , {    1212,     40, }, },
        { "' era"                 , {      18,   4932, }, },
    };
    return _k_tests;
@ -85,12 +88,18 @@ int main(int argc, char **argv) {
    }
    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
-        fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
+        fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
        llama_free_model(model);
        llama_free(ctx);
        return 2;
    }
 #ifdef _WIN32
    // We need this for unicode console support
    console::init(false, false);
    atexit([]() { console::cleanup(); });
 #endif
    bool success = true;
    for (const auto & test_kv : k_tests()) {
@ -148,7 +157,7 @@ int main(int argc, char **argv) {
        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
@ -162,10 +171,8 @@ int main(int argc, char **argv) {
            }
            for (const auto & tok : res) {
-                ofs << tok << " ";
+                ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
            }
            ofs << "\n";
        }
        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
--- a/tests/test-tokenizer-0-falcon.py
+++ b/tests/test-tokenizer-0-falcon.py
@ -41,6 +41,8 @@ tests = [
        "   Hello",
        "    Hello",
        "    Hello\n    Hello",
        "\n =",
        "' era",
    ]
 for text in tests:
@ -69,15 +71,14 @@ fname_tok = args.fname_tok
 if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        s = ''.join(lines)
        res = tokenizer.encode(s)
        # write to file
-        with open(fname_out, 'w') as f:
+        with open(fname_out, 'w', encoding='utf-8') as f:
            for x in res:
-                f.write(str(x) + ' ')
+                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
            f.write('\n')
        print('len(res): ', len(res))
        print('len(lines): ', len(lines))
    print('results written to: ', fname_out)
--- a/Show more
+++ b/Show more