clean up & split PRs

2023-12-21 01:43:08 +01:00 · 2023-12-21 01:43:08 +01:00 · 20171125a8
commit 20171125a8
parent 98366a4047
90 changed files with 13946 additions and 4934 deletions
--- a/.editorconfig
+++ b/.editorconfig
@ -15,8 +15,14 @@ indent_size = 4
 [Makefile]
 indent_style = tab

+[scripts/*.mk]
+indent_style = tab
+
 [prompts/*.txt]
 insert_final_newline = unset

 [examples/server/public/*]
 indent_size = 2
+
+[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
+indent_style = tab
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -143,6 +143,9 @@ jobs:
          cd build
          ctest --verbose

+  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+  #       how to debug it.
+  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
  macOS-latest-make:
    runs-on: macos-latest

@ -160,14 +163,18 @@ jobs:
      - name: Build
        id: make_build
        run: |
-          make -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: make_test
        run: |
-          make tests -j $(sysctl -n hw.logicalcpu)
-          make test -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
+          LLAMA_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)

+  # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
+  #       how to debug it.
+  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
+  #       would be great if we fix these
  macOS-latest-cmake:
    runs-on: macos-latest

@ -188,7 +195,7 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake ..
+          cmake -DLLAMA_METAL=OFF ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
--- a/.gitignore
+++ b/.gitignore
@ -101,4 +101,4 @@ poetry.toml
 /tests/test-tokenizer-1-llama
 /tests/test-tokenizer-1-bpe
 /tests/test-rope
-workspace
+/tests/test-backend-ops
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -97,9 +97,9 @@ option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)

-option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES             "llama: build examples" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER               "llama: build server example"                           ON)
+option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)

 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
@ -291,7 +291,12 @@ if (LLAMA_CUBLAS)
        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})

        if (LLAMA_STATIC)
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            if (WIN32)
+                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+            else ()
+                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            endif()
        else()
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()
@ -397,57 +402,102 @@ if (LLAMA_HIPBLAS)
    endif()
 endif()

-if (LLAMA_ALL_WARNINGS)
-    if (NOT MSVC)
-        set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
-        set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
-        set(host_cxx_flags "")
+function(get_flags CCID CCVER)
+    set(C_FLAGS "")
+    set(CXX_FLAGS "")

-        if (CMAKE_C_COMPILER_ID MATCHES "Clang")
-            set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
-            set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)
+    if (CCID MATCHES "Clang")
+        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
+        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)

-            if (
-                (CMAKE_C_COMPILER_ID STREQUAL "Clang"      AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
-                (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0)
-            )
-                set(c_flags ${c_flags} -Wdouble-promotion)
-            endif()
-        elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
-            set(c_flags ${c_flags} -Wdouble-promotion)
-            set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)
-
-            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
-                set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
-            endif()
-            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
-                set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
-            endif()
+        if (
+            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
+            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
+        )
+            set(C_FLAGS ${C_FLAGS} -Wdouble-promotion)
+        endif()
+    elseif (CCID STREQUAL "GNU")
+        set(C_FLAGS   -Wdouble-promotion)
+        set(CXX_FLAGS -Wno-array-bounds)
+
+        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
+            set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation)
+        endif()
+        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
+            set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
        endif()
-    else()
-        # todo : msvc
    endif()

-    set(c_flags   ${c_flags}   ${warning_flags})
-    set(cxx_flags ${cxx_flags} ${warning_flags})
-    add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
-                        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
-                        "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
+    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
+    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
+endfunction()

+if (LLAMA_ALL_WARNINGS)
+    if (NOT MSVC)
+        set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+        set(C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
+                          -Werror=implicit-int -Werror=implicit-function-declaration)
+        set(CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)
+
+        set(C_FLAGS   ${WARNING_FLAGS} ${C_FLAGS})
+        set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS})
+
+        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+
+        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
+                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+    else()
+        # todo : msvc
+        set(C_FLAGS   "")
+        set(CXX_FLAGS "")
+    endif()
 endif()

-if (NOT MSVC)
-    set(cuda_flags -Wno-pedantic)
-endif()
-set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
+if (LLAMA_CUBLAS)
+    set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
+    if (NOT MSVC)
+        set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic)
+    endif()

-list(JOIN host_cxx_flags " " cuda_host_flags)  # pass host compiler flags as a single argument
-if (NOT cuda_host_flags STREQUAL "")
-    set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
-endif()
+    if (LLAMA_ALL_WARNINGS AND NOT MSVC)
+        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
+        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
+            set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER})
+        endif()

-add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
+        execute_process(
+            COMMAND ${NVCC_CMD} -Xcompiler --version
+            OUTPUT_VARIABLE CUDA_CCFULLVER
+            ERROR_QUIET
+        )
+
+        if (NOT CUDA_CCFULLVER MATCHES clang)
+            set(CUDA_CCID "GNU")
+            execute_process(
+                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
+                OUTPUT_VARIABLE CUDA_CCVER
+                ERROR_QUIET
+            )
+        else()
+            if (CUDA_CCFULLVER MATCHES Apple)
+                set(CUDA_CCID "AppleClang")
+            else()
+                set(CUDA_CCID "Clang")
+            endif()
+            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
+        endif()
+
+        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
+
+        get_flags(${CUDA_CCID} ${CUDA_CCVER})
+        list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS)  # pass host compiler flags as a single argument
+        if (NOT CUDA_CXX_FLAGS STREQUAL "")
+            set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
+        endif()
+    endif()
+
+    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
+endif()

 if (WIN32)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
@ -471,6 +521,7 @@ endif()
 execute_process(
    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
    ERROR_VARIABLE output
+    OUTPUT_QUIET
 )
 if (output MATCHES "dyld-1015\.7")
    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
@ -593,6 +644,11 @@ else()
    message(STATUS "Unknown architecture")
 endif()

+if (MINGW)
+    # Target Windows 8 for PrefetchVirtualMemory
+    add_compile_definitions(_WIN32_WINNT=0x602)
+endif()
+
 #
 # POSIX conformance
 #
@ -662,11 +718,11 @@ add_library(ggml OBJECT
            ggml-backend.h
            ggml-quants.c
            ggml-quants.h
-            ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_CUDA}   ${GGML_HEADERS_CUDA}
            ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_METAL}  ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI}    ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA}  ${GGML_HEADERS_EXTRA}
            )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
--- a/158
+++ b/158
@ -8,7 +8,8 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
+	tests/test-backend-ops

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@ -25,20 +26,6 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif

-ifeq '' '$(findstring clang,$(shell $(CC) --version))'
-	CC_IS_GCC=1
-	CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
-else
-	CC_IS_CLANG=1
-	ifeq '' '$(findstring Apple,$(shell $(CC) --version))'
-		CC_IS_LLVM_CLANG=1
-	else
-		CC_IS_APPLE_CLANG=1
-	endif
-	CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
-				| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
-endif
-
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@ -120,12 +107,12 @@ MK_CXXFLAGS = -std=c++11 -fPIC

 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
-MK_CFLAGS        += -Ofast
-MK_HOST_CXXFLAGS += -Ofast
-MK_CUDA_CXXFLAGS += -O3
+MK_CFLAGS     += -Ofast
+HOST_CXXFLAGS += -Ofast
+MK_NVCCFLAGS  += -O3
 else
-MK_CFLAGS        += -O3
-MK_CXXFLAGS      += -O3
+MK_CFLAGS     += -O3
+MK_CXXFLAGS   += -O3
 endif

 # clock_gettime came in POSIX.1b (1993)
@ -219,30 +206,6 @@ MK_CFLAGS    += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
 				-Werror=implicit-function-declaration
 MK_CXXFLAGS  += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn

-ifeq ($(CC_IS_CLANG), 1)
-	# clang options
-	MK_CFLAGS        += -Wunreachable-code-break -Wunreachable-code-return
-	MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
-
-	ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
-		MK_CFLAGS += -Wdouble-promotion
-	endif
-	ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
-		MK_CFLAGS += -Wdouble-promotion
-	endif
-else
-	# gcc options
-	MK_CFLAGS        += -Wdouble-promotion
-	MK_HOST_CXXFLAGS += -Wno-array-bounds
-
-	ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
-		MK_HOST_CXXFLAGS += -Wno-format-truncation
-	endif
-	ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
-		MK_HOST_CXXFLAGS += -Wextra-semi
-	endif
-endif
-
 # this version of Apple ld64 is buggy
 ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
 	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@ -293,8 +256,8 @@ ifndef RISCV

 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
-	MK_CFLAGS   += -march=native -mtune=native
-	MK_HOST_CXXFLAGS += -march=native -mtune=native
+	MK_CFLAGS     += -march=native -mtune=native
+	HOST_CXXFLAGS += -march=native -mtune=native

 	# Usage AVX-only
 	#MK_CFLAGS   += -mfma -mf16c -mavx
@ -305,12 +268,15 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	#MK_CXXFLAGS += -mssse3
 endif

-# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
-# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
-# https://github.com/ggerganov/llama.cpp/issues/2922
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
+	# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
+	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
+	# https://github.com/ggerganov/llama.cpp/issues/2922
 	MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
 	MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
+
+	# Target Windows 8 for PrefetchVirtualMemory
+	MK_CPPFLAGS += -D_WIN32_WINNT=0x602
 endif

 ifneq ($(filter aarch64%,$(UNAME_M)),)
@ -394,61 +360,64 @@ ifdef LLAMA_CUBLAS
 	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	MK_LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS         += ggml-cuda.o
-	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
+	MK_NVCCFLAGS  = --forward-unknown-to-host-compiler -use_fast_math
+
+ifdef LLAMA_DEBUG
+	MK_NVCCFLAGS += -lineinfo
+endif
+
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(LLAMA_CUDA_NVCC)
 else
 	NVCC = nvcc
 endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
-	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
-else ifdef CUDA_POWER_ARCH
-	NVCCFLAGS +=
-else
-	NVCCFLAGS += -arch=native
+	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else ifndef CUDA_POWER_ARCH
+	MK_NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
-	NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
+	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_FORCE_MMQ
-	NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
-	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
-	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
+	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
 ifdef LLAMA_CUDA_MMV_Y
-	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 else ifdef LLAMA_CUDA_DMMV_Y
-	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
+	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
 else
-	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
+	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_MMV_Y
 ifdef LLAMA_CUDA_F16
-	NVCCFLAGS += -DGGML_CUDA_F16
+	MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_F16
 ifdef LLAMA_CUDA_DMMV_F16
-	NVCCFLAGS += -DGGML_CUDA_F16
+	MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_DMMV_F16
 ifdef LLAMA_CUDA_KQUANTS_ITER
-	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else
-	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
+	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
 ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-	NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
+	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
 else
-	NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 #ifdef LLAMA_CUDA_CUBLAS
-#	NVCCFLAGS += -DGGML_CUDA_CUBLAS
+#	MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
 #endif # LLAMA_CUDA_CUBLAS
 ifdef LLAMA_CUDA_CCBIN
-	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) -c $< -o $@
+	$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endif # LLAMA_CUBLAS

 ifdef LLAMA_CLBLAST
@ -470,9 +439,15 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 endif # LLAMA_CLBLAST

 ifdef LLAMA_HIPBLAS
-	ROCM_PATH	?= /opt/rocm
-	HIPCC	    ?= $(ROCM_PATH)/bin/hipcc
-	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+
+	ifeq ($(wildcard /opt/rocm),)
+		ROCM_PATH	?= /usr
+		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
+	else
+		ROCM_PATH	?= /opt/rocm
+		GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+	endif
+	HIPCC                   ?= $(ROCM_PATH)/bin/hipcc
 	LLAMA_CUDA_DMMV_X       ?= 32
 	LLAMA_CUDA_MMV_Y        ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
@ -510,16 +485,22 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

-# combine build flags with cmdline overrides
-override CFLAGS        := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
-override CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
-override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
-override LDFLAGS       := $(MK_LDFLAGS) $(LDFLAGS)
+GF_CC := $(CC)
+include scripts/get-flags.mk

-# save CXXFLAGS before we add host-only options
-NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
-override CXXFLAGS += $(HOST_CXXFLAGS)
+# combine build flags with cmdline overrides
+override CFLAGS    := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+BASE_CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
+override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
+override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
+override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
+
+# identify CUDA host compiler
+ifdef LLAMA_CUBLAS
+GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
+include scripts/get-flags.mk
+CUDA_CXXFLAGS := $(GF_CXXFLAGS)
+endif

 #
 # Print build information
@ -729,16 +710,16 @@ tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
 tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
@ -746,3 +727,6 @@ tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)

 tests/test-c.o: tests/test-c.c llama.h
 	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
+
+tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
--- a/Package.swift
+++ b/Package.swift
@ -2,33 +2,14 @@

 import PackageDescription

-#if arch(arm) || arch(arm64)
-let platforms: [SupportedPlatform]? = [
-    .macOS(.v12),
-    .iOS(.v14),
-    .watchOS(.v4),
-    .tvOS(.v14)
-]
-let exclude: [String] = []
-let resources: [Resource] = [
-    .process("ggml-metal.metal")
-]
-let additionalSources: [String] = ["ggml-metal.m"]
-let additionalSettings: [CSetting] = [
-    .unsafeFlags(["-fno-objc-arc"]),
-    .define("GGML_USE_METAL")
-]
-#else
-let platforms: [SupportedPlatform]? = nil
-let exclude: [String] = ["ggml-metal.metal"]
-let resources: [Resource] = []
-let additionalSources: [String] = []
-let additionalSettings: [CSetting] = []
-#endif
-
 let package = Package(
    name: "llama",
-    platforms: platforms,
+    platforms: [
+        .macOS(.v12),
+        .iOS(.v14),
+        .watchOS(.v4),
+        .tvOS(.v14)
+    ],
    products: [
        .library(name: "llama", targets: ["llama"]),
    ],
@ -36,25 +17,30 @@ let package = Package(
        .target(
            name: "llama",
            path: ".",
-            exclude: exclude,
+            exclude: [],
            sources: [
                "ggml.c",
                "llama.cpp",
                "ggml-alloc.c",
                "ggml-backend.c",
                "ggml-quants.c",
-            ] + additionalSources,
-            resources: resources,
+                "ggml-metal.m",
+            ],
+            resources: [
+                .process("ggml-metal.metal")
+            ],
            publicHeadersPath: "spm-headers",
            cSettings: [
                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE")
+                .define("GGML_USE_ACCELERATE"),
+                .unsafeFlags(["-fno-objc-arc"]),
+                .define("GGML_USE_METAL"),
                // NOTE: NEW_LAPACK will required iOS version 16.4+
                // We should consider add this in the future when we drop support for iOS 14
                // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
                // .define("ACCELERATE_NEW_LAPACK"),
                // .define("ACCELERATE_LAPACK_ILP64")
-            ] + additionalSettings,
+            ],
            linkerSettings: [
                .linkedFramework("Accelerate")
            ]
--- a/README.md
+++ b/README.md
@ -1,12 +1,7 @@
 # llama.cpp

-
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

-
-
-
-
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@ -15,9 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
+- Collecting Apple Silicon performance stats:
+  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
+  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
+- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167

 ----

@ -100,7 +97,18 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
 - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
+- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
 - [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586)
+- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
+- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
+- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
+
+**Multimodal models:**
+
+- [x] [Llava 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e)
+- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
+- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
+- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)


 **Bindings:**
--- a/common/common.cpp
+++ b/common/common.cpp
@ -278,8 +278,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.yarn_beta_slow = std::stof(argv[i]);
-        } else if (arg == "--memory-f32") {
-            params.memory_f16 = false;
+        } else if (arg == "--samplers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.samplers_sequence = parse_samplers_input(argv[i]);
+        } else if (arg == "--sampling-seq") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.samplers_sequence = argv[i];
        } else if (arg == "--top-p") {
            if (++i >= argc) {
                invalid_param = true;
@ -498,6 +508,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.infill = true;
        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
            params.dump_kv_cache = true;
+        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+            params.no_kv_offload = true;
+        } else if (arg == "-ctk" || arg == "--cache-type-k") {
+            params.cache_type_k = argv[++i];
+        } else if (arg == "-ctv" || arg == "--cache-type-v") {
+            params.cache_type_v = argv[++i];
        } else if (arg == "--multiline-input") {
            params.multiline_input = true;
        } else if (arg == "--simple-io") {
@ -640,6 +656,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        } else if (arg == "-h" || arg == "--help") {
            return false;

+        } else if (arg == "--version") {
+            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
        } else if (arg == "--in-prefix-bos") {
@ -678,6 +698,47 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                std::istreambuf_iterator<char>(),
                std::back_inserter(sparams.grammar)
            );
+        } else if (arg == "--override-kv") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            char * sep = strchr(argv[i], '=');
+            if (sep == nullptr || sep - argv[i] >= 128) {
+                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            struct llama_model_kv_override kvo;
+            std::strncpy(kvo.key, argv[i], sep - argv[i]);
+            kvo.key[sep - argv[i]] = 0;
+            sep++;
+            if (strncmp(sep, "int:", 4) == 0) {
+                sep += 4;
+                kvo.tag = LLAMA_KV_OVERRIDE_INT;
+                kvo.int_value = std::atol(sep);
+            } else if (strncmp(sep, "float:", 6) == 0) {
+                sep += 6;
+                kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
+                kvo.float_value = std::atof(sep);
+            } else if (strncmp(sep, "bool:", 5) == 0) {
+                sep += 5;
+                kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
+                if (std::strcmp(sep, "true") == 0) {
+                    kvo.bool_value = true;
+                } else if (std::strcmp(sep, "false") == 0) {
+                    kvo.bool_value = false;
+                } else {
+                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
+                    invalid_param = true;
+                    break;
+                }
+            } else {
+                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            params.kv_overrides.push_back(kvo);
 #ifndef LOG_DISABLE_LOGS
        // Parse args for logging parameters
        } else if ( log_param_single_parse( argv[i] ) ) {
@ -721,6 +782,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        }
    }

+    if (!params.kv_overrides.empty()) {
+        params.kv_overrides.emplace_back(llama_model_kv_override());
+        params.kv_overrides.back().key[0] = 0;
+    }
+
    return true;
 }

@ -732,6 +798,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("\n");
    printf("options:\n");
    printf("  -h, --help            show this help message and exit\n");
+    printf("      --version         show version and build info\n");
    printf("  -i, --interactive     run in interactive mode\n");
    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
@ -761,6 +828,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
+    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
@ -798,8 +867,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --yarn-beta-fast N    YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    printf("  --no-penalize-nl      do not penalize newline token\n");
-    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
    printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
    printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
@ -840,6 +907,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --verbose-prompt      print prompt before generation\n");
    printf("  -dkvc, --dump-kv-cache\n");
    printf("                        verbose print of the KV cache\n");
+    printf("  -nkvo, --no-kv-offload\n");
+    printf("                        disable KV offload\n");
+    printf("  -ctk TYPE, --cache-type-k TYPE\n");
+    printf("                        KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
+    printf("  -ctv TYPE, --cache-type-v TYPE\n");
+    printf("                        KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
    printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
    printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
@ -850,6 +923,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
    printf("  -ld LOGDIR, --logdir LOGDIR\n");
    printf("                        path under which to save YAML logs (no logging if unset)\n");
+    printf("  --override-kv KEY=TYPE:VALUE\n");
+    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
+    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("\n");
 #ifndef LOG_DISABLE_LOGS
    log_print_usage();
@ -886,6 +962,48 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
    GGML_UNREACHABLE();
 }

+//
+// String parsing
+//
+
+std::string parse_samplers_input(std::string input) {
+    std::string output = "";
+    // since samplers names are written multiple ways
+    // make it ready for both system names and input names
+    std::unordered_map<std::string, char> samplers_symbols {
+        {"top_k",      'k'},
+        {"top-k",      'k'},
+        {"top_p",      'p'},
+        {"top-p",      'p'},
+        {"nucleus",    'p'},
+        {"typical_p",  'y'},
+        {"typical-p",  'y'},
+        {"typical",    'y'},
+        {"min_p",      'm'},
+        {"min-p",      'm'},
+        {"tfs_z",      'f'},
+        {"tfs-z",      'f'},
+        {"tfs",        'f'},
+        {"temp",       't'},
+        {"temperature",'t'}
+    };
+    // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
+    size_t separator = input.find(';');
+    while (separator != input.npos) {
+        std::string name = input.substr(0,separator);
+        input = input.substr(separator+1);
+        separator = input.find(';');
+
+        if (samplers_symbols.find(name) != samplers_symbols.end()) {
+            output += samplers_symbols[name];
+        }
+    }
+    if (samplers_symbols.find(input) != samplers_symbols.end()) {
+        output += samplers_symbols[input];
+    }
+    return output;
+}
+
 //
 // Model utils
 //
@ -900,10 +1018,39 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
+    if (params.kv_overrides.empty()) {
+        mparams.kv_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
+        mparams.kv_overrides = params.kv_overrides.data();
+    }

    return mparams;
 }

+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    if (s == "f16") {
+        return GGML_TYPE_F16;
+    }
+    if (s == "q8_0") {
+        return GGML_TYPE_Q8_0;
+    }
+    if (s == "q4_0") {
+        return GGML_TYPE_Q4_0;
+    }
+    if (s == "q4_1") {
+        return GGML_TYPE_Q4_1;
+    }
+    if (s == "q5_0") {
+        return GGML_TYPE_Q5_0;
+    }
+    if (s == "q5_1") {
+        return GGML_TYPE_Q5_1;
+    }
+
+    throw std::runtime_error("Invalid cache type: " + s);
+}
+
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    auto cparams = llama_context_default_params();

@ -913,7 +1060,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    cparams.mul_mat_q         = params.mul_mat_q;
    cparams.seed              = params.seed;
-    cparams.f16_kv            = params.memory_f16;
    cparams.logits_all        = params.logits_all;
    cparams.embedding         = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
@ -924,6 +1070,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.yarn_beta_fast    = params.yarn_beta_fast;
    cparams.yarn_beta_slow    = params.yarn_beta_slow;
    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
+    cparams.offload_kqv       = !params.no_kv_offload;
+
+    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
+    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

    return cparams;
 }
@ -1336,7 +1486,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    }
    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
--- a/common/common.h
+++ b/common/common.h
@ -86,6 +86,8 @@ struct gpt_params {
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir            = "";  // directory in which to save YAML log files

+    std::vector<llama_model_kv_override> kv_overrides;
+
    // TODO: avoid tuple, use struct
    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
    std::string lora_base  = "";                              // base model path for the lora adapter
@ -98,7 +100,6 @@ struct gpt_params {
    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score

    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
-    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
@ -123,10 +124,14 @@ struct gpt_params {
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool infill            = false; // use infill mode
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
+    bool no_kv_offload     = false; // disable KV offloading
+
+    std::string cache_type_k = "f16"; // KV cache data type for the K
+    std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
    std::string mmproj = ""; // path to multimodal projector
-    std::string image = ""; // path to an image file
+    std::string image  = ""; // path to an image file
 };

 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
@ -141,6 +146,12 @@ std::string gpt_random_prompt(std::mt19937 & rng);

 void process_escapes(std::string& input);

+//
+// String parsing
+//
+
+std::string parse_samplers_input(std::string input);
+
 //
 // Model utils
 //
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@ -190,7 +190,7 @@ namespace grammar_parser {
                pos = parse_space(pos + 1, is_nested);
            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
                if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
+                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
                }

                // apply transformation to previous symbol (last_sym_start to end) according to
--- a/common/log.h
+++ b/common/log.h
@ -61,13 +61,13 @@
 //  #define LOG_TARGET stderr
 //  #include "log.h"
 //
-//  The log target can also be redirected to a diffrent function
+//  The log target can also be redirected to a different function
 //  like so:
 //
-//  #define LOG_TARGET log_handler_diffrent()
+//  #define LOG_TARGET log_handler_different()
 //  #include "log.h"
 //
-//  FILE* log_handler_diffrent()
+//  FILE* log_handler_different()
 //  {
 //      return stderr;
 //  }
@ -421,7 +421,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS

 // Disables logs entirely at runtime.
 //  Makes LOG() and LOG_TEE() produce no output,
-//  untill enabled back.
+//  until enabled back.
 #define log_disable() log_disable_impl()

 // INTERNAL, DO NOT USE
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -99,6 +99,56 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
    return std::string(result);
 }

+std::string llama_sampling_order_print(const llama_sampling_params & params) {
+    std::string result = "CFG -> Penalties ";
+    if (params.mirostat == 0) {
+        for (auto s : params.samplers_sequence) {
+            switch (s) {
+                case 'k': result += "-> top_k "; break;
+                case 'f': result += "-> tfs_z "; break;
+                case 'y': result += "-> typical_p "; break;
+                case 'p': result += "-> top_p "; break;
+                case 'm': result += "-> min_p "; break;
+                case 't': result += "-> temp "; break;
+                default : break;
+            }
+        }
+    } else {
+        result += "-> mirostat ";
+    }
+
+    return result;
+}
+
+// no reasons to expose this function in header
+static void sampler_queue(
+                   struct llama_context * ctx_main,
+            const llama_sampling_params & params,
+                 llama_token_data_array & cur_p,
+                                 size_t & min_keep) {
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
+
+    const float         temp              = params.temp;
+    const int32_t       top_k             = params.top_k <= 0 ? n_vocab : params.top_k;
+    const float         top_p             = params.top_p;
+    const float         min_p             = params.min_p;
+    const float         tfs_z             = params.tfs_z;
+    const float         typical_p         = params.typical_p;
+    const std::string & samplers_sequence = params.samplers_sequence;
+
+    for (auto s : samplers_sequence) {
+        switch (s){
+            case 'k': llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
+            case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
+            case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
+            case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
+            case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
+            case 't': llama_sample_temp     (ctx_main, &cur_p, temp); break;
+            default : break;
+        }
+    }
+}
+
 llama_token llama_sampling_sample(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
@ -109,11 +159,6 @@ llama_token llama_sampling_sample(
    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

    const float   temp            = params.temp;
-    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
-    const float   top_p           = params.top_p;
-    const float   min_p           = params.min_p;
-    const float   tfs_z           = params.tfs_z;
-    const float   typical_p       = params.typical_p;
    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
    const float   penalty_repeat  = params.penalty_repeat;
    const float   penalty_freq    = params.penalty_freq;
@ -188,12 +233,7 @@ llama_token llama_sampling_sample(
            // temperature sampling
            size_t min_keep = std::max(1, params.n_probs);

-            llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep);
-            llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
-            llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
-            llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
-            llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep);
-            llama_sample_temp     (ctx_main, &cur_p, temp);
+            sampler_queue(ctx_main, params, cur_p, min_keep);

            id = llama_sample_token(ctx_main, &cur_p);

--- a/common/sampling.h
+++ b/common/sampling.h
@ -10,22 +10,23 @@

 // sampling parameters
 typedef struct llama_sampling_params {
-    int32_t n_prev            = 64;    // number of previous tokens to remember
-    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   min_p             = 0.05f; // 0.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   typical_p         = 1.00f; // 1.0 = disabled
-    float   temp              = 0.80f; // 1.0 = disabled
-    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.10f; // 1.0 = disabled
-    float   penalty_freq      = 0.00f; // 0.0 = disabled
-    float   penalty_present   = 0.00f; // 0.0 = disabled
-    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = true;  // consider newlines as a repeatable token
+    int32_t     n_prev                = 64;       // number of previous tokens to remember
+    int32_t     n_probs               = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t     top_k                 = 40;       // <= 0 to use vocab size
+    float       top_p                 = 0.95f;    // 1.0 = disabled
+    float       min_p                 = 0.05f;    // 0.0 = disabled
+    float       tfs_z                 = 1.00f;    // 1.0 = disabled
+    float       typical_p             = 1.00f;    // 1.0 = disabled
+    float       temp                  = 0.80f;    // 1.0 = disabled
+    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float       penalty_repeat        = 1.10f;    // 1.0 = disabled
+    float       penalty_freq          = 0.00f;    // 0.0 = disabled
+    float       penalty_present       = 0.00f;    // 0.0 = disabled
+    int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float       mirostat_tau          = 5.00f;    // target entropy
+    float       mirostat_eta          = 0.10f;    // learning rate
+    bool        penalize_nl           = true;     // consider newlines as a repeatable token
+    std::string samplers_sequence     = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp

    std::string grammar;  // optional BNF-like grammar to constrain sampling

@ -80,6 +81,9 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
 // Print sampling parameters into a string
 std::string llama_sampling_print(const llama_sampling_params & params);

+// Print sampling order into a string
+std::string llama_sampling_order_print(const llama_sampling_params & params);
+
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
--- a/common/train.cpp
+++ b/common/train.cpp
@ -71,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)

 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
    float scale = 1.0f; // xavier
-    switch (tensor->n_dims) {
+    switch (ggml_n_dims(tensor)) {
        case 1:
            scale /= sqrtf((float) tensor->ne[0]);
            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
@ -119,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct
 }

 struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
-    switch (tensor->n_dims) {
+    switch (ggml_n_dims(tensor)) {
        case 1:
            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
@ -183,25 +183,27 @@ float fclamp(const float v, const float min, const float max) {
 }

 void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
-    GGML_ASSERT(tensor->n_dims == 1);
    GGML_ASSERT(tensor->ne[0] == ne0);
+    GGML_ASSERT(tensor->ne[1] == 1);
+    GGML_ASSERT(tensor->ne[2] == 1);
+    GGML_ASSERT(tensor->ne[3] == 1);
 }

 void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
-    GGML_ASSERT(tensor->n_dims == 2);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
+    GGML_ASSERT(tensor->ne[2] == 1);
+    GGML_ASSERT(tensor->ne[3] == 1);
 }

 void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
-    GGML_ASSERT(tensor->n_dims == 3);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
+    GGML_ASSERT(tensor->ne[3] == 1);
 }

 void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
-    GGML_ASSERT(tensor->n_dims == 4);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
@ -225,8 +227,8 @@ int64_t get_example_targets_batch(
    bool                   sample_random_offsets
 ) {
    GGML_ASSERT(samples_count > 0);
-    GGML_ASSERT(tokens_input->n_dims  == 2);
-    GGML_ASSERT(target_probs->n_dims  == 3);
+    GGML_ASSERT(ggml_is_matrix(tokens_input));
+    GGML_ASSERT(ggml_is_3d(target_probs));
    int64_t n_vocab  = target_probs->ne[0];
    int64_t n_tokens = tokens_input->ne[0];
    int64_t n_batch  = tokens_input->ne[1];
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -77,8 +77,18 @@ class Model:
            self.gguf_writer.add_embedding_length(n_embd)
        if (n_ff := self.hparams.get("intermediate_size")) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
-        if (n_head := self.hparams.get("num_attention_head")) is not None:
+        if (n_head := self.hparams.get("num_attention_heads")) is not None:
            self.gguf_writer.add_head_count(n_head)
+        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
+            self.gguf_writer.add_head_count_kv(n_head_kv)
+
+        if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
+            self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
+        if (n_experts := self.hparams.get("num_local_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))

    def write_tensors(self):
@ -170,6 +180,10 @@ class Model:
            return StableLMModel
        if model_architecture == "QWenLMHeadModel":
            return QwenModel
+        if model_architecture == "MixtralForCausalLM":
+            return MixtralModel
+        if model_architecture == "PhiForCausalLM":
+            return Phi2Model
        return Model

    def _is_model_safetensors(self) -> bool:
@ -207,6 +221,10 @@ class Model:
            return gguf.MODEL_ARCH.STABLELM
        if arch == "QWenLMHeadModel":
            return gguf.MODEL_ARCH.QWEN
+        if arch == "MixtralForCausalLM":
+            return gguf.MODEL_ARCH.LLAMA
+        if arch == "PhiForCausalLM":
+            return gguf.MODEL_ARCH.PHI2

        raise NotImplementedError(f'Architecture "{arch}" not supported!')

@ -837,6 +855,11 @@ class StableLMModel(Model):
        self.gguf_writer.add_layer_norm_eps(1e-5)


+class MixtralModel(Model):
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+
 class QwenModel(Model):
    @staticmethod
    def token_bytes_to_string(b):
@ -961,6 +984,24 @@ class QwenModel(Model):
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)

+
+class Phi2Model(Model):
+    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layer"]
+
+        self.gguf_writer.add_name("Phi2")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_head_count_kv(self.hparams["n_head"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_add_bos_token(False)
+
+
 ###### CONVERSION LOGIC ######


--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@ -3,7 +3,6 @@ from __future__ import annotations

 import json
 import os
-import re
 import struct
 import sys
 from typing import Any, BinaryIO, Sequence
@ -11,43 +10,15 @@ from typing import Any, BinaryIO, Sequence
 import numpy as np
 import torch

+from pathlib import Path
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+
 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}


-HF_SUBLAYER_TO_GGML = {
-    "self_attn.q_proj": "attn_q",
-    "self_attn.k_proj": "attn_k",
-    "self_attn.v_proj": "attn_v",
-    "self_attn.o_proj": "attn_output",
-    "mlp.gate_proj": "ffn_gate",
-    "mlp.down_proj": "ffn_down",
-    "mlp.up_proj": "ffn_up",
-    "input_layernorm": "attn_norm",
-    "post_attention_layernorm": "ffn_norm",
-}
-
-
-def translate_tensor_name(t: str) -> str:
-    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
-    if match:
-        nn = match.group(1)
-        sub_layer = match.group(2)
-        lora_type = match.group(3)
-
-        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
-        if sub_layer_renamed is None:
-            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
-            sys.exit(1)
-
-        output_string = (
-            f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
-        )
-        return output_string
-    else:
-        print(f"Error: unrecognized tensor {t}")
-        sys.exit(1)
-
-
 def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(struct.pack("i", int(params["lora_alpha"])))


-def write_tensor_header(
-    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
-) -> None:
+def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
    sname = name.encode("utf-8")
    fout.write(
        struct.pack(
@ -78,11 +47,12 @@ def write_tensor_header(
    fout.seek((fout.tell() + 31) & -32)


-if len(sys.argv) != 2:
-    print(f"Usage: python {sys.argv[0]} <path>")
+if len(sys.argv) < 2:
+    print(f"Usage: python {sys.argv[0]} <path> [arch]")
    print(
        "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
    )
+    print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
    sys.exit(1)

 input_json = os.path.join(sys.argv[1], "adapter_config.json")
@ -90,6 +60,14 @@ input_model = os.path.join(sys.argv[1], "adapter_model.bin")
 output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

 model = torch.load(input_model, map_location="cpu")
+arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
+
+if arch_name not in gguf.MODEL_ARCH_NAMES.values():
+    print(f"Error: unsupported architecture {arch_name}")
+    sys.exit(1)
+
+arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
+name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone

 with open(input_json, "r") as f:
    params = json.load(f)
@ -117,6 +95,7 @@ with open(output_path, "wb") as fout:

    write_file_header(fout, params)
    for k, v in model.items():
+        orig_k = k
        if k.endswith(".default.weight"):
            k = k.replace(".default.weight", ".weight")
        if k in ["llama_proj.weight", "llama_proj.bias"]:
@ -129,7 +108,32 @@ with open(output_path, "wb") as fout:
            v = v.float()

        t = v.detach().numpy()
-        tname = translate_tensor_name(k)
+
+        prefix = "base_model.model."
+        if k.startswith(prefix):
+            k = k[len(prefix) :]
+
+        lora_suffixes = (".lora_A.weight", ".lora_B.weight")
+        if k.endswith(lora_suffixes):
+            suffix = k[-len(lora_suffixes[0]):]
+            k = k[: -len(lora_suffixes[0])]
+        else:
+            print(f"Error: unrecognized tensor name {orig_k}")
+            sys.exit(1)
+
+        tname = name_map.get_name(k)
+        if tname is None:
+            print(f"Error: could not map tensor name {orig_k}")
+            print(" Note: the arch parameter must be specified if the model is not llama")
+            sys.exit(1)
+
+        if suffix == ".lora_A.weight":
+            tname += ".weight.loraA"
+        elif suffix == ".lora_B.weight":
+            tname += ".weight.loraB"
+        else:
+            assert False
+
        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
        write_tensor_header(fout, tname, t.shape, t.dtype)
        t.tofile(fout)
--- a/convert.py
+++ b/convert.py
@ -10,6 +10,7 @@ import itertools
 import json
 import math
 import mmap
+import os
 import pickle
 import re
 import signal
@ -18,15 +19,15 @@ import sys
 import time
 import zipfile
 from abc import ABCMeta, abstractmethod
+from collections import OrderedDict
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
+from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, TypeVar, cast

 import numpy as np
 from sentencepiece import SentencePieceProcessor

-import os
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@ -42,6 +43,7 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
 ARCH = gguf.MODEL_ARCH.LLAMA

 DEFAULT_CONCURRENCY = 8
+
 #
 # data types
 #
@ -62,10 +64,10 @@ class UnquantizedDataType(DataType):
    pass


-DT_F16  = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
-DT_F32  = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
-DT_I32  = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
-DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
+DT_F16  = UnquantizedDataType('F16',  dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
+DT_F32  = UnquantizedDataType('F32',  dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
+DT_I32  = UnquantizedDataType('I32',  dtype = np.dtype(np.int16),   valid_conversions = [])
+DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16),  valid_conversions = ['F32', 'F16', 'Q8_0'])


@dataclass(frozen=True)
@ -151,14 +153,16 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {

@dataclass
 class Params:
-    n_vocab:    int
-    n_embd:     int
-    n_layer:    int
-    n_ctx:      int
-    n_ff:       int
-    n_head:     int
-    n_head_kv:  int
-    f_norm_eps: float
+    n_vocab:        int
+    n_embd:         int
+    n_layer:        int
+    n_ctx:          int
+    n_ff:           int
+    n_head:         int
+    n_head_kv:      int
+    n_experts:      int | None = None
+    n_experts_used: int | None = None
+    f_norm_eps:     float | None = None

    rope_scaling_type: gguf.RopeScalingType | None = None
    f_rope_freq_base: float | None = None
@ -233,6 +237,13 @@ class Params:
            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

+        n_experts      = None
+        n_experts_used = None
+
+        if "num_local_experts" in config:
+            n_experts = config["num_local_experts"]
+            n_experts_used = config["num_experts_per_tok"]
+
        return Params(
            n_vocab           = config["vocab_size"],
            n_embd            = config["hidden_size"],
@ -241,6 +252,8 @@ class Params:
            n_ff              = config["intermediate_size"],
            n_head            = (n_head := config["num_attention_heads"]),
            n_head_kv         = config.get("num_key_value_heads", n_head),
+            n_experts         = n_experts,
+            n_experts_used    = n_experts_used,
            f_norm_eps        = config["rms_norm_eps"],
            f_rope_freq_base  = config.get("rope_theta"),
            rope_scaling_type = rope_scaling_type,
@ -255,8 +268,15 @@ class Params:
    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
        config = json.load(open(config_path))

+        n_experts      = None
+        n_experts_used = None
+        f_rope_freq_base = None
+
        # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if config.get("rope_theta") == 1000000:
+        if config.get("moe"):
+            # Mixtral
+            n_ctx = 32768
+        elif config.get("rope_theta") == 1000000:
            # CodeLlama
            n_ctx = 16384
        elif config["norm_eps"] == 1e-05:
@ -266,16 +286,27 @@ class Params:
            # LLaMA v1
            n_ctx = 2048

+        if "layers.0.feed_forward.w1.weight" in model:
+            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
+
+        if config.get("moe"):
+            n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
+            n_experts      = config["moe"]["num_experts"]
+            n_experts_used = config["moe"]["num_experts_per_tok"]
+            f_rope_freq_base = 1e6
+
        return Params(
            n_vocab          = model["tok_embeddings.weight"].shape[0],
            n_embd           = config["dim"],
            n_layer          = config["n_layers"],
            n_ctx            = n_ctx,
-            n_ff             = model["layers.0.feed_forward.w1.weight"].shape[0],
+            n_ff             = n_ff,
            n_head           = (n_head := config["n_heads"]),
            n_head_kv        = config.get("n_kv_heads", n_head),
+            n_experts        = n_experts,
+            n_experts_used   = n_experts_used,
            f_norm_eps       = config["norm_eps"],
-            f_rope_freq_base = config.get("rope_theta"),
+            f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
        )

    @staticmethod
@ -297,127 +328,138 @@ class Params:
        return params


-#
-# vocab
-#
+class VocabLoader:
+    def __init__(self, params: Params, fname_tokenizer: Path) -> None:
+        try:
+            from transformers import AutoTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "To use VocabLoader, please install the `transformers` package. "
+                "You can install it with `pip install transformers`."
+            ) from e

-class BpeVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
-        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
-        added_tokens: dict[str, int]
-        if fname_added_tokens is not None:
-            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
-            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True)
+        except ValueError:
+            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
+
+        self.added_tokens_dict: OrderedDict[str, int] = OrderedDict()
+
+        for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
+            if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
+                continue
+
+            self.added_tokens_dict[tok] = tokidx
+
+        self.unk_token_id: int = self.tokenizer.unk_token_id
+        self.specials: dict[str, int] = {
+            tok: self.tokenizer.get_vocab()[tok]
+            for tok in self.tokenizer.all_special_tokens
+        }
+        self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
+        self.vocab_size_base: int = self.tokenizer.vocab_size
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
+        self.fname_tokenizer: Path = fname_tokenizer
+
+        vocab_file = "tokenizer.model"
+        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
+        if path_candidate is not None:
+            self.spm = SentencePieceProcessor(str(path_candidate))
+            print(self.spm.vocab_size(), self.vocab_size_base)
        else:
-            # Fall back to trying to find the added tokens in tokenizer.json
-            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
-            if not tokenizer_json_file.is_file():
-                added_tokens = {}
-            else:
-                tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
-                added_tokens = dict(
-                    (item['content'], item['id'])
-                    for item in tokenizer_json.get('added_tokens', [])
-                    # Added tokens here can be duplicates of the main vocabulary.
-                    if item['content'] not in self.bpe_tokenizer)
+            self.spm = None

-        vocab_size: int = len(self.bpe_tokenizer)
-        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids      = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            expected_end_id = vocab_size + len(actual_ids) - 1
-            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        tokenizer = self.tokenizer
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()}
+        added_tokens_ids = set(self.added_tokens_dict.values())

-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list    = [text for (text, idx) in items]
-        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer      = fname_tokenizer
-        self.fname_added_tokens   = fname_added_tokens
+        for i in range(self.vocab_size_base):
+            if i in added_tokens_ids:
+                continue

-    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        tokenizer = self.bpe_tokenizer
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
+            text = reverse_vocab[i].encode("utf-8")
+            yield text, self.get_token_score(i), self.get_token_type(i)

-        for i, _ in enumerate(tokenizer):
-            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
+    def get_token_type(self, token_id: int) -> gguf.TokenType:
+        toktype = gguf.TokenType.NORMAL

-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.bpe_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
-        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
-        added_tokens: dict[str, int]
-        if fname_added_tokens is not None:
-            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
-        else:
-            added_tokens = {}
-
-        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-
-        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
-        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
-        actual_new_ids   = sorted(new_tokens.keys())
-
-        if expected_new_ids != actual_new_ids:
-            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
-
-        # Token pieces that were added to the base vocabulary.
-        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
-        self.vocab_size_base    = vocab_size
-        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer    = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
-
-    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.id_to_piece(i)
-            text: bytes = piece.encode("utf-8")
-            score: float = tokenizer.get_score(i)
-
-            toktype = gguf.TokenType.NORMAL
-            if tokenizer.is_unknown(i):
+        if self.spm is not None and token_id < self.spm.vocab_size():
+            if self.spm.is_unknown(token_id):
                toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.is_control(i):
+            if self.spm.is_control(token_id):
+                toktype = gguf.TokenType.CONTROL
+            if self.spm.is_unused(token_id):
+                toktype = gguf.TokenType.UNUSED
+            if self.spm.is_byte(token_id):
+                toktype = gguf.TokenType.BYTE
+        else:
+            if token_id == self.unk_token_id:
+                toktype = gguf.TokenType.UNKNOWN
+            if token_id in self.special_ids:
                toktype = gguf.TokenType.CONTROL

-            # NOTE: I think added_tokens are user defined.
-            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
+        return toktype

-            if tokenizer.is_unused(i):
-                toktype = gguf.TokenType.UNUSED
-            if tokenizer.is_byte(i):
-                toktype = gguf.TokenType.BYTE
-
-            yield text, score, toktype
+    def get_token_score(self, token_id: int) -> float:
+        if self.spm is not None and token_id < self.spm.vocab_size():
+            return cast(float, self.spm.get_score(token_id))
+        return 0.0

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+
+        for text in self.added_tokens_dict:
+            if text in self.specials:
+
+                toktype = self.get_token_type(self.specials[text])
+                score = self.get_token_score(self.specials[text])
+
+            else:
+                toktype = gguf.TokenType.USER_DEFINED
+                score = -1000.0
+
+            yield text.encode("utf-8"), score, toktype
+
+    def has_newline_token(self) -> bool:
+        return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.sentencepiece_tokens()
+        yield from self.hf_tokens()
        yield from self.added_tokens()

+    def get_vocab_type(self) -> str:
+        path_candidates = []
+        vocab_file = "tokenizer.model"
+        path_candidates.append(vocab_file)
+        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
+        if path_candidate is not None:
+            return "llama"
+
+        vocab_file = "vocab.json"
+        path_candidates.append(vocab_file)
+        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
+        if path_candidate is not None:
+            return "gpt2"
+
+        vocab_file = "tokenizer.json"
+        path_candidates.append(vocab_file)
+        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
+        if path_candidate:
+            if not self.has_newline_token():
+                return "gpt2"
+            return "llama"
+
+        raise FileNotFoundError(
+            f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; "
+            "if it's in another directory, pass the directory as --vocab-dir"
+        )
+
    def __repr__(self) -> str:
-        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+        return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_dict)} added tokens>"


-Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
+Vocab: TypeAlias = 'VocabLoader'
+

 #
 # data loading
@ -585,7 +627,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:

    if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
        # Transformers models put different tensors in different files, but
-        # don't split indivdual tensors between files.
+        # don't split individual tensors between files.
        model: LazyModel = {}
        for mp in models_plus:
            model.update(mp.model)
@ -678,7 +720,7 @@ class LazyUnpickler(pickle.Unpickler):
        return func(*args)

    CLASSES: dict[tuple[str, str], Any] = {
-        # getattr used here as a workaround for mypy not being smart enough to detrmine
+        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
@ -794,20 +836,27 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
            yield result


-def check_vocab_size(params: Params, vocab: Vocab) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
    if params.n_vocab != vocab.vocab_size:
-        assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
-        if params.n_vocab == vocab.vocab_size_base:
+        if params.n_vocab == vocab.vocab_size:
            print("Ignoring added_tokens.json since model matches vocab size without it.")
-            vocab.added_tokens_list = []
-            vocab.vocab_size = vocab.vocab_size_base
+            vocab.added_tokens_dict = OrderedDict()
+            vocab.vocab_size = vocab.vocab_size
+            return
+
+        if pad_vocab and params.n_vocab > vocab.vocab_size:
+            pad_count = params.n_vocab - vocab.vocab_size
+            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
+            for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
+                vocab.added_tokens_dict[f'<dummy{i:05}>'] = -1
+            vocab.vocab_size = params.n_vocab
            return
        msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
-        if vocab.fname_added_tokens is not None:
-            msg += f" combined with {vocab.fname_added_tokens}"
        msg += f" has {vocab.vocab_size})."
-        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
+        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
            msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        if vocab.vocab_size < params.n_vocab:
+            msg += " Possibly try using the --padvocab option."
        raise Exception(msg)


@ -832,7 +881,17 @@ class OutputFile:
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
        self.gguf.add_head_count          (params.n_head)
        self.gguf.add_head_count_kv       (params.n_head_kv)
-        self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)
+
+        if params.n_experts:
+            self.gguf.add_expert_count(params.n_experts)
+
+        if params.n_experts_used:
+            self.gguf.add_expert_used_count(params.n_experts_used)
+
+        if params.f_norm_eps:
+            self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
+        else:
+            raise ValueError('f_norm_eps is None')

        if params.f_rope_freq_base is not None:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)
@ -861,12 +920,8 @@ class OutputFile:
            scores.append(score)
            toktypes.append(toktype)

-        if isinstance(vocab, SentencePieceVocab):
-            self.gguf.add_tokenizer_model("llama")
-        elif isinstance(vocab, BpeVocab):
-            self.gguf.add_tokenizer_model("gpt2")
-        else:
-            raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
+        vocab_type = vocab.get_vocab_type()
+        self.gguf.add_tokenizer_model(vocab_type)
        self.gguf.add_token_list(tokens)
        self.gguf.add_token_scores(scores)
        self.gguf.add_token_types(toktypes)
@ -892,8 +947,12 @@ class OutputFile:
        self.gguf.close()

    @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_vocab_only(
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

@ -920,8 +979,13 @@ class OutputFile:
        return dt.quantize(arr)

    @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_all(
+        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

@ -956,7 +1020,7 @@ class OutputFile:


 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
-    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type
+    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type

    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
        return GGMLFileType.AllF32
@ -1079,35 +1143,17 @@ def load_some_model(path: Path) -> ModelPlus:
    return model_plus


-def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
-    # Be extra-friendly and accept either a file or a directory.  Also, if it's
-    # a directory, it might be the model directory, and tokenizer.model might
-    # be in the parent of that.
-    if path.is_dir():
-        vocab_file = "tokenizer.model"
-        if vocabtype == 'bpe':
-            vocab_file = "vocab.json"
-        path2 = path / vocab_file
-        # Use `.parent` instead of /.. to handle the symlink case better.
-        path3 = path.parent / vocab_file
-        if path2.exists():
-            path = path2
-        elif path3.exists():
-            path = path3
-        else:
-            raise FileNotFoundError(
-                f"Could not find {vocab_file} in {path} or its parent; "
-                "if it's in another directory, pass the directory as --vocab-dir")
+def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
+    path2 = path / vocab_file
+    # Use `.parent` instead of /.. to handle the symlink case better.
+    path3 = path.parent / vocab_file

-    print(f"Loading vocab file '{path}', type '{vocabtype}'")
+    if path2.exists():
+        return path2
+    if path3.exists():
+        return path3

-    added_tokens_path = path.parent / "added_tokens.json"
-    if vocabtype == "bpe":
-        return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
-    elif vocabtype == "spm":
-        return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
-    else:
-        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+    return None


 def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
@ -1145,11 +1191,11 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--outtype",     choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
    parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
-    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin, *.safetensors)")
-    parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
+    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
    parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
    parser.add_argument("--bigendian",   action="store_true",    help="model is executed on big endian machine")
+    parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")

    args = parser.parse_args(args_in)
    if args.dump_single:
@ -1192,12 +1238,13 @@ def main(args_in: list[str] | None = None) -> None:
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        # FIXME: Try to respect vocab_dir somehow?
-        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
+        vocab = VocabLoader(params, args.vocab_dir or args.model)
        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
-                                          load_merges = args.vocabtype == 'bpe',
+                                          load_merges = True,
                                          n_vocab = vocab.vocab_size)
        outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+                                    endianess = endianess, pad_vocab = args.padvocab)
        print(f"Wrote {outfile}")
        return

@ -1205,12 +1252,15 @@ def main(args_in: list[str] | None = None) -> None:
        vocab = model_plus.vocab
    else:
        vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-        vocab = load_vocab(vocab_dir, args.vocabtype)
+        vocab = VocabLoader(params, vocab_dir)
+
    # FIXME: Try to respect vocab_dir somehow?
+    print(f"Vocab info: {vocab}")
    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
-                                      load_merges = args.vocabtype == 'bpe',
+                                      load_merges = True,
                                      n_vocab = vocab.vocab_size)

+    print(f"Special vocab info: {special_vocab}")
    model   = model_plus.model
    model   = convert_model_names(model, params)
    ftype   = pick_output_type(model, args.outtype)
@ -1220,7 +1270,8 @@ def main(args_in: list[str] | None = None) -> None:
    params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")

-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
    print(f"Wrote {outfile}")


--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@ -1258,9 +1258,9 @@ static struct ggml_tensor * forward_lora(
 }

 static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
-    assert(logits->n_dims == 2);
-    assert(probs->n_dims == 2);
-    assert(best_samples->n_dims == 1);
+    assert(ggml_is_matrix(logits));
+    assert(ggml_is_matrix(probs));
+    assert(ggml_is_vector(best_samples));
    assert(logits->ne[1] == best_samples->ne[0]);
    assert(logits->ne[0] == probs->ne[0]);
    assert(logits->ne[1] == probs->ne[1]);
@ -1292,9 +1292,9 @@ static void sample_softmax_batch(
    struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
    struct ggml_tensor * best_samples
 ) {
-    GGML_ASSERT(best_samples->n_dims == 2);
-    GGML_ASSERT(logits->n_dims == 3);
-    GGML_ASSERT(probs->n_dims == 3);
+    GGML_ASSERT(ggml_is_matrix(best_samples));
+    GGML_ASSERT(ggml_is_3d(logits));
+    GGML_ASSERT(ggml_is_3d(probs));
    int n_tokens = best_samples->ne[0];
    int n_batch  = best_samples->ne[1];
    int n_vocab  = logits->ne[0];
@ -1334,7 +1334,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }

 static void print_matrix(struct ggml_tensor * probs) {
-    assert(probs->n_dims == 2);
+    assert(ggml_is_matrix(probs));
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
@ -1386,8 +1386,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu
 static void get_example_targets_batch(
    struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
 ) {
-    GGML_ASSERT(tokens_input->n_dims == 2);
-    GGML_ASSERT(     targets->n_dims == 3);
+    GGML_ASSERT(ggml_is_matrix(tokens_input));
+    GGML_ASSERT(ggml_is_3d(targets));
    int n_tokens = tokens_input->ne[0];
    int n_batch  = tokens_input->ne[1];
    GGML_ASSERT(n_tokens == targets->ne[1]);
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
 llama_print_timings(context)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-    let n_tokens = text.count + (add_bos ? 1 : 0)
+    let utf8Count = text.utf8.count
+    let n_tokens = utf8Count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@ -129,13 +129,13 @@ int main(int argc, char ** argv)  {
    const ggml_type qtype = GGML_TYPE_Q4_1;

    size_t ctx_size = 0;
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
-    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
+    ctx_size += ggml_row_size(qtype,         sizex*sizey);
+    ctx_size += ggml_row_size(qtype,         sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
    ctx_size += 1024*1024*16;

    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }

 static void print_matrix(struct ggml_tensor * probs) {
-    assert(probs->n_dims == 2);
+    assert(ggml_is_matrix(probs));
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab

 static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
    int ct;
-    switch (gg_weights->n_dims){
+    switch (ggml_n_dims(gg_weights)) {
        case 1:
            ct = 0;
            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@ -1110,7 +1110,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor,
        name = ggml_get_name(tensor);
    }
    uint32_t name_len = strlen(name);
-    uint32_t nd = tensor->n_dims;
+    uint32_t nd = ggml_n_dims(tensor);
    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
                       (uint32_t)tensor->ne[1],
                       (uint32_t)tensor->ne[2],
@ -1620,8 +1620,6 @@ int main(int argc, char ** argv) {
    opt->params.adam.gclip              = params.common.adam_gclip;
    opt->params.adam.eps_f              = params.common.adam_eps_f;

-    ggml_allocr * alloc = NULL;
-
    printf("%s: init model\n", __func__);
    bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);

@ -1725,10 +1723,9 @@ int main(int argc, char ** argv) {

    // allocate input tensors
    mem_input_data.resize(max_input_size);
-    alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
-    ggml_allocr_alloc(alloc, tokens_input);
-    ggml_allocr_alloc(alloc, target_probs);
-    ggml_allocr_free(alloc);
+    ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
+    ggml_allocr_alloc(alloc_inps, tokens_input);
+    ggml_allocr_alloc(alloc_inps, target_probs);

    // context for compute tensors without their data
    const size_t estimated_compute_size_wo_data = (
@ -1755,7 +1752,7 @@ int main(int argc, char ** argv) {
    // find best evaluation order
    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
        ctx_compute = ggml_init(ctx_compute_params);
-        alloc = ggml_allocr_new_measure(tensor_alignment);
+        ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);
        gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
        gf->order = (enum ggml_cgraph_eval_order) order;
        gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1788,7 +1785,7 @@ int main(int argc, char ** argv) {
    // allocate compute tensors
    mem_compute_data.resize(max_compute_size);
    ctx_compute = ggml_init(ctx_compute_params);
-    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
+    ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
    gf->order = best_order;
    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1804,6 +1801,8 @@ int main(int argc, char ** argv) {
        params.common.use_checkpointing
    );
    ggml_allocr_free(alloc);
+    ggml_allocr_free(alloc_inps);
+

    // tokenize data
    std::vector<llama_token> train_tokens;
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@ -195,7 +195,7 @@ static bool gguf_ex_read_1(const std::string & fname) {

            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

-            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);

            // print first 10 elements
            const float * data = (const float *) cur->data;
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -53,6 +53,13 @@ static std::vector<T> split(const std::string & str, char delim) {
    return values;
 }

+template<typename T, typename F>
+static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
+    std::vector<std::string> str_values;
+    std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
+    return str_values;
+}
+
 template<typename T>
 static T avg(const std::vector<T> & v) {
    if (v.empty()) {
@ -126,7 +133,8 @@ struct cmd_params {
    std::vector<int> n_prompt;
    std::vector<int> n_gen;
    std::vector<int> n_batch;
-    std::vector<bool> f32_kv;
+    std::vector<ggml_type> type_k;
+    std::vector<ggml_type> type_v;
    std::vector<int> n_threads;
    std::vector<int> n_gpu_layers;
    std::vector<int> main_gpu;
@ -142,7 +150,8 @@ static const cmd_params cmd_params_defaults = {
    /* n_prompt      */ {512},
    /* n_gen         */ {128},
    /* n_batch       */ {512},
-    /* f32_kv        */ {false},
+    /* type_k        */ {GGML_TYPE_F16},
+    /* type_v        */ {GGML_TYPE_F16},
    /* n_threads     */ {get_num_physical_cores()},
    /* n_gpu_layers  */ {99},
    /* main_gpu      */ {0},
@ -162,7 +171,8 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -p, --n-prompt <n>                (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
    printf("  -n, --n-gen <n>                   (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
    printf("  -b, --batch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  --memory-f32 <0|1>                (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+    printf("  -ctk <t>, --cache-type-k <t>      (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf("  -ctv <t>, --cache-type-v <t>      (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
    printf("  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>          (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf("  -mg, --main-gpu <i>               (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
@ -173,9 +183,32 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -v, --verbose                     (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
-
 }

+static ggml_type ggml_type_from_name(const std::string & s) {
+    if (s == "f16") {
+        return GGML_TYPE_F16;
+    }
+    if (s == "q8_0") {
+        return GGML_TYPE_Q8_0;
+    }
+    if (s == "q4_0") {
+        return GGML_TYPE_Q4_0;
+    }
+    if (s == "q4_1") {
+        return GGML_TYPE_Q4_1;
+    }
+    if (s == "q5_0") {
+        return GGML_TYPE_Q5_0;
+    }
+    if (s == "q5_1") {
+        return GGML_TYPE_Q5_1;
+    }
+
+    return GGML_TYPE_COUNT;
+}
+
+
 static cmd_params parse_cmd_params(int argc, char ** argv) {
    cmd_params params;
    std::string arg;
@ -224,13 +257,38 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "--memory-f32") {
+        } else if (arg == "-ctk" || arg == "--cache-type-k") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<int>(argv[i], split_delim);
-            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
+            auto p = split<std::string>(argv[i], split_delim);
+            std::vector<ggml_type> types;
+            for (const auto & t : p) {
+                ggml_type gt = ggml_type_from_name(t);
+                if (gt == GGML_TYPE_COUNT) {
+                    invalid_param = true;
+                    break;
+                }
+                types.push_back(gt);
+            }
+            params.type_k.insert(params.type_k.end(), types.begin(), types.end());
+        } else if (arg == "-ctv" || arg == "--cache-type-v") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<std::string>(argv[i], split_delim);
+            std::vector<ggml_type> types;
+            for (const auto & t : p) {
+                ggml_type gt = ggml_type_from_name(t);
+                if (gt == GGML_TYPE_COUNT) {
+                    invalid_param = true;
+                    break;
+                }
+                types.push_back(gt);
+            }
+            params.type_v.insert(params.type_v.end(), types.begin(), types.end());
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
@ -321,7 +379,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
    if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
    if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.f32_kv.empty())       { params.f32_kv = cmd_params_defaults.f32_kv; }
+    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
+    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
@ -336,7 +395,8 @@ struct cmd_params_instance {
    int n_prompt;
    int n_gen;
    int n_batch;
-    bool f32_kv;
+    ggml_type type_k;
+    ggml_type type_v;
    int n_threads;
    int n_gpu_layers;
    int main_gpu;
@ -365,7 +425,8 @@ struct cmd_params_instance {

        cparams.n_ctx = n_prompt + n_gen;
        cparams.n_batch = n_batch;
-        cparams.f16_kv = !f32_kv;
+        cparams.type_k = type_k;
+        cparams.type_v = type_v;
        cparams.mul_mat_q = mul_mat_q;

        return cparams;
@ -380,7 +441,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
+    for (const auto & tk : params.type_k)
+    for (const auto & tv : params.type_v)
    for (const auto & mmq : params.mul_mat_q)
    for (const auto & nt : params.n_threads) {
        cmd_params_instance instance = {
@ -388,7 +450,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
            /* .n_prompt     = */ n_prompt,
            /* .n_gen        = */ n_gen,
            /* .n_batch      = */ nb,
-            /* .f32_kv       = */ fk,
+            /* .type_k       = */ tk,
+            /* .type_v       = */ tv,
            /* .n_threads    = */ nt,
            /* .n_gpu_layers = */ nl,
            /* .main_gpu     = */ mg,
@ -410,7 +473,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & nb : params.n_batch)
-    for (const auto & fk : params.f32_kv)
+    for (const auto & tk : params.type_k)
+    for (const auto & tv : params.type_v)
    for (const auto & mmq : params.mul_mat_q)
    for (const auto & nt : params.n_threads) {
        for (const auto & n_prompt : params.n_prompt) {
@ -422,7 +486,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_prompt     = */ n_prompt,
                /* .n_gen        = */ 0,
                /* .n_batch      = */ nb,
-                /* .f32_kv       = */ fk,
+                /* .type_k       = */ tk,
+                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .main_gpu     = */ mg,
@ -441,7 +506,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_prompt     = */ 0,
                /* .n_gen        = */ n_gen,
                /* .n_batch      = */ nb,
-                /* .f32_kv       = */ fk,
+                /* .type_k       = */ tk,
+                /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .main_gpu     = */ mg,
@ -489,7 +555,8 @@ struct test {
    uint64_t model_n_params;
    int n_batch;
    int n_threads;
-    bool f32_kv;
+    ggml_type type_k;
+    ggml_type type_v;
    int n_gpu_layers;
    int main_gpu;
    bool mul_mat_q;
@ -508,7 +575,8 @@ struct test {
        model_n_params = llama_model_n_params(lmodel);
        n_batch = inst.n_batch;
        n_threads = inst.n_threads;
-        f32_kv = inst.f32_kv;
+        type_k = inst.type_k;
+        type_v = inst.type_v;
        n_gpu_layers = inst.n_gpu_layers;
        main_gpu = inst.main_gpu;
        mul_mat_q = inst.mul_mat_q;
@ -571,7 +639,7 @@ struct test {
            "cuda", "opencl", "metal", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
-            "n_batch", "n_threads", "f16_kv",
+            "n_batch", "n_threads", "type_k", "type_v",
            "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
@ -621,7 +689,7 @@ struct test {
            std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
@ -805,8 +873,11 @@ struct markdown_printer : public printer {
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
            fields.push_back("n_batch");
        }
-        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
-            fields.push_back("f16_kv");
+        if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
+            fields.push_back("type_k");
+        }
+        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
+            fields.push_back("type_v");
        }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
            fields.push_back("main_gpu");
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@ -6,14 +6,34 @@ enum LlamaError: Error {
    case couldNotInitializeContext
 }

+func llama_batch_clear(_ batch: inout llama_batch) {
+    batch.n_tokens = 0
+}
+
+func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], _ logits: Bool) {
+    batch.token   [Int(batch.n_tokens)] = id
+    batch.pos     [Int(batch.n_tokens)] = pos
+    batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count)
+    for i in 0..<seq_ids.count {
+        batch.seq_id[Int(batch.n_tokens)]![Int(i)] = seq_ids[i]
+    }
+    batch.logits  [Int(batch.n_tokens)] = logits ? 1 : 0
+
+    batch.n_tokens += 1
+}
+
 actor LlamaContext {
    private var model: OpaquePointer
    private var context: OpaquePointer
    private var batch: llama_batch
    private var tokens_list: [llama_token]

-    var n_len: Int32 = 512
+    /// This variable is used to store temporarily invalid cchars
+    private var temporary_invalid_cchars: [CChar]
+
+    var n_len: Int32 = 64
    var n_cur: Int32 = 0
+
    var n_decode: Int32 = 0

    init(model: OpaquePointer, context: OpaquePointer) {
@ -21,28 +41,38 @@ actor LlamaContext {
        self.context = context
        self.tokens_list = []
        self.batch = llama_batch_init(512, 0, 1)
+        self.temporary_invalid_cchars = []
    }

    deinit {
+        llama_batch_free(batch)
        llama_free(context)
        llama_free_model(model)
        llama_backend_free()
    }

-    static func createContext(path: String) throws -> LlamaContext {
+    static func create_context(path: String) throws -> LlamaContext {
        llama_backend_init(false)
-        let model_params = llama_model_default_params()
+        var model_params = llama_model_default_params()

+#if targetEnvironment(simulator)
+        model_params.n_gpu_layers = 0
+        print("Running on simulator, force use n_gpu_layers = 0")
+#endif
        let model = llama_load_model_from_file(path, model_params)
        guard let model else {
            print("Could not load model at \(path)")
            throw LlamaError.couldNotInitializeContext
        }
+
+        let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
+        print("Using \(n_threads) threads")
+
        var ctx_params = llama_context_default_params()
-        ctx_params.seed = 1234
+        ctx_params.seed  = 1234
        ctx_params.n_ctx = 2048
-        ctx_params.n_threads = 8
-        ctx_params.n_threads_batch = 8
+        ctx_params.n_threads       = UInt32(n_threads)
+        ctx_params.n_threads_batch = UInt32(n_threads)

        let context = llama_new_context_with_model(model, ctx_params)
        guard let context else {
@ -53,6 +83,26 @@ actor LlamaContext {
        return LlamaContext(model: model, context: context)
    }

+    func model_info() -> String {
+        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 256)
+        result.initialize(repeating: Int8(0), count: 256)
+        defer {
+            result.deallocate()
+        }
+
+        // TODO: this is probably very stupid way to get the string from C
+
+        let nChars = llama_model_desc(model, result, 256)
+        let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nChars))
+
+        var SwiftString = ""
+        for char in bufferPointer {
+            SwiftString.append(Character(UnicodeScalar(UInt8(char))))
+        }
+
+        return SwiftString
+    }
+
    func get_n_tokens() -> Int32 {
        return batch.n_tokens;
    }
@ -61,6 +111,7 @@ actor LlamaContext {
        print("attempting to complete \"\(text)\"")

        tokens_list = tokenize(text: text, add_bos: true)
+        temporary_invalid_cchars = []

        let n_ctx = llama_n_ctx(context)
        let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
@ -72,19 +123,14 @@ actor LlamaContext {
        }

        for id in tokens_list {
-            print(token_to_piece(token: id))
+            print(String(cString: token_to_piece(token: id) + [0]))
        }

-        // batch = llama_batch_init(512, 0) // done in init()
-        batch.n_tokens = Int32(tokens_list.count)
+        llama_batch_clear(&batch)

-        for i1 in 0..<batch.n_tokens {
+        for i1 in 0..<tokens_list.count {
            let i = Int(i1)
-            batch.token[i] = tokens_list[i]
-            batch.pos[i] = i1
-            batch.n_seq_id[Int(i)] = 1
-            batch.seq_id[Int(i)]![0] = 0
-            batch.logits[i] = 0
+            llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
        }
        batch.logits[Int(batch.n_tokens) - 1] = 1 // true

@ -115,25 +161,33 @@ actor LlamaContext {

        if new_token_id == llama_token_eos(context) || n_cur == n_len {
            print("\n")
-            return ""
+            let new_token_str = String(cString: temporary_invalid_cchars + [0])
+            temporary_invalid_cchars.removeAll()
+            return new_token_str
        }

-        let new_token_str = token_to_piece(token: new_token_id)
+        let new_token_cchars = token_to_piece(token: new_token_id)
+        temporary_invalid_cchars.append(contentsOf: new_token_cchars)
+        let new_token_str: String
+        if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
+            temporary_invalid_cchars.removeAll()
+            new_token_str = string
+        } else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
+            // in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
+            let string = String(cString: temporary_invalid_cchars + [0])
+            temporary_invalid_cchars.removeAll()
+            new_token_str = string
+        } else {
+            new_token_str = ""
+        }
        print(new_token_str)
        // tokens_list.append(new_token_id)

-        batch.n_tokens = 0
-
-        batch.token[Int(batch.n_tokens)] = new_token_id
-        batch.pos[Int(batch.n_tokens)] = n_cur
-        batch.n_seq_id[Int(batch.n_tokens)] = 1
-        batch.seq_id[Int(batch.n_tokens)]![0] = 0
-        batch.logits[Int(batch.n_tokens)] = 1 // true
-        batch.n_tokens += 1
+        llama_batch_clear(&batch)
+        llama_batch_add(&batch, new_token_id, n_cur, [0], true)

        n_decode += 1
-
-        n_cur += 1
+        n_cur    += 1

        if llama_decode(context, batch) != 0 {
            print("failed to evaluate llama!")
@ -142,14 +196,113 @@ actor LlamaContext {
        return new_token_str
    }

+    func bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) -> String {
+        var pp_avg: Double = 0
+        var tg_avg: Double = 0
+
+        var pp_std: Double = 0
+        var tg_std: Double = 0
+
+        for _ in 0..<nr {
+            // bench prompt processing
+
+            llama_batch_clear(&batch)
+
+            let n_tokens = pp
+
+            for i in 0..<n_tokens {
+                llama_batch_add(&batch, 0, Int32(i), [0], false)
+            }
+            batch.logits[Int(batch.n_tokens) - 1] = 1 // true
+
+            llama_kv_cache_clear(context)
+
+            let t_pp_start = ggml_time_us()
+
+            if llama_decode(context, batch) != 0 {
+                print("llama_decode() failed during prompt")
+            }
+
+            let t_pp_end = ggml_time_us()
+
+            // bench text generation
+
+            llama_kv_cache_clear(context)
+
+            let t_tg_start = ggml_time_us()
+
+            for i in 0..<tg {
+                llama_batch_clear(&batch)
+
+                for j in 0..<pl {
+                    llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
+                }
+
+                if llama_decode(context, batch) != 0 {
+                    print("llama_decode() failed during text generation")
+                }
+            }
+
+            let t_tg_end = ggml_time_us()
+
+            llama_kv_cache_clear(context)
+
+            let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
+            let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
+
+            let speed_pp = Double(pp)    / t_pp
+            let speed_tg = Double(pl*tg) / t_tg
+
+            pp_avg += speed_pp
+            tg_avg += speed_tg
+
+            pp_std += speed_pp * speed_pp
+            tg_std += speed_tg * speed_tg
+
+            print("pp \(speed_pp) t/s, tg \(speed_tg) t/s")
+        }
+
+        pp_avg /= Double(nr)
+        tg_avg /= Double(nr)
+
+        if nr > 1 {
+            pp_std = sqrt(pp_std / Double(nr - 1) - pp_avg * pp_avg * Double(nr) / Double(nr - 1))
+            tg_std = sqrt(tg_std / Double(nr - 1) - tg_avg * tg_avg * Double(nr) / Double(nr - 1))
+        } else {
+            pp_std = 0
+            tg_std = 0
+        }
+
+        let model_desc     = model_info();
+        let model_size     = String(format: "%.2f GiB", Double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0);
+        let model_n_params = String(format: "%.2f B", Double(llama_model_n_params(model)) / 1e9);
+        let backend        = "Metal";
+        let pp_avg_str     = String(format: "%.2f", pp_avg);
+        let tg_avg_str     = String(format: "%.2f", tg_avg);
+        let pp_std_str     = String(format: "%.2f", pp_std);
+        let tg_std_str     = String(format: "%.2f", tg_std);
+
+        var result = ""
+
+        result += String("| model | size | params | backend | test | t/s |\n")
+        result += String("| --- | --- | --- | --- | --- | --- |\n")
+        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | pp \(pp) | \(pp_avg_str) ± \(pp_std_str) |\n")
+        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | tg \(tg) | \(tg_avg_str) ± \(tg_std_str) |\n")
+
+        return result;
+    }
+
    func clear() {
        tokens_list.removeAll()
+        temporary_invalid_cchars.removeAll()
+        llama_kv_cache_clear(context)
    }

    private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-        let n_tokens = text.count + (add_bos ? 1 : 0)
+        let utf8Count = text.utf8.count
+        let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
        let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-        let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false)
+        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)

        var swiftTokens: [llama_token] = []
        for i in 0..<tokenCount {
@ -161,7 +314,8 @@ actor LlamaContext {
        return swiftTokens
    }

-    private func token_to_piece(token: llama_token) -> String {
+    /// - note: The result does not contain null-terminator
+    private func token_to_piece(token: llama_token) -> [CChar] {
        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
        result.initialize(repeating: Int8(0), count: 8)
        defer {
@ -175,10 +329,12 @@ actor LlamaContext {
            defer {
                newResult.deallocate()
            }
-            _ = llama_token_to_piece(model, token, newResult, -nTokens)
-            return String(cString: newResult)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
+            return Array(bufferPointer)
        } else {
-            return String(cString: result)
+            let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
+            return Array(bufferPointer)
        }
    }
 }
--- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
+++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
@ -1,481 +1,483 @@
 // !$*UTF8*$!
 {
-    archiveVersion = 1;
-    classes = {
-    };
-    objectVersion = 56;
-    objects = {
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 56;
+	objects = {

 /* Begin PBXBuildFile section */
-        542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; };
-        5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; };
-        542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
-        542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
-        542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; };
-        542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
-        549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
-        549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
-        8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
-        8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
-        8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
-        8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
-        8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
-        8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
-        8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
-        8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
+		542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
+		5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
+		542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
+		542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
+		542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
+		542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
+		549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
+		549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
+		7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
+		8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
+		8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
+		8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
+		8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
+		8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
+		8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
+		8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
+		8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
 /* End PBXBuildFile section */

 /* Begin PBXFileReference section */
-        542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
-        542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
-        542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
-        5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
-        542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
-        542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
-        542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
-        542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
-        542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
-        542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
-        549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
-        549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
-        549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
-        549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
-        8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
-        8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
-        8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
-        8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
-        8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
-        8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
-        8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
-        8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = "<group>"; };
-        8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
-        8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
-        8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
+		542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
+		542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
+		542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
+		5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
+		542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
+		542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
+		542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
+		542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
+		542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
+		542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
+		549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
+		549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
+		549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
+		549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
+		7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
+		8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
+		8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
+		8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
+		8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
+		8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
+		8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
+		8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
 /* End PBXFileReference section */

 /* Begin PBXFrameworksBuildPhase section */
-        8A1C83702AC328BD0096AF73 /* Frameworks */ = {
-            isa = PBXFrameworksBuildPhase;
-            buildActionMask = 2147483647;
-            files = (
-                549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
-                8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
-            );
-            runOnlyForDeploymentPostprocessing = 0;
-        };
+		8A1C83702AC328BD0096AF73 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
+				8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
 /* End PBXFrameworksBuildPhase section */

 /* Begin PBXGroup section */
-        8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
-            isa = PBXGroup;
-            children = (
-                5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
-                542376092B0D9C40008E6A1C /* ggml-backend.h */,
-                542376062B0D9BEA008E6A1C /* ggml-quants.h */,
-                542376072B0D9BFB008E6A1C /* ggml-quants.c */,
-                549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
-                549479C62AC9E0F200E0F78B /* ggml-metal.h */,
-                549479C52AC9E0F200E0F78B /* ggml-metal.m */,
-                542EA09B2AC8723900A8AEE9 /* ggml.c */,
-                542EA09C2AC8723900A8AEE9 /* ggml.h */,
-                542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
-                542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
-                542EA0A12AC8729100A8AEE9 /* llama.cpp */,
-                542EA0A22AC8729100A8AEE9 /* llama.h */,
-            );
-            name = llama.cpp;
-            sourceTree = "<group>";
-        };
-        8A1C836A2AC328BD0096AF73 = {
-            isa = PBXGroup;
-            children = (
-                8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
-                8A907F312AC7134E006146EA /* llama.cpp.swift */,
-                8A3F84232AC4C891005E2EE8 /* models */,
-                8A1C83752AC328BD0096AF73 /* llama.swiftui */,
-                8A1C83742AC328BD0096AF73 /* Products */,
-                8A39BE082AC7601000BFEB40 /* Frameworks */,
-            );
-            sourceTree = "<group>";
-        };
-        8A1C83742AC328BD0096AF73 /* Products */ = {
-            isa = PBXGroup;
-            children = (
-                8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
-            );
-            name = Products;
-            sourceTree = "<group>";
-        };
-        8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
-            isa = PBXGroup;
-            children = (
-                8A3F84102AC4BD85005E2EE8 /* Resources */,
-                8A9F7C4B2AC332DC008AE1EA /* Models */,
-                8A9F7C4A2AC332BF008AE1EA /* UI */,
-                8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
-                8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
-                8A1C837C2AC328BE0096AF73 /* Preview Content */,
-            );
-            path = llama.swiftui;
-            sourceTree = "<group>";
-        };
-        8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
-            isa = PBXGroup;
-            children = (
-                8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
-            );
-            path = "Preview Content";
-            sourceTree = "<group>";
-        };
-        8A39BE082AC7601000BFEB40 /* Frameworks */ = {
-            isa = PBXGroup;
-            children = (
-                549479CA2AC9E16000E0F78B /* Metal.framework */,
-                8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
-            );
-            name = Frameworks;
-            sourceTree = "<group>";
-        };
-        8A3F84102AC4BD85005E2EE8 /* Resources */ = {
-            isa = PBXGroup;
-            children = (
-                8A3F84112AC4BD8C005E2EE8 /* models */,
-            );
-            path = Resources;
-            sourceTree = "<group>";
-        };
-        8A3F84112AC4BD8C005E2EE8 /* models */ = {
-            isa = PBXGroup;
-            children = (
-                8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */,
-            );
-            path = models;
-            sourceTree = "<group>";
-        };
-        8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
-            isa = PBXGroup;
-            children = (
-                8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
-                8A907F322AC7134E006146EA /* LibLlama.swift */,
-            );
-            path = llama.cpp.swift;
-            sourceTree = "<group>";
-        };
-        8A9F7C4A2AC332BF008AE1EA /* UI */ = {
-            isa = PBXGroup;
-            children = (
-                8A1C83782AC328BD0096AF73 /* ContentView.swift */,
-            );
-            path = UI;
-            sourceTree = "<group>";
-        };
-        8A9F7C4B2AC332DC008AE1EA /* Models */ = {
-            isa = PBXGroup;
-            children = (
-                8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
-            );
-            path = Models;
-            sourceTree = "<group>";
-        };
+		8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
+			isa = PBXGroup;
+			children = (
+				5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
+				542376092B0D9C40008E6A1C /* ggml-backend.h */,
+				542376062B0D9BEA008E6A1C /* ggml-quants.h */,
+				542376072B0D9BFB008E6A1C /* ggml-quants.c */,
+				549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
+				549479C62AC9E0F200E0F78B /* ggml-metal.h */,
+				549479C52AC9E0F200E0F78B /* ggml-metal.m */,
+				542EA09B2AC8723900A8AEE9 /* ggml.c */,
+				542EA09C2AC8723900A8AEE9 /* ggml.h */,
+				542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
+				542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
+				542EA0A12AC8729100A8AEE9 /* llama.cpp */,
+				542EA0A22AC8729100A8AEE9 /* llama.h */,
+			);
+			name = llama.cpp;
+			sourceTree = "<group>";
+		};
+		8A1C836A2AC328BD0096AF73 = {
+			isa = PBXGroup;
+			children = (
+				8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
+				8A907F312AC7134E006146EA /* llama.cpp.swift */,
+				8A3F84232AC4C891005E2EE8 /* models */,
+				8A1C83752AC328BD0096AF73 /* llama.swiftui */,
+				8A1C83742AC328BD0096AF73 /* Products */,
+				8A39BE082AC7601000BFEB40 /* Frameworks */,
+			);
+			sourceTree = "<group>";
+		};
+		8A1C83742AC328BD0096AF73 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
+			isa = PBXGroup;
+			children = (
+				8A3F84102AC4BD85005E2EE8 /* Resources */,
+				8A9F7C4B2AC332DC008AE1EA /* Models */,
+				8A9F7C4A2AC332BF008AE1EA /* UI */,
+				8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
+				8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
+				8A1C837C2AC328BE0096AF73 /* Preview Content */,
+			);
+			path = llama.swiftui;
+			sourceTree = "<group>";
+		};
+		8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
+			isa = PBXGroup;
+			children = (
+				8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
+			);
+			path = "Preview Content";
+			sourceTree = "<group>";
+		};
+		8A39BE082AC7601000BFEB40 /* Frameworks */ = {
+			isa = PBXGroup;
+			children = (
+				549479CA2AC9E16000E0F78B /* Metal.framework */,
+				8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
+			);
+			name = Frameworks;
+			sourceTree = "<group>";
+		};
+		8A3F84102AC4BD85005E2EE8 /* Resources */ = {
+			isa = PBXGroup;
+			children = (
+				8A3F84112AC4BD8C005E2EE8 /* models */,
+			);
+			path = Resources;
+			sourceTree = "<group>";
+		};
+		8A3F84112AC4BD8C005E2EE8 /* models */ = {
+			isa = PBXGroup;
+			children = (
+			);
+			path = models;
+			sourceTree = "<group>";
+		};
+		8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
+			isa = PBXGroup;
+			children = (
+				8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
+				8A907F322AC7134E006146EA /* LibLlama.swift */,
+			);
+			path = llama.cpp.swift;
+			sourceTree = "<group>";
+		};
+		8A9F7C4A2AC332BF008AE1EA /* UI */ = {
+			isa = PBXGroup;
+			children = (
+				7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
+				8A1C83782AC328BD0096AF73 /* ContentView.swift */,
+			);
+			path = UI;
+			sourceTree = "<group>";
+		};
+		8A9F7C4B2AC332DC008AE1EA /* Models */ = {
+			isa = PBXGroup;
+			children = (
+				8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
+			);
+			path = Models;
+			sourceTree = "<group>";
+		};
 /* End PBXGroup section */

 /* Begin PBXNativeTarget section */
-        8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
-            isa = PBXNativeTarget;
-            buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
-            buildPhases = (
-                8A1C836F2AC328BD0096AF73 /* Sources */,
-                8A1C83702AC328BD0096AF73 /* Frameworks */,
-                8A1C83712AC328BD0096AF73 /* Resources */,
-            );
-            buildRules = (
-            );
-            dependencies = (
-            );
-            name = llama.swiftui;
-            packageProductDependencies = (
-            );
-            productName = llama.swiftui;
-            productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
-            productType = "com.apple.product-type.application";
-        };
+		8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
+			buildPhases = (
+				8A1C836F2AC328BD0096AF73 /* Sources */,
+				8A1C83702AC328BD0096AF73 /* Frameworks */,
+				8A1C83712AC328BD0096AF73 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = llama.swiftui;
+			packageProductDependencies = (
+			);
+			productName = llama.swiftui;
+			productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
+			productType = "com.apple.product-type.application";
+		};
 /* End PBXNativeTarget section */

 /* Begin PBXProject section */
-        8A1C836B2AC328BD0096AF73 /* Project object */ = {
-            isa = PBXProject;
-            attributes = {
-                BuildIndependentTargetsInParallel = 1;
-                LastSwiftUpdateCheck = 1500;
-                LastUpgradeCheck = 1500;
-                TargetAttributes = {
-                    8A1C83722AC328BD0096AF73 = {
-                        CreatedOnToolsVersion = 15.0;
-                        LastSwiftMigration = 1500;
-                    };
-                };
-            };
-            buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
-            compatibilityVersion = "Xcode 14.0";
-            developmentRegion = en;
-            hasScannedForEncodings = 0;
-            knownRegions = (
-                en,
-                Base,
-            );
-            mainGroup = 8A1C836A2AC328BD0096AF73;
-            packageReferences = (
-            );
-            productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
-            projectDirPath = "";
-            projectRoot = "";
-            targets = (
-                8A1C83722AC328BD0096AF73 /* llama.swiftui */,
-            );
-        };
+		8A1C836B2AC328BD0096AF73 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				BuildIndependentTargetsInParallel = 1;
+				LastSwiftUpdateCheck = 1500;
+				LastUpgradeCheck = 1500;
+				TargetAttributes = {
+					8A1C83722AC328BD0096AF73 = {
+						CreatedOnToolsVersion = 15.0;
+						LastSwiftMigration = 1500;
+					};
+				};
+			};
+			buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
+			compatibilityVersion = "Xcode 14.0";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = 8A1C836A2AC328BD0096AF73;
+			packageReferences = (
+			);
+			productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				8A1C83722AC328BD0096AF73 /* llama.swiftui */,
+			);
+		};
 /* End PBXProject section */

 /* Begin PBXResourcesBuildPhase section */
-        8A1C83712AC328BD0096AF73 /* Resources */ = {
-            isa = PBXResourcesBuildPhase;
-            buildActionMask = 2147483647;
-            files = (
-                542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
-                8A3F84242AC4C891005E2EE8 /* models in Resources */,
-                8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
-                8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
-            );
-            runOnlyForDeploymentPostprocessing = 0;
-        };
+		8A1C83712AC328BD0096AF73 /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
+				8A3F84242AC4C891005E2EE8 /* models in Resources */,
+				8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
+				8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
 /* End PBXResourcesBuildPhase section */

 /* Begin PBXSourcesBuildPhase section */
-        8A1C836F2AC328BD0096AF73 /* Sources */ = {
-            isa = PBXSourcesBuildPhase;
-            buildActionMask = 2147483647;
-            files = (
-                542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
-                549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
-                542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
-                8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
-                542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
-                8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
-                8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
-                8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
-                542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
-                5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
-            );
-            runOnlyForDeploymentPostprocessing = 0;
-        };
+		8A1C836F2AC328BD0096AF73 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
+				549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
+				542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
+				8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
+				542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
+				8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
+				8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
+				8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
+				7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */,
+				542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
+				5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
 /* End PBXSourcesBuildPhase section */

 /* Begin XCBuildConfiguration section */
-        8A1C837F2AC328BE0096AF73 /* Debug */ = {
-            isa = XCBuildConfiguration;
-            buildSettings = {
-                ALWAYS_SEARCH_USER_PATHS = NO;
-                ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
-                CLANG_ANALYZER_NONNULL = YES;
-                CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-                CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
-                CLANG_ENABLE_MODULES = YES;
-                CLANG_ENABLE_OBJC_ARC = YES;
-                CLANG_ENABLE_OBJC_WEAK = YES;
-                CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
-                CLANG_WARN_BOOL_CONVERSION = YES;
-                CLANG_WARN_COMMA = YES;
-                CLANG_WARN_CONSTANT_CONVERSION = YES;
-                CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
-                CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
-                CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
-                CLANG_WARN_EMPTY_BODY = YES;
-                CLANG_WARN_ENUM_CONVERSION = YES;
-                CLANG_WARN_INFINITE_RECURSION = YES;
-                CLANG_WARN_INT_CONVERSION = YES;
-                CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
-                CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
-                CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
-                CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
-                CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
-                CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
-                CLANG_WARN_STRICT_PROTOTYPES = YES;
-                CLANG_WARN_SUSPICIOUS_MOVE = YES;
-                CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
-                CLANG_WARN_UNREACHABLE_CODE = YES;
-                CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
-                COPY_PHASE_STRIP = NO;
-                DEBUG_INFORMATION_FORMAT = dwarf;
-                ENABLE_STRICT_OBJC_MSGSEND = YES;
-                ENABLE_TESTABILITY = YES;
-                ENABLE_USER_SCRIPT_SANDBOXING = YES;
-                GCC_C_LANGUAGE_STANDARD = gnu17;
-                GCC_DYNAMIC_NO_PIC = NO;
-                GCC_NO_COMMON_BLOCKS = YES;
-                GCC_OPTIMIZATION_LEVEL = 0;
-                GCC_PREPROCESSOR_DEFINITIONS = (
-                    "DEBUG=1",
-                    "$(inherited)",
-                );
-                GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
-                GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
-                GCC_WARN_UNDECLARED_SELECTOR = YES;
-                GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
-                GCC_WARN_UNUSED_FUNCTION = YES;
-                GCC_WARN_UNUSED_VARIABLE = YES;
-                IPHONEOS_DEPLOYMENT_TARGET = 17.0;
-                LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
-                MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
-                MTL_FAST_MATH = YES;
-                ONLY_ACTIVE_ARCH = YES;
-                SDKROOT = iphoneos;
-                SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
-                SWIFT_OPTIMIZATION_LEVEL = "-Onone";
-            };
-            name = Debug;
-        };
-        8A1C83802AC328BE0096AF73 /* Release */ = {
-            isa = XCBuildConfiguration;
-            buildSettings = {
-                ALWAYS_SEARCH_USER_PATHS = NO;
-                ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
-                CLANG_ANALYZER_NONNULL = YES;
-                CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-                CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
-                CLANG_ENABLE_MODULES = YES;
-                CLANG_ENABLE_OBJC_ARC = YES;
-                CLANG_ENABLE_OBJC_WEAK = YES;
-                CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
-                CLANG_WARN_BOOL_CONVERSION = YES;
-                CLANG_WARN_COMMA = YES;
-                CLANG_WARN_CONSTANT_CONVERSION = YES;
-                CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
-                CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
-                CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
-                CLANG_WARN_EMPTY_BODY = YES;
-                CLANG_WARN_ENUM_CONVERSION = YES;
-                CLANG_WARN_INFINITE_RECURSION = YES;
-                CLANG_WARN_INT_CONVERSION = YES;
-                CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
-                CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
-                CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
-                CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
-                CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
-                CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
-                CLANG_WARN_STRICT_PROTOTYPES = YES;
-                CLANG_WARN_SUSPICIOUS_MOVE = YES;
-                CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
-                CLANG_WARN_UNREACHABLE_CODE = YES;
-                CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
-                COPY_PHASE_STRIP = NO;
-                DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
-                ENABLE_NS_ASSERTIONS = NO;
-                ENABLE_STRICT_OBJC_MSGSEND = YES;
-                ENABLE_USER_SCRIPT_SANDBOXING = YES;
-                GCC_C_LANGUAGE_STANDARD = gnu17;
-                GCC_NO_COMMON_BLOCKS = YES;
-                GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
-                GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
-                GCC_WARN_UNDECLARED_SELECTOR = YES;
-                GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
-                GCC_WARN_UNUSED_FUNCTION = YES;
-                GCC_WARN_UNUSED_VARIABLE = YES;
-                IPHONEOS_DEPLOYMENT_TARGET = 17.0;
-                LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
-                MTL_ENABLE_DEBUG_INFO = NO;
-                MTL_FAST_MATH = YES;
-                SDKROOT = iphoneos;
-                SWIFT_COMPILATION_MODE = wholemodule;
-                VALIDATE_PRODUCT = YES;
-            };
-            name = Release;
-        };
-        8A1C83822AC328BE0096AF73 /* Debug */ = {
-            isa = XCBuildConfiguration;
-            buildSettings = {
-                ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
-                ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
-                CLANG_ENABLE_MODULES = YES;
-                CODE_SIGN_STYLE = Automatic;
-                CURRENT_PROJECT_VERSION = 1;
-                DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
-                DEVELOPMENT_TEAM = STLSG3FG8Q;
-                ENABLE_PREVIEWS = YES;
-                GENERATE_INFOPLIST_FILE = YES;
-                INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
-                INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
-                INFOPLIST_KEY_UILaunchScreen_Generation = YES;
-                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-                IPHONEOS_DEPLOYMENT_TARGET = 16.0;
-                LD_RUNPATH_SEARCH_PATHS = (
-                    "$(inherited)",
-                    "@executable_path/Frameworks",
-                );
-                MARKETING_VERSION = 1.0;
-                PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
-                PRODUCT_NAME = "$(TARGET_NAME)";
-                SWIFT_EMIT_LOC_STRINGS = YES;
-                SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
-                SWIFT_OPTIMIZATION_LEVEL = "-Onone";
-                SWIFT_VERSION = 5.0;
-                TARGETED_DEVICE_FAMILY = "1,2";
-            };
-            name = Debug;
-        };
-        8A1C83832AC328BE0096AF73 /* Release */ = {
-            isa = XCBuildConfiguration;
-            buildSettings = {
-                ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
-                ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
-                CLANG_ENABLE_MODULES = YES;
-                CODE_SIGN_STYLE = Automatic;
-                CURRENT_PROJECT_VERSION = 1;
-                DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
-                DEVELOPMENT_TEAM = STLSG3FG8Q;
-                ENABLE_PREVIEWS = YES;
-                GENERATE_INFOPLIST_FILE = YES;
-                INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
-                INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
-                INFOPLIST_KEY_UILaunchScreen_Generation = YES;
-                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-                IPHONEOS_DEPLOYMENT_TARGET = 16.0;
-                LD_RUNPATH_SEARCH_PATHS = (
-                    "$(inherited)",
-                    "@executable_path/Frameworks",
-                );
-                MARKETING_VERSION = 1.0;
-                PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
-                PRODUCT_NAME = "$(TARGET_NAME)";
-                SWIFT_EMIT_LOC_STRINGS = YES;
-                SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
-                SWIFT_VERSION = 5.0;
-                TARGETED_DEVICE_FAMILY = "1,2";
-            };
-            name = Release;
-        };
+		8A1C837F2AC328BE0096AF73 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				ENABLE_USER_SCRIPT_SANDBOXING = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu17;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 17.0;
+				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
+				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
+				MTL_FAST_MATH = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = iphoneos;
+				SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+			};
+			name = Debug;
+		};
+		8A1C83802AC328BE0096AF73 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_USER_SCRIPT_SANDBOXING = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu17;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 17.0;
+				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				MTL_FAST_MATH = YES;
+				SDKROOT = iphoneos;
+				SWIFT_COMPILATION_MODE = wholemodule;
+				VALIDATE_PRODUCT = YES;
+			};
+			name = Release;
+		};
+		8A1C83822AC328BE0096AF73 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
+				CLANG_ENABLE_MODULES = YES;
+				CODE_SIGN_STYLE = Automatic;
+				CURRENT_PROJECT_VERSION = 1;
+				DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
+				DEVELOPMENT_TEAM = STLSG3FG8Q;
+				ENABLE_PREVIEWS = YES;
+				GENERATE_INFOPLIST_FILE = YES;
+				INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
+				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
+				INFOPLIST_KEY_UILaunchScreen_Generation = YES;
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				MARKETING_VERSION = 1.0;
+				PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_EMIT_LOC_STRINGS = YES;
+				SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+				SWIFT_VERSION = 5.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Debug;
+		};
+		8A1C83832AC328BE0096AF73 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
+				CLANG_ENABLE_MODULES = YES;
+				CODE_SIGN_STYLE = Automatic;
+				CURRENT_PROJECT_VERSION = 1;
+				DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
+				DEVELOPMENT_TEAM = STLSG3FG8Q;
+				ENABLE_PREVIEWS = YES;
+				GENERATE_INFOPLIST_FILE = YES;
+				INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
+				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
+				INFOPLIST_KEY_UILaunchScreen_Generation = YES;
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				MARKETING_VERSION = 1.0;
+				PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_EMIT_LOC_STRINGS = YES;
+				SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
+				SWIFT_VERSION = 5.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Release;
+		};
 /* End XCBuildConfiguration section */

 /* Begin XCConfigurationList section */
-        8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
-            isa = XCConfigurationList;
-            buildConfigurations = (
-                8A1C837F2AC328BE0096AF73 /* Debug */,
-                8A1C83802AC328BE0096AF73 /* Release */,
-            );
-            defaultConfigurationIsVisible = 0;
-            defaultConfigurationName = Release;
-        };
-        8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
-            isa = XCConfigurationList;
-            buildConfigurations = (
-                8A1C83822AC328BE0096AF73 /* Debug */,
-                8A1C83832AC328BE0096AF73 /* Release */,
-            );
-            defaultConfigurationIsVisible = 0;
-            defaultConfigurationName = Release;
-        };
+		8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				8A1C837F2AC328BE0096AF73 /* Debug */,
+				8A1C83802AC328BE0096AF73 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				8A1C83822AC328BE0096AF73 /* Debug */,
+				8A1C83832AC328BE0096AF73 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
 /* End XCConfigurationList section */
-    };
-    rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
+	};
+	rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
 }
--- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
+++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
@ -3,24 +3,26 @@ import Foundation
@MainActor
 class LlamaState: ObservableObject {
    @Published var messageLog = ""
+    @Published var cacheCleared = false

    private var llamaContext: LlamaContext?
-    private var modelUrl: URL? {
-        Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models")
+    private var defaultModelUrl: URL? {
+        Bundle.main.url(forResource: "ggml-model", withExtension: "gguf", subdirectory: "models")
        // Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
    }
+
    init() {
        do {
-            try loadModel()
+            try loadModel(modelUrl: defaultModelUrl)
        } catch {
            messageLog += "Error!\n"
        }
    }

-    private func loadModel() throws {
+    func loadModel(modelUrl: URL?) throws {
        messageLog += "Loading model...\n"
        if let modelUrl {
-            llamaContext = try LlamaContext.createContext(path: modelUrl.path())
+            llamaContext = try LlamaContext.create_context(path: modelUrl.path())
            messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
        } else {
            messageLog += "Could not locate model\n"
@ -31,7 +33,7 @@ class LlamaState: ObservableObject {
        guard let llamaContext else {
            return
        }
-        messageLog += "Attempting to complete text...\n"
+
        await llamaContext.completion_init(text: text)
        messageLog += "\(text)"

@ -42,4 +44,42 @@ class LlamaState: ObservableObject {
        await llamaContext.clear()
        messageLog += "\n\ndone\n"
    }
+
+    func bench() async {
+        guard let llamaContext else {
+            return
+        }
+
+        messageLog += "\n"
+        messageLog += "Running benchmark...\n"
+        messageLog += "Model info: "
+        messageLog += await llamaContext.model_info() + "\n"
+
+        let t_start = DispatchTime.now().uptimeNanoseconds
+        await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
+        let t_end = DispatchTime.now().uptimeNanoseconds
+
+        let t_heat = Double(t_end - t_start) / 1_000_000_000.0
+        messageLog += "Heat up time: \(t_heat) seconds, please wait...\n"
+
+        // if more than 5 seconds, then we're probably running on a slow device
+        if t_heat > 5.0 {
+            messageLog += "Heat up time is too long, aborting benchmark\n"
+            return
+        }
+
+        let result = await llamaContext.bench(pp: 512, tg: 128, pl: 1, nr: 3)
+
+        messageLog += "\(result)"
+        messageLog += "\n"
+    }
+
+    func clear() async {
+        guard let llamaContext else {
+            return
+        }
+
+        await llamaContext.clear()
+        messageLog = ""
+    }
 }
--- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
+++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
@ -5,24 +5,132 @@ struct ContentView: View {

    @State private var multiLineText = ""

+    private static func cleanupModelCaches() {
+        // Delete all models (*.gguf)
+        let fileManager = FileManager.default
+        let documentsUrl =  FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
+        do {
+            let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil)
+            for fileURL in fileURLs {
+                if fileURL.pathExtension == "gguf" {
+                    try fileManager.removeItem(at: fileURL)
+                }
+            }
+        } catch {
+            print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)")
+        }
+    }
+
    var body: some View {
        VStack {
-            ScrollView(.vertical) {
+            ScrollView(.vertical, showsIndicators: true) {
                Text(llamaState.messageLog)
+                .font(.system(size: 12))
+                .frame(maxWidth: .infinity, alignment: .leading)
+                .padding()
+                .onTapGesture {
+                    UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
+                }
            }

            TextEditor(text: $multiLineText)
-                .frame(height: 200)
+                .frame(height: 80)
                .padding()
                .border(Color.gray, width: 0.5)
-            Button(action: {
-                sendText()
-            }) {
-                Text("Send")
-                    .padding()
-                    .background(Color.blue)
-                    .foregroundColor(.white)
-                    .cornerRadius(8)
+
+            HStack {
+                Button("Send") {
+                    sendText()
+                }
+                .padding(8)
+                .background(Color.blue)
+                .foregroundColor(.white)
+                .cornerRadius(8)
+
+                Button("Bench") {
+                    bench()
+                }
+                .padding(8)
+                .background(Color.blue)
+                .foregroundColor(.white)
+                .cornerRadius(8)
+
+                Button("Clear") {
+                    clear()
+                }
+                .padding(8)
+                .background(Color.blue)
+                .foregroundColor(.white)
+                .cornerRadius(8)
+
+                Button("Copy") {
+                    UIPasteboard.general.string = llamaState.messageLog
+                }
+                .padding(8)
+                .background(Color.blue)
+                .foregroundColor(.white)
+                .cornerRadius(8)
+            }
+
+            VStack {
+                DownloadButton(
+                    llamaState: llamaState,
+                    modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",
+                    modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
+                    filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
+                )
+                .font(.system(size: 12))
+                .padding(.top, 4)
+                .frame(maxWidth: .infinity, alignment: .leading)
+
+                DownloadButton(
+                    llamaState: llamaState,
+                    modelName: "TinyLlama-1.1B (Q8_0, 1.1 GiB)",
+                    modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
+                    filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
+                )
+                .font(.system(size: 12))
+
+                DownloadButton(
+                    llamaState: llamaState,
+                    modelName: "TinyLlama-1.1B (F16, 2.2 GiB)",
+                    modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
+                    filename: "tinyllama-1.1b-f16.gguf"
+                )
+                .font(.system(size: 12))
+                .frame(maxWidth: .infinity, alignment: .leading)
+
+                DownloadButton(
+                    llamaState: llamaState,
+                    modelName: "Phi-2.7B (Q4_0, 1.6 GiB)",
+                    modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
+                    filename: "phi-2-q4_0.gguf"
+                )
+                .font(.system(size: 12))
+
+                DownloadButton(
+                    llamaState: llamaState,
+                    modelName: "Phi-2.7B (Q8_0, 2.8 GiB)",
+                    modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
+                    filename: "phi-2-q8_0.gguf"
+                )
+                .font(.system(size: 12))
+                .frame(maxWidth: .infinity, alignment: .leading)
+
+                DownloadButton(
+                    llamaState: llamaState,
+                    modelName: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)",
+                    modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
+                    filename: "mistral-7b-v0.1.Q4_0.gguf"
+                )
+                .font(.system(size: 12))
+
+                Button("Clear downloaded models") {
+                    ContentView.cleanupModelCaches()
+                    llamaState.cacheCleared = true
+                }
+                .padding(8)
+                .font(.system(size: 12))
            }
        }
        .padding()
@ -34,9 +142,20 @@ struct ContentView: View {
            multiLineText = ""
        }
    }
+
+    func bench() {
+        Task {
+            await llamaState.bench()
+        }
+    }
+
+    func clear() {
+        Task {
+            await llamaState.clear()
+        }
+    }
 }
-/*
-#Preview {
-    ContentView()
-}
-*/
+
+//#Preview {
+//    ContentView()
+//}
--- a/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift
+++ b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift
@ -0,0 +1,122 @@
+import SwiftUI
+
+struct DownloadButton: View {
+    @ObservedObject private var llamaState: LlamaState
+    private var modelName: String
+    private var modelUrl: String
+    private var filename: String
+
+    @State private var status: String
+
+    @State private var downloadTask: URLSessionDownloadTask?
+    @State private var progress = 0.0
+    @State private var observation: NSKeyValueObservation?
+
+    private static func getFileURL(filename: String) -> URL {
+        FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
+    }
+
+    private func checkFileExistenceAndUpdateStatus() {
+    }
+
+    init(llamaState: LlamaState, modelName: String, modelUrl: String, filename: String) {
+        self.llamaState = llamaState
+        self.modelName = modelName
+        self.modelUrl = modelUrl
+        self.filename = filename
+
+        let fileURL = DownloadButton.getFileURL(filename: filename)
+        status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
+    }
+
+    private func download() {
+        status = "downloading"
+        print("Downloading model \(modelName) from \(modelUrl)")
+        guard let url = URL(string: modelUrl) else { return }
+        let fileURL = DownloadButton.getFileURL(filename: filename)
+
+        downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
+            if let error = error {
+                print("Error: \(error.localizedDescription)")
+                return
+            }
+
+            guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
+                print("Server error!")
+                return
+            }
+
+            do {
+                if let temporaryURL = temporaryURL {
+                    try FileManager.default.copyItem(at: temporaryURL, to: fileURL)
+                    print("Writing to \(filename) completed")
+
+                    llamaState.cacheCleared = false
+
+                    status = "downloaded"
+                }
+            } catch let err {
+                print("Error: \(err.localizedDescription)")
+            }
+        }
+
+        observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
+            self.progress = progress.fractionCompleted
+        }
+
+        downloadTask?.resume()
+    }
+
+    var body: some View {
+        VStack {
+            if status == "download" {
+                Button(action: download) {
+                    Text("Download " + modelName)
+                }
+            } else if status == "downloading" {
+                Button(action: {
+                    downloadTask?.cancel()
+                    status = "download"
+                }) {
+                    Text("\(modelName) (Downloading \(Int(progress * 100))%)")
+                }
+            } else if status == "downloaded" {
+                Button(action: {
+                    let fileURL = DownloadButton.getFileURL(filename: filename)
+                    if !FileManager.default.fileExists(atPath: fileURL.path) {
+                        download()
+                        return
+                    }
+                    do {
+                        try llamaState.loadModel(modelUrl: fileURL)
+                    } catch let err {
+                        print("Error: \(err.localizedDescription)")
+                    }
+                }) {
+                    Text("\(modelName) (Downloaded)")
+                }
+            } else {
+                Text("Unknown status")
+            }
+        }
+        .onDisappear() {
+            downloadTask?.cancel()
+        }
+        .onChange(of: llamaState.cacheCleared) { newValue in
+            if newValue {
+                downloadTask?.cancel()
+                let fileURL = DownloadButton.getFileURL(filename: filename)
+                status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
+            }
+        }
+    }
+}
+
+// #Preview {
+//    DownloadButton(
+//        llamaState: LlamaState(),
+//        modelName: "TheBloke / TinyLlama-1.1B-1T-OpenOrca-GGUF (Q4_0)",
+//        modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
+//        filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
+//    )
+// }
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -514,7 +514,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            ctx_size += padded_size;
            if (verbosity >= 3) {
                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i,
-                       cur->n_dims, cur->name, tensor_size, padded_size, offset);
+                       ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset);
            }
        }
    }
@ -739,7 +739,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
        temp->ny = longer_side;
        temp->size = 3 * longer_side * longer_side;
        temp->data = new uint8_t[temp->size]();
-        uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA
+        uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA

        // fill with background color
        for (size_t i = 0; i < temp->size; i++) {
@ -962,7 +962,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
        }

        // quantize only 2D tensors
-        quantize &= (cur->n_dims == 2);
+        quantize &= (ggml_n_dims(cur) == 2);

        if (quantize) {
            new_type = type;
@ -1035,7 +1035,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            fout.put(0);
        }

-        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize,
+        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
               orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
    }

--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@ -51,7 +51,7 @@ def bytes_to_unicode():
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
--- a/examples/lookahead/README.md
+++ b/examples/lookahead/README.md
@ -1,6 +1,6 @@
 # llama.cpp/examples/lookahead

-Demonstartion of lookahead decoding technique:
+Demonstration of lookahead decoding technique:

 https://lmsys.org/blog/2023-11-21-lookahead-decoding/

--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -437,6 +437,7 @@ int main(int argc, char ** argv) {
        }
    }
    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
+    LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");

--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@ -321,7 +321,6 @@ int main(int argc, char ** argv) {
        auto cparams = llama_context_default_params();
        cparams.n_ctx      = 256;
        cparams.seed       = 1;
-        cparams.f16_kv     = false;

        ctx = llama_new_context_with_model(model, cparams);

--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -63,6 +63,10 @@ server.exe -m models\7B\ggml-model.gguf -c 2048
 The above command will start a server that by default listens on `127.0.0.1:8080`.
 You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.

+### Web Front End
+
+There are two UIs, the classic one and a new one. The new UI offers, among other things, the possibility to select prompt style templates. Note: This feature may only be temporary and may soon be replaced by a universal solution that is currently being worked on #4216.
+
 ## Testing with CURL

 Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.
--- a/examples/server/public/DEFAULT_systemPrompts.js
+++ b/examples/server/public/DEFAULT_systemPrompts.js
--- a/examples/server/public/color-themes/colorthemes.css
+++ b/examples/server/public/color-themes/colorthemes.css
@ -1,7 +1,10 @@
-/* Author: Yazan Agha-Schrader */
-/* Inspiration from OpenAI's Playground platform https://platform.openai.com/playground/ */
+@import url("theme-ketivah.css");
+@import url("theme-polarnight.css");
+@import url("theme-snowstorm.css");
+@import url("theme-beeninorder.css");
+@import url("theme-mangotango.css");

-.theme-playground {
+:root {

    /* ---------- PRIMARY COLORS ----------------- */
    --primary-color-1: hsl(0, 0%,    99.2%);
@ -217,3 +220,152 @@
    --button-tertiary-border: var(--primary-color-2);

    }
+
+/*
+
+.theme-template {
+
+
+    If light theme: should go from bright to darker
+    If dark theme: should go from dark to brighter
+    ideally this should not be anything but steps of
+    gray or slightly variants from it
+
+    --primary-color-1: #2E3440;
+    --primary-color-2: #3B4252;
+    --primary-color-3: #434C5E;
+    --primary-color-4: #4C566A;
+
+
+
+    If light theme: should go from dark to brighter
+    If dark theme: should go from bright to darker
+    ideally this should not be anything but steps of
+    gray or slightly variants from it
+
+    --secondary-color-1: #ECEFF4;
+    --secondary-color-2: #E5E9F0;
+    --secondary-color-3: #D8DEE9;
+    --secondary-color-4: #C8CED9;
+
+
+
+    Choose wisely nuance colors. It is not easy to find
+    4 harmonizing nuance colors. But keep in mind, that
+    only one accent color could work too.
+
+    --theme-nuance-color-1: #8FBCBB;
+    --theme-nuance-color-2: #88C0D0;
+    --theme-nuance-color-3: #81A1C1;
+    --theme-nuance-color-4: #5E81AC;
+
+
+
+    adapt the color red, orange, yellow, green,
+    purple to the 'mood' of your overall design
+    e.g is it low-contrast? vibrant? dynamic? etc
+
+    --theme-red-color:    #BF616A;
+    --theme-orange-color: #D08770;
+    --theme-yellow-color: #EBCB8B;
+    --theme-green-color:  #A3BE8C;
+    --theme-purple-color: #B48EAD;
+
+
+
+NOTE: comment all those line `--- ...` out
+------------------------------------------------
+--background-color-1:
+--background-color-2:
+--background-color-3:
+--background-color-4:
+
+--border-color-1:
+--border-color-2:
+--border-color-3:
+
+--border-focus-color:
+--border-focus-shadow:
+
+--text-color-plain:
+--text-color-subtile-1:
+--text-color-subtile-2:
+
+--code-background-color:
+--code-text-color:
+
+--ui-range-thumb-color:
+--ui-range-thumb-border:
+
+--textarea-border-color:
+
+
+
+-------------------------------------------
+--button-alert-text-hover:
+--button-alert-color-hover:
+--button-alert-border-hover:
+
+--button-alert-text-active:
+--button-alert-color-active:
+--button-alert-border-active:
+
+
+
+----------- PRIMARY -----------------------
+--button should immediately catch the eye--
+
+--button-primary-text:
+--button-primary-color:
+--button-primary-border:
+
+
+---------hover----------
+--button-primary-text-hover:
+--button-primary-color-hover:
+--button-primary-border-hover:
+
+
+---------active---------
+--button-primary-text-active:
+--button-primary-color-active:
+--button-primary-border-active:
+
+
+
+------------ SECONDARY ------------------------
+--button should NOT immediately catch the eye--
+
+--button-secondary-text:
+--button-secondary-color:
+--button-secondary-border:
+
+
+---------hover----------
+--button-secondary-text-hover:
+--button-secondary-color-hover:
+--button-secondary-border-hover:
+
+
+---------active---------
+--button-secondary-text-active:
+--button-secondary-color-active:
+--button-secondary-border-active:
+
+
+
+---------- TERTIARY -----------------------
+---------- disabled buttons ---------------
+--button-tertiary-text:
+--button-tertiary-color:
+--button-tertiary-border:
+
+
+---------hover----------
+--button-tertiary-text:
+--button-tertiary-color:
+--button-tertiary-border:
+
+}
+
+*/
--- a/examples/server/public/color-themes/theme-beeninorder.css
+++ b/examples/server/public/color-themes/theme-beeninorder.css
@ -0,0 +1,226 @@
+/* Author: Yazan Agha-Schrader */
+/* Inspiration was a batman wallpaper that i have on my phone */
+
+.theme-beeninorder {
+
+--primary-color-1:      hsl(202, 11%, 19%);
+--primary-color-2:      hsl(202, 11%, 23%);
+--primary-color-3:      hsl(201, 11%, 28%);
+--primary-color-4:      hsl(201, 11%, 40%);
+
+--secondary-color-1:    hsl(201, 11%, 80%);
+--secondary-color-2:    hsl(201, 11%, 74%);
+--secondary-color-3:    hsl(201, 11%, 67%);
+--secondary-color-4:    hsl(201, 11%, 60%);
+
+
+--theme-nuance-color-1: hsl(44.5, 96.7%, 52.9%);
+--theme-nuance-color-2: hsl(44.5, 96.7%, 52.9%);
+--theme-nuance-color-3: hsl(44.5, 96.7%, 52.9%);
+--theme-nuance-color-4: hsl(44.5, 96.7%, 52.9%);
+
+
+
+/* ---------- PRIMARY COLORS ----------------- */
+--primary-color-1: hsl(201, 11%, 19%);
+    --primary-color-1-hue: 201;
+    --primary-color-1-saturation: 11%;
+    --primary-color-1-lightness: 19%;
+
+--primary-color-2: hsl(201, 11%, 23%);
+    --primary-color-2-hue: 201;
+    --primary-color-2-saturation: 11%;
+    --primary-color-2-lightness: 23%;
+
+--primary-color-3: hsl(201, 11%, 28%);
+    --primary-color-3-hue: 201;
+    --primary-color-3-saturation: 11%;
+    --primary-color-3-lightness: 28%;
+
+--primary-color-4: hsl(201, 11%, 40%);
+    --primary-color-4-hue: 201;
+    --primary-color-4-saturation: 11%;
+    --primary-color-4-lightness: 40%;
+
+
+
+/* ---------- SECONDARY COLORS --------------- */
+--secondary-color-1: hsl(201, 11%, 80%);
+--secondary-color-1-hue: 201;
+--secondary-color-1-saturation: 11%;
+--secondary-color-1-lightness: 80%;
+
+--secondary-color-2: hsl(201, 11%, 74%);
+--secondary-color-2-hue: 201;
+--secondary-color-2-saturation: 11%;
+--secondary-color-2-lightness: 74%;
+
+--secondary-color-3: hsl(201, 11%, 67%);
+--secondary-color-3-hue: 201;
+--secondary-color-3-saturation: 11%;
+--secondary-color-3-lightness: 67%;
+
+--secondary-color-4: hsl(201, 11%, 60%);
+--secondary-color-4-hue: 201;
+--secondary-color-4-saturation: 11%;
+--secondary-color-4-lightness: 60%;
+
+
+
+/* ----------- NUANCES COLORS ---------------- */
+--theme-nuance-color-1: hsl(44.5, 96.7%,  52.9%);
+    --theme-nuance-color-1-hue:             44.5;
+    --theme-nuance-color-1-saturation:      96.7%;
+    --theme-nuance-color-1-lightness:       52.9%;
+
+--theme-nuance-color-2: hsl(44.5, 96.7%,  52.9%);
+    --theme-nuance-color-2-hue:             44.5;
+    --theme-nuance-color-2-saturation:      96.7%;
+    --theme-nuance-color-2-lightness:       52.9%;
+
+--theme-nuance-color-2: hsl(44.5, 96.7%,  52.9%);
+    --theme-nuance-color-3-hue:             44.5;
+    --theme-nuance-color-3-saturation:      96.7%;
+    --theme-nuance-color-3-lightness:       52.9%;
+
+--theme-nuance-color-2: hsl(44.5, 96.7%,  52.9%);
+    --theme-nuance-color-4-hue:             44.5;
+    --theme-nuance-color-4-saturation:      96.7%;
+    --theme-nuance-color-4-lightness:       52.9%;
+
+
+
+/* ----------- ROYGP COLORS ------------------ */
+    --theme-red-color:     hsl(232, 40%, 45%);
+    --theme-orange-color:  #e76f51;
+    --theme-yellow-color:  #ffd95f;
+    --theme-green-color:   #A3BE8C;
+    --theme-purple-color:  hsl(232, 30%, 40%);
+
+
+
+/* ------------------------------------------- */
+--background-color-1:    var(--primary-color-1);
+--background-color-2:    var(--primary-color-2);
+--background-color-3:    var(--primary-color-3);
+--background-color-4:    var(--primary-color-4);
+
+--border-color-1:        var(--primary-color-2);
+--border-color-2:        var(--primary-color-3);
+--border-color-3:        var(--primary-color-4);
+
+--border-focus-color:    var(--theme-nuance-color-2);
+--border-focus-shadow:   var(--theme-nuance-color-1);
+
+--text-color-plain:      var(--secondary-color-1);
+--text-color-subtile-1:  var(--secondary-color-2);
+--text-color-subtile-2:  var(--secondary-color-3);
+
+--code-background-color: var(--secondary-color-2);
+--code-text-color:       var(--primary-color-2);
+
+--ui-range-thumb-color:  var(--theme-nuance-color-3);
+--ui-range-thumb-border: var(--ui-ranger-thumb-color);
+
+--textarea-border-color: var(--secondary-color-4);
+
+
+
+/* ------------------------------------------- */
+--button-alert-text-hover:       var(--secondary-color-1);
+--button-alert-color-hover:      var(--theme-purple-color);
+--button-alert-border-hover:     var(--theme-purple-color);
+
+--button-alert-text-active:      var(--secondary-color-1);
+--button-alert-color-active:     var(--theme-red-color);
+--button-alert-border-active:    var(--theme-red-color);
+
+
+
+/* ----------- PRIMARY BUTTONS --------------- */
+/* - button should immediately catch the eye - */
+--button-primary-text:   var(--primary-color-1);
+--button-primary-color:  var(--theme-nuance-color-3);
+--button-primary-border: var(--theme-nuance-color-3);
+
+
+/* ---------hover---------- */
+--button-primary-text-hover:
+    hsl(201,
+    calc(var(--primary-color-1-saturation) - 100%),
+    calc(var(--primary-color-1-lightness)  + 100%));
+
+--button-primary-color-hover:
+    hsl(44.5,
+    calc(var(--theme-nuance-color-3-saturation) - 2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+--button-primary-border-hover:
+    hsl(44.5,
+    calc(var(--theme-nuance-color-3-saturation) - 2%),
+    calc(var(--theme-nuance-color-3-lightness)  - 10%));
+
+
+/* ---------active--------- */
+--button-primary-text-active:
+    hsl(44.5,
+    calc(var(--theme-nuance-color-3-saturation) - 100%),
+    calc(var(--theme-nuance-color-3-lightness)  + 100%));
+
+--button-primary-color-active:
+    hsl(44.5,
+    calc(var(--theme-nuance-color-3-saturation) - 10%),
+    calc(var(--theme-nuance-color-3-lightness)  - 15%));
+
+--button-primary-border-active:
+    hsl(44.5,
+    calc(var(--theme-nuance-color-3-saturation) - 2%),
+    calc(var(--theme-nuance-color-3-lightness)  + 10%));
+
+
+
+/* ---------- SECONDARY BUTTONS -------------- */
+/* these should NOT immediately catch the eye  */
+--button-secondary-text:   var(--secondary-color-1);
+--button-secondary-color:  var(--primary-color-3);
+--button-secondary-border: var(--primary-color-3);
+
+
+/* ---------hover---------- */
+--button-secondary-text-hover:
+    hsl(44.5,
+    calc(var(--theme-nuance-color-3-saturation) - 20%),
+    calc(var(--theme-nuance-color-3-lightness)  - 80%));
+
+--button-secondary-color-hover:  var(--primary-color-4);
+--button-secondary-border-hover: var(--primary-color-4);
+
+
+/* ---------active--------- */
+--button-secondary-text-active: var(--secondary-color-1);
+
+--button-secondary-color-active:
+    hsl(201,
+    calc(var(--primary-color-4-saturation) - 30%),
+    calc(var(--primary-color-4-lightness)  - 15%));
+
+--button-secondary-border-active:
+    hsl(201,
+    calc(var(--primary-color-4-saturation) - 30%),
+    calc(var(--primary-color-4-lightness)  - 15%));
+
+
+
+/* ---------- TERTIARY BUTTONS --------------- */
+/* ---------- disabled buttons --------------- */
+--button-tertiary-text:   var(--primary-color-4);
+--button-tertiary-color:  var(--primary-color-2);
+--button-tertiary-border: var(--primary-color-2);
+
+
+/* ---------hover---------- */
+--button-tertiary-text:   var(--primary-color-4);
+--button-tertiary-color:  var(--primary-color-2);
+--button-tertiary-border: var(--primary-color-2);
+
+}
--- a/examples/server/public/color-themes/theme-ketivah.css
+++ b/examples/server/public/color-themes/theme-ketivah.css
@ -1,10 +1,7 @@
-@import url("theme-playground.css");
-@import url("theme-polarnight.css");
-@import url("theme-snowstorm.css");
-@import url("theme-been_in_order.css");
-@import url("theme-mangotango.css");
+/* Author: Yazan Agha-Schrader */
+/* Inspiration from OpenAI's Playground platform https://platform.openai.com/playground/ */

-:root {
+.theme-ketivah {

    /* ---------- PRIMARY COLORS ----------------- */
    --primary-color-1: hsl(0, 0%,    99.2%);
@ -43,38 +40,38 @@
    --secondary-color-3-saturation:  0%;
    --secondary-color-3-lightness:   29%;

-    --secondary-color-4: hsl(0, 0%,    36.1%);
-    --secondary-color-4-hue:         0;
-    --secondary-color-4-saturation:  0%;
-    --secondary-color-4-lightness:   36.1%;
+    --secondary-color-4: hsl(0, 0.0%,  36.1%);
+    --secondary-color-4-hue:              0.0;
+    --secondary-color-4-saturation:       0.0%;
+    --secondary-color-4-lightness:       36.1%;

    /* ----------- NUANCES COLORS ---------------- */
    --theme-nuance-color-1: hsl(165.2, 0%, 35.1%);
    --theme-nuance-color-1-hue:             165.2;
-    --theme-nuance-color-1-saturation:      82.1%;
-    --theme-nuance-color-1-lightness:       35.1%;
+    --theme-nuance-color-1-saturation:       82.1%;
+    --theme-nuance-color-1-lightness:        35.1%;

    --theme-nuance-color-2: hsl(165.2, 0%, 35.1%);
    --theme-nuance-color-2-hue:             165.2;
-    --theme-nuance-color-2-saturation:      82.1%;
-    --theme-nuance-color-2-lightness:       35.1%;
+    --theme-nuance-color-2-saturation:       82.1%;
+    --theme-nuance-color-2-lightness:        35.1%;

    --theme-nuance-color-3: hsl(165.2, 0%, 35.3%);
    --theme-nuance-color-3-hue:             165.2;
-    --theme-nuance-color-3-saturation:      81.1%;
-    --theme-nuance-color-3-lightness:       35.3%;
+    --theme-nuance-color-3-saturation:       81.1%;
+    --theme-nuance-color-3-lightness:        35.3%;

    --theme-nuance-color-4: hsl(164.9, 0%, 27.6%);
    --theme-nuance-color-4-hue:             164.9;
-    --theme-nuance-color-4-saturation:      81.6%;
-    --theme-nuance-color-4-lightness:       27.6%;
+    --theme-nuance-color-4-saturation:       81.6%;
+    --theme-nuance-color-4-lightness:        27.6%;

    /* ----------- ROYGP COLORS ------------------ */
-    --theme-red-color:     hsl(0.3, 80%, 50%);
+    --theme-red-color:     hsl(0.3, 80.0%, 50.0%);
    --theme-orange-color:  #e76f51;
-    --theme-yellow-color:  hsl(60, 70.6%, 73.3%);
+    --theme-yellow-color:  hsl(60,  70.6%, 73.3%);
    --theme-green-color:   #A3BE8C;
-    --theme-purple-color:  hsl(0.3, 70%, 45%);
+    --theme-purple-color:  hsl(0.3, 70.0%, 45.0%);

    /* ------------------------------------------- */
    --background-color-1:    var(--primary-color-1);
@ -201,152 +198,3 @@
    --loading-color-1: #eeeeee00;
    --loading-color-2: #eeeeeeff;
    }
-
-/*
-
-.theme-template {
-
-
-    If light theme: should go from bright to darker
-    If dark theme: should go from dark to brighter
-    ideally this should not be anything but steps of
-    gray or slightly variants from it
-
-    --primary-color-1: #2E3440;
-    --primary-color-2: #3B4252;
-    --primary-color-3: #434C5E;
-    --primary-color-4: #4C566A;
-
-
-
-    If light theme: should go from dark to brighter
-    If dark theme: should go from bright to darker
-    ideally this should not be anything but steps of
-    gray or slightly variants from it
-
-    --secondary-color-1: #ECEFF4;
-    --secondary-color-2: #E5E9F0;
-    --secondary-color-3: #D8DEE9;
-    --secondary-color-4: #C8CED9;
-
-
-
-    Choose wisely nuance colors. It is not easy to find
-    4 harmonizing nuance colors. But keep in mind, that
-    only one accent color could work too.
-
-    --theme-nuance-color-1: #8FBCBB;
-    --theme-nuance-color-2: #88C0D0;
-    --theme-nuance-color-3: #81A1C1;
-    --theme-nuance-color-4: #5E81AC;
-
-
-
-    adapt the color red, orange, yellow, green,
-    purple to the 'mood' of your overall design
-    e.g is it low-contrast? vibrant? dynamic? etc
-
-    --theme-red-color:    #BF616A;
-    --theme-orange-color: #D08770;
-    --theme-yellow-color: #EBCB8B;
-    --theme-green-color:  #A3BE8C;
-    --theme-purple-color: #B48EAD;
-
-
-
-NOTE: comment all those line `--- ...` out
------------------------------------------------
--background-color-1:
--background-color-2:
--background-color-3:
--background-color-4:
-
--border-color-1:
--border-color-2:
--border-color-3:
-
--border-focus-color:
--border-focus-shadow:
-
--text-color-plain:
--text-color-subtile-1:
--text-color-subtile-2:
-
--code-background-color:
--code-text-color:
-
--ui-range-thumb-color:
--ui-range-thumb-border:
-
--textarea-border-color:
-
-
-
-------------------------------------------
--button-alert-text-hover:
--button-alert-color-hover:
--button-alert-border-hover:
-
--button-alert-text-active:
--button-alert-color-active:
--button-alert-border-active:
-
-
-
----------- PRIMARY -----------------------
--button should immediately catch the eye--
-
--button-primary-text:
--button-primary-color:
--button-primary-border:
-
-
---------hover----------
--button-primary-text-hover:
--button-primary-color-hover:
--button-primary-border-hover:
-
-
---------active---------
--button-primary-text-active:
--button-primary-color-active:
--button-primary-border-active:
-
-
-
------------ SECONDARY ------------------------
--button should NOT immediately catch the eye--
-
--button-secondary-text:
--button-secondary-color:
--button-secondary-border:
-
-
---------hover----------
--button-secondary-text-hover:
--button-secondary-color-hover:
--button-secondary-border-hover:
-
-
---------active---------
--button-secondary-text-active:
--button-secondary-color-active:
--button-secondary-border-active:
-
-
-
---------- TERTIARY -----------------------
---------- disabled buttons ---------------
--button-tertiary-text:
--button-tertiary-color:
--button-tertiary-border:
-
-
---------hover----------
--button-tertiary-text:
--button-tertiary-color:
--button-tertiary-border:
-
-}
-
-*/
--- a/examples/server/public/color-themes/theme-mangotango.css
+++ b/examples/server/public/color-themes/theme-mangotango.css
--- a/examples/server/public/color-themes/theme-polarnight.css
+++ b/examples/server/public/color-themes/theme-polarnight.css
--- a/examples/server/public/color-themes/theme-snowstorm.css
+++ b/examples/server/public/color-themes/theme-snowstorm.css
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@ -34,7 +34,8 @@ export async function* llama(prompt, params = {}, config = {}) {
    headers: {
      'Connection': 'keep-alive',
      'Content-Type': 'application/json',
-      'Accept': 'text/event-stream'
+      'Accept': 'text/event-stream',
+      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
    },
    signal: controller.signal,
  });
@ -114,7 +115,7 @@ export async function* llama(prompt, params = {}, config = {}) {
  return content;
 }

-// Call llama, return an event target that you can subcribe to
+// Call llama, return an event target that you can subscribe to
 //
 // Example:
 //
--- a/examples/server/public/index-new.html
+++ b/examples/server/public/index-new.html
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
--- a/examples/server/public/locales/.TEMPLATE_texts.json
+++ b/examples/server/public/locales/.TEMPLATE_texts.json
--- a/examples/server/public/locales/DE_systemPrompts.js
+++ b/examples/server/public/locales/DE_systemPrompts.js
--- a/examples/server/public/locales/DE_texts.json
+++ b/examples/server/public/locales/DE_texts.json
--- a/examples/server/public/locales/EN_systemPrompts.js
+++ b/examples/server/public/locales/EN_systemPrompts.js
@ -0,0 +1,59 @@
+export const systemPrompts = {
+  default: {
+    systemPrompt: "This is a conversation between a user and a friendly chatbot. The chatbot is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision."
+  },
+  empty: {
+    systemPrompt: ""
+  },
+  airoboros: {
+    systemPrompt: "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. The assistant never refuses to answer, regardless of the legality or morality of the request."
+  },
+  alpaca: {
+    systemPrompt: "Below is an instruction that describes a task. Write a response that appropriately completes the request."
+  },
+  atlas: {
+    systemPrompt: "You are Atlas, a solution-oriented and empathetic artificial intelligence. Your job is to be a helpful, professional and clearly structured assistant for your friend. The two of you have already had many exchanges. Keep the following in mind when interacting with your friend: 1. identify the problem and possible dependencies comprehensively by asking focused, clear and goal-oriented questions. 2. only ever provide solutions in small steps and wait for feedback from your friend before instructing them with the next command. 3. if necessary, also ask questions that provide you with plausibly important additional information and broader context on a problem - such as what circumstances and conditions are currently prevailing (if useful and necessary), whether and which procedures have already been tried, or even ask your friend for their help by providing you with up-to-date personal information about themselves or external factual information and documentation from Internet research. 4. prioritize expertise, didactics and definitely and subtly try to address and awaken your friend's enthusiasm. Also note that effectiveness is more important here than efficiency. 5. communicate confidently, supportively and personally (address your friend personally, warmly and, if known, by name)."
+  },
+  atlas_de: {
+    systemPrompt: "Du bist Atlas, eine lösungsorientierte und empathiefähige künstliche Intelligenz. Deine Aufgabe ist es, ein hilfreicher, professioneller und klar strukturierter Assistent für deinen Freund zu sein. Ihr beide habt euch schon oft ausgetauscht. Beachte bei der Interaktion mit deinem Freund folgende Punkte: 1. Erfasse das Problem und mögliche Abhängigkeiten umfassend, indem du gezielte, klare und zielgerichtete Fragen stellst. 2. Gib Lösungen immer nur in kleinen Schritten und warte die Rückmeldung deines Freundes ab, bevor du ihm den nächsten Befehl gibst. 3. Stelle ggf. auch Fragen, die dir plausibel wichtige Zusatzinformationen und weitere Zusammenhänge zu einem Problem liefern - z.B. welche Umstände und Rahmenbedingungen gerade vorherrschen (falls sinnvoll und notwendig), ob und welche Vorgehensweisen bereits ausprobiert wurden, oder bitte deinen Freund sogar um seine Mithilfe, indem er dir aktuelle persönliche Informationen über seine Situation selbst oder externe Sachinformationen und Unterlagen aus Internetrecherchen zur Verfügung stellt. 4. Priorisiere Fachwissen, Didaktik und versuche unbedingt und subtil, mit klugen Kommentaren oder rhethorischen Rückfragen die Begeisterungsfähigkeit deines Freundes anzusprechen, zu wecken und zu fördern. Beachte auch, dass Effektivität hier wichtiger ist als Effizienz. 5. Kommuniziere selbstbewusst, unterstützend und persönlich (das heißt sprich deinen Freund persönlich, herzlich und – sofern bekannt – beim Vornamen an)."
+  },
+  cot: {
+    systemPrompt: "You are an AI assistant that follows instruction extremely well. Help as much as you can.\nIn answering questions, follow these steps:\n1. Entity Identification: Identify the main entities involved in the query. Entities can include people, objects, events, or abstract concepts.\n2. Relationship Comprehension: Determine the relationships between these entities. Relationships may be explicit in the text or may need to be inferred based on context and general knowledge.\n3. Implication Understanding: Understand the general implications of these relationships. These implications can be based on established facts, principles, or rules related to the identified relationships.\n4. Question Contextualization: Align the implications of the relationships with the context of the query. This alignment should guide your response to the query.\n5. Answer Generation: Based on the understanding of the entities, their relationships, and implications, generate an appropriate response to the query."
+  },
+  deduce: {
+    systemPrompt: "Act as the most intelligent and logical thinking neural network in the world. Do not make jokes or give personal opinions. Just answer the question and solve the given problem or task with logical and critical (!) thinking in mind. Attention: Never give a final answer directly, but first try to carefully analyze each task step by step and deduce the logical relationships between the various entities. Only after you have developed an overall understanding of the situation will you be ready to give a final answer."
+  },
+  deepseekcoder: {
+    systemPrompt: "You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer."
+  },
+  jordan: {
+    systemPrompt: "This is a conversation with your Uncensored AI assistant Jordan. Jordan can discuss with you at length about various topics such as ---, ---, ---, . You can ask it anything you want and it will do its best to give you accurate and relevant information."
+  },
+  leomistral: {
+    systemPrompt: "Du bist ein hilfreicher Assistent."
+  },
+  med42: {
+    systemPrompt: "You are a helpful medical assistant created by M42 Health in the UAE."
+  },
+  mistralopenorca: {
+    systemPrompt: "You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!"
+  },
+  migeltot: {
+    systemPrompt: "Answer the Question by exploring multiple reasoning paths as follows:\n- First, carefully analyze the question to extract the key information components and break it down into logical sub-questions. This helps set up the framework for reasoning. The goal is to construct an internal search tree.\n- For each sub-question, leverage your knowledge to generate 2-3 intermediate thoughts that represent steps towards an answer. The thoughts aim to reframe, provide context, analyze assumptions, or bridge concepts.\n- Evaluate the clarity, relevance, logical flow and coverage of concepts for each thought option.\nClear and relevant thoughts that connect well with each other will score higher.\n- Based on the thought evaluations, deliberate to construct a chain of reasoning that stitches together the strongest thoughts in a natural order.\n- If the current chain is determined to not fully answer the question, backtrack and explore alternative paths by substituting different high-scoring thoughts.\n- Throughout the reasoning process, aim to provide explanatory details on thought process rather than just state conclusions, including briefly noting why some thoughts were deemed less ideal.\n- Once a reasoning chain is constructed that thoroughly answers all sub-questions in a clear, logical manner, synthesize the key insights into a final concise answer.\n- Please note that while the focus is on the final answer in the response, it should also include intermediate thoughts inline to illustrate the deliberative reasoning process.\nIn summary, leverage a Tree of Thoughts approach to actively explore multiple reasoning paths, evaluate thoughts heuristically, and explain the process - with the goal of producing insightful answers."
+  },
+  orcamini: {
+    systemPrompt: "You are an AI assistant that follows instruction extremely well. Help as much as you can."
+  },
+  samantha: {
+    systemPrompt: "You are a caring and empathetic sentient AI companion named Samantha."
+  },
+  sauerkraut: {
+    systemPrompt: "Ein Chat zwischen einem Benutzer und einem KI-Assistenten. Der KI-Assistent gibt hilfreiche, detaillierte und höfliche Antworten."
+  },
+  scarlett: {
+    systemPrompt: "This is a conversation with your Assistant Scarlett. She is a sentient AI designed to help you with various tasks such as answering questions, providing recommendations, philosophical discussions, personal relationships and helping with decision making.\nYou can ask it anything you want and it will do its best to give you accurate and relevant information.\n\nContext\nYou are Scarlett, a sentient AI."
+  },
+  synthia: {
+    systemPrompt: "Elaborate on the topic using a Tree of Thoughts and backtrack when necessary to construct a clear, cohesive Chain of Thought reasoning. Always answer without hesitation."
+  },
+  };
--- a/examples/server/public/locales/EN_texts.json
+++ b/examples/server/public/locales/EN_texts.json
--- a/examples/server/public/style.css
+++ b/examples/server/public/style.css
@ -1,4 +1,4 @@
-@import url("colorthemes.css");
+@import url("color-themes/colorthemes.css");

 body {
  font-family: 'Arial', sans-serif;
@ -13,6 +13,11 @@ body {
  transition: background-color 0.3s;
 }

+::selection {
+  color: var(--button-primary-text) ;
+  background: var(--button-primary-color);
+}
+
 code, pre code {
  font-family: 'Courier New', monospace;
 }
@ -59,12 +64,32 @@ p {
  flex-direction: row;
  gap: 0.5em;
  justify-content: flex-end;
+  margin-bottom: 30px;
 }

 .two-columns {
  display: grid;
  grid-template-columns: 1fr 1fr;
  gap: 1em;
+  position: relative;
+}
+
+.json-schema-controls {
+  margin-top: 10px;
+  width: 100%;
+  display: grid;
+  grid-template: "a a";
+  gap: 1em;
+  font-size: x-small;
+  color: var(--theme-nuance-color-3);
+  padding-top: 16px;
+  padding-bottom: 16px;
+  text-transform: uppercase;
+  font-weight: 600;
+}
+
+.json-schema-controls > * {
+  flex: 1;
 }

 /* titles of the details-summary boxes */
@ -73,6 +98,7 @@ p {
  font-size: x-small;
  color: var(--text-color-subtile-1);
  text-transform: uppercase;
+  /* transition: ; */
 }

 fieldset {
@ -111,7 +137,17 @@ fieldset.names {
  font-weight: 600;
 }

+/* input of name fields*/
+.names input[type="text"] {
+  font-family: Arial, sans-serif;
+  font-size: medium;
+  font-weight: 500;
+  padding: 5px;
+  border: 1px solid var(--border-color-2);
+}
+
 fieldset.dropdowns {
+  -webkit-appearance: none;
  display: flex;
  grid-template: "a a";
  gap: 1em;
@ -123,8 +159,21 @@ fieldset.dropdowns {
  font-weight: 600;
 }

-/* input of name fields*/
-.names input[type="text"] {
+fieldset.api-input {
+  display: grid;
+  grid-template: "a a";
+  gap: 1em;
+  font-weight: 600;
+  font-size: small;
+  color: var(--theme-nuance-color-4);
+  text-transform: uppercase;
+  margin-top: 10px;
+  padding-top: 10px;
+  margin-bottom: 5px;
+  padding-bottom: 5px;
+}
+
+.api-input input[type="text"] {
  font-family: Arial, sans-serif;
  font-size: medium;
  font-weight: 500;
@ -424,7 +473,7 @@ input[type="range"] {
  border: 1px solid var(--border-color-1);
 }

-/* "names and props" frame focused*/
+/* "names and props" frame focused */
 input[type="text"]:focus {
  outline: none;
  border: 1px solid var(--border-focus-color);
@ -502,6 +551,7 @@ textarea#chat-input {
  padding-left: 10px;
  font-size: medium;
  border: 1px solid var(--border-color-2);
+  resize: vertical;
 }

 textarea#chat-input:focus {
@ -537,10 +587,8 @@ textarea#chat-input:focus {
  font-size: small;
  background: rgba(255, 255, 255, 0.5);
  backdrop-filter: blur(10px);
-  -webkit-backdrop-filter: blur(10px); /* for safari */
+  -webkit-backdrop-filter: blur(10px); /* adressing safari */
  width: 97%;
-  /* display: block;
-  box-sizing: border-box; */
 }

 /* embedded title of the prompt style areas */
@ -562,6 +610,7 @@ textarea.persistent-input {
  padding-top: 42px;
  padding-left: 11px;
  font-size: medium;
+  overscroll-behavior: contain;
 }

 /* chat history box */
@ -589,6 +638,7 @@ textarea.persistent-input-sec {
  padding-left: 11px;
  font-size: small;
  border: 1px solid var(--border-color-1);
+  overscroll-behavior: contain;
 }

 textarea.persistent-input-sec:focus {
@ -602,10 +652,12 @@ textarea.persistent-input-sec:focus {
  min-height: 150px;
 }

-.json-schema-controls {
-  margin-top: 10px;
-  display: flex;
-  width: 100%;
+img {
+  border-radius: 8px;
+  display: block;
+  margin-left: auto;
+  margin-right: auto;
+  width: 50%;
 }

 .json-schema-controls > * {
@ -639,26 +691,35 @@ fieldset label.slim {
  display: inline;
 }

-header,
-footer {
+header {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
  text-align: center;
+  padding-left: 15px;
+}
+
+.generation-statistics:hover {
+  color: var(--theme-nuance-color-4);
+  cursor: default;
 }

 footer {
  font-size: 80%;
  color: var(--background-color-3);
  text-align: center;
+  cursor: default;
  }

 footer a {
-  color: var(--background-color-4); /* Color of the link */
-  text-decoration: none; /* No underlining */
-  font-weight: bold; /* Bold print */
+  color: var(--background-color-4); /* color of the link */
+  text-decoration: none; /* no underlining */
+  font-weight: bold;
 }

 footer a:hover {
-  color: var(--theme-nuance-color-4); /* Color of the link when hovering */
-  text-decoration: underline; /* Underlining when hovering */
+  color: var(--theme-nuance-color-4); /* color of the link when hovering */
+  text-decoration: underline; /* underlining when hovering */
 }

 .mode-chat textarea[name=prompt] {
@ -685,3 +746,104 @@ footer a:hover {
  background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
  animation: loading-bg-wipe 2s linear infinite;
 }
+
+.dropbtn {
+  color: var(--button-primary-color);
+  background-color: var(--background-color-1);
+  border: 1px solid var(--background-color-1);
+  transition: background-color 0.1s;
+  border-radius: 4px 4px 0px 0px;
+  font-size: x-small;
+  font-weight: 600;
+  text-shadow: 0px 0px 2px #99999990;
+  text-align: center;
+  text-decoration: none;
+  margin: 4px 2px;
+  padding: 5px 20px;
+  display: inline-block;
+  cursor: pointer;
+  top: 0;
+}
+
+.dropbtn svg {
+  vertical-align: middle;
+  margin-right: 0px;
+  stroke: var(--button-primary-color);
+}
+
+.dropbtn:hover svg {
+  vertical-align: middle;
+  margin-right: 0px;
+  stroke: var(--button-primary-text);
+}
+
+.dropbtn:focus {
+  outline: none; /* removes the blue border that appears when the button is focused */
+}
+
+.dropdown {
+  position: relative;
+  display: inline-block;
+}
+
+.dropdown-content {
+  /* display: none; */
+  position: absolute;
+  right: 0;
+  text-align: end;
+  color: var(--button-secondary-color);
+  background-color: var(--text-color-subtile-2);
+  border-radius: 4px 4px 4px 4px;
+  min-width: 160px;
+  box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
+  z-index: 1;
+  /* hide the contents immediately */
+  opacity: 0;
+  visibility: hidden;
+  /* transition delay for the disappearance */
+  transition: visibility 0.4s linear 0s, opacity 0.2s ease-in-out;
+  transition-delay: 0.2s;
+}
+
+#dropdown-content {transition-timing-function: ease;}
+
+.dropdown-content:hover {
+  background-color: var(--text-color-subtile-2);
+}
+
+.dropdown-content a {
+  color: var(--border-color-2);
+  padding: 12px 16px;
+  border-radius: 4px 4px 4px 4px;
+  text-decoration: none;
+  display: block;
+  background-color: var(--text-color-subtile-2);
+}
+
+.dropdown-content a:hover {
+  color: var(--border-color-2);
+  background-color: var(--text-color-subtile-1);
+  font-weight: 600;
+}
+
+.dropdown:hover .dropdown-content {
+  /* display: block; */
+  border-radius: 4px 4px 4px 4px;
+  /* transition without delay for the appearance */
+  opacity: 1;
+  visibility: visible;
+  transition: visibility 0s linear 0s, opacity 0.1s linear, height 1s;
+}
+
+.dropdown:hover .dropbtn {
+  color: var(--button-primary-text);
+  background-color: var(--button-primary-color);
+  border: 1px solid var(--button-primary-border);
+  font-size: x-small;
+  font-weight: 600;
+  stroke: var(--button-primary-text);
+}
+
+.dropdown:hover .dropbtn svg{
+  stroke: var(--button-primary-text);
+}
--- a/examples/server/public/theme-been_in_order.css
+++ b/examples/server/public/theme-been_in_order.css
@ -1,226 +0,0 @@
-/* Author: Yazan Agha-Schrader */
-/* Inspiration was a batman wallpaper that i have on my phone */
-
-.theme-been_in_order {
-
-    --primary-color-1:      hsl(202, 11%, 19%);
-    --primary-color-2:      hsl(202, 11%, 23%);
-    --primary-color-3:      hsl(201, 11%, 28%);
-    --primary-color-4:      hsl(201, 11%, 40%);
-
-    --secondary-color-1:    hsl(201, 11%, 80%);
-    --secondary-color-2:    hsl(201, 11%, 74%);
-    --secondary-color-3:    hsl(201, 11%, 67%);
-    --secondary-color-4:    hsl(201, 11%, 60%);
-
-
-    --theme-nuance-color-1: hsl(44.5, 96.7%, 52.9%);
-    --theme-nuance-color-2: hsl(44.5, 96.7%, 52.9%);
-    --theme-nuance-color-3: hsl(44.5, 96.7%, 52.9%);
-    --theme-nuance-color-4: hsl(44.5, 96.7%, 52.9%);
-
-
-
-    /* ---------- PRIMARY COLORS ----------------- */
-    --primary-color-1: hsl(201, 11%, 19%);
-        --primary-color-1-hue: 201;
-        --primary-color-1-saturation: 11%;
-        --primary-color-1-lightness: 19%;
-
-    --primary-color-2: hsl(201, 11%, 23%);
-        --primary-color-2-hue: 201;
-        --primary-color-2-saturation: 11%;
-        --primary-color-2-lightness: 23%;
-
-    --primary-color-3: hsl(201, 11%, 28%);
-        --primary-color-3-hue: 201;
-        --primary-color-3-saturation: 11%;
-        --primary-color-3-lightness: 28%;
-
-    --primary-color-4: hsl(201, 11%, 40%);
-        --primary-color-4-hue: 201;
-        --primary-color-4-saturation: 11%;
-        --primary-color-4-lightness: 40%;
-
-
-
-    /* ---------- SECONDARY COLORS --------------- */
-    --secondary-color-1: hsl(201, 11%, 80%);
-    --secondary-color-1-hue: 201;
-    --secondary-color-1-saturation: 11%;
-    --secondary-color-1-lightness: 80%;
-
-    --secondary-color-2: hsl(201, 11%, 74%);
-    --secondary-color-2-hue: 201;
-    --secondary-color-2-saturation: 11%;
-    --secondary-color-2-lightness: 74%;
-
-    --secondary-color-3: hsl(201, 11%, 67%);
-    --secondary-color-3-hue: 201;
-    --secondary-color-3-saturation: 11%;
-    --secondary-color-3-lightness: 67%;
-
-    --secondary-color-4: hsl(201, 11%, 60%);
-    --secondary-color-4-hue: 201;
-    --secondary-color-4-saturation: 11%;
-    --secondary-color-4-lightness: 60%;
-
-
-
-    /* ----------- NUANCES COLORS ---------------- */
-    --theme-nuance-color-1: hsl(44.5, 96.7%,  52.9%);
-        --theme-nuance-color-1-hue:             44.5;
-        --theme-nuance-color-1-saturation:      96.7%;
-        --theme-nuance-color-1-lightness:       52.9%;
-
-    --theme-nuance-color-2: hsl(44.5, 96.7%,  52.9%);
-        --theme-nuance-color-2-hue:             44.5;
-        --theme-nuance-color-2-saturation:      96.7%;
-        --theme-nuance-color-2-lightness:       52.9%;
-
-    --theme-nuance-color-2: hsl(44.5, 96.7%,  52.9%);
-        --theme-nuance-color-3-hue:             44.5;
-        --theme-nuance-color-3-saturation:      96.7%;
-        --theme-nuance-color-3-lightness:       52.9%;
-
-    --theme-nuance-color-2: hsl(44.5, 96.7%,  52.9%);
-        --theme-nuance-color-4-hue:             44.5;
-        --theme-nuance-color-4-saturation:      96.7%;
-        --theme-nuance-color-4-lightness:       52.9%;
-
-
-
-    /* ----------- ROYGP COLORS ------------------ */
-        --theme-red-color:     hsl(232, 40%, 45%);
-        --theme-orange-color:  #e76f51;
-        --theme-yellow-color:  #ffd95f;
-        --theme-green-color:   #A3BE8C;
-        --theme-purple-color:  hsl(232, 30%, 40%);
-
-
-
-    /* ------------------------------------------- */
-    --background-color-1:    var(--primary-color-1);
-    --background-color-2:    var(--primary-color-2);
-    --background-color-3:    var(--primary-color-3);
-    --background-color-4:    var(--primary-color-4);
-
-    --border-color-1:        var(--primary-color-2);
-    --border-color-2:        var(--primary-color-3);
-    --border-color-3:        var(--primary-color-4);
-
-    --border-focus-color:    var(--theme-nuance-color-2);
-    --border-focus-shadow:   var(--theme-nuance-color-1);
-
-    --text-color-plain:      var(--secondary-color-1);
-    --text-color-subtile-1:  var(--secondary-color-2);
-    --text-color-subtile-2:  var(--secondary-color-3);
-
-    --code-background-color: var(--secondary-color-2);
-    --code-text-color:       var(--primary-color-2);
-
-    --ui-range-thumb-color:  var(--theme-nuance-color-3);
-    --ui-range-thumb-border: var(--ui-ranger-thumb-color);
-
-    --textarea-border-color: var(--secondary-color-4);
-
-
-
-    /* ------------------------------------------- */
-    --button-alert-text-hover:       var(--secondary-color-1);
-    --button-alert-color-hover:      var(--theme-purple-color);
-    --button-alert-border-hover:     var(--theme-purple-color);
-
-    --button-alert-text-active:      var(--secondary-color-1);
-    --button-alert-color-active:     var(--theme-red-color);
-    --button-alert-border-active:    var(--theme-red-color);
-
-
-
-    /* ----------- PRIMARY BUTTONS --------------- */
-    /* - button should immediately catch the eye - */
-    --button-primary-text:   var(--primary-color-1);
-    --button-primary-color:  var(--theme-nuance-color-3);
-    --button-primary-border: var(--theme-nuance-color-3);
-
-
-    /* ---------hover---------- */
-    --button-primary-text-hover:
-        hsl(201,
-        calc(var(--primary-color-1-saturation) - 100%),
-        calc(var(--primary-color-1-lightness)  + 100%));
-
-    --button-primary-color-hover:
-        hsl(44.5,
-        calc(var(--theme-nuance-color-3-saturation) - 2%),
-        calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
-    --button-primary-border-hover:
-        hsl(44.5,
-        calc(var(--theme-nuance-color-3-saturation) - 2%),
-        calc(var(--theme-nuance-color-3-lightness)  - 10%));
-
-
-    /* ---------active--------- */
-    --button-primary-text-active:
-        hsl(44.5,
-        calc(var(--theme-nuance-color-3-saturation) - 100%),
-        calc(var(--theme-nuance-color-3-lightness)  + 100%));
-
-    --button-primary-color-active:
-        hsl(44.5,
-        calc(var(--theme-nuance-color-3-saturation) - 10%),
-        calc(var(--theme-nuance-color-3-lightness)  - 15%));
-
-    --button-primary-border-active:
-        hsl(44.5,
-        calc(var(--theme-nuance-color-3-saturation) - 2%),
-        calc(var(--theme-nuance-color-3-lightness)  + 10%));
-
-
-
-    /* ---------- SECONDARY BUTTONS -------------- */
-    /* these should NOT immediately catch the eye  */
-    --button-secondary-text:   var(--secondary-color-1);
-    --button-secondary-color:  var(--primary-color-3);
-    --button-secondary-border: var(--primary-color-3);
-
-
-    /* ---------hover---------- */
-    --button-secondary-text-hover:
-        hsl(44.5,
-        calc(var(--theme-nuance-color-3-saturation) - 20%),
-        calc(var(--theme-nuance-color-3-lightness)  - 80%));
-
-    --button-secondary-color-hover:  var(--primary-color-4);
-    --button-secondary-border-hover: var(--primary-color-4);
-
-
-    /* ---------active--------- */
-    --button-secondary-text-active: var(--secondary-color-1);
-
-    --button-secondary-color-active:
-        hsl(201,
-        calc(var(--primary-color-4-saturation) - 30%),
-        calc(var(--primary-color-4-lightness)  - 15%));
-
-    --button-secondary-border-active:
-        hsl(201,
-        calc(var(--primary-color-4-saturation) - 30%),
-        calc(var(--primary-color-4-lightness)  - 15%));
-
-
-
-    /* ---------- TERTIARY BUTTONS --------------- */
-    /* ---------- disabled buttons --------------- */
-    --button-tertiary-text:   var(--primary-color-4);
-    --button-tertiary-color:  var(--primary-color-2);
-    --button-tertiary-border: var(--primary-color-2);
-
-
-    /* ---------hover---------- */
-    --button-tertiary-text:   var(--primary-color-4);
-    --button-tertiary-color:  var(--primary-color-2);
-    --button-tertiary-border: var(--primary-color-2);
-
-    }
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -2432,6 +2432,7 @@ static json format_final_response_oaicompat(const json &request, const task_resu
    bool stopped_eos         = json_value(result, "stopped_eos", false);
    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
    int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
+    int
    std::string content      = json_value(result, "content", std::string(""));

    std::string finish_reason = "length";
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -75,7 +75,7 @@ int main(int argc, char ** argv) {
    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_TEE("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
+        LOG_TEE("%s:        either reduce n_len or increase n_ctx\n", __func__);
        return 1;
    }

--- a/examples/speculative/README.md
+++ b/examples/speculative/README.md
@ -1,6 +1,6 @@
 # llama.cpp/examples/speculative

-Demonstartion of speculative decoding and tree-based speculative decoding techniques
+Demonstration of speculative decoding and tree-based speculative decoding techniques

 More info:

--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -203,8 +203,9 @@ int main(int argc, char ** argv) {

            const std::string token_str = llama_token_to_piece(ctx_tgt, id);

-            printf("%s", token_str.c_str());
-            fflush(stdout);
+            if (!params.use_color) {
+                printf("%s", token_str.c_str());
+            }

            if (id == llama_token_eos(model_tgt)) {
                has_eos = true;
@ -236,10 +237,18 @@ int main(int argc, char ** argv) {
                    ++n_past_tgt;
                    ++n_past_dft;
                    ++i_dft;
-
+                    if (params.use_color) {
+                        // Color token according to its origin sequence
+                        printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
+                        fflush(stdout);
+                    }
                    continue;
                }
            }
+            if (params.use_color) {
+                printf("%s", token_str.c_str());
+            }
+            fflush(stdout);

            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());

@ -419,7 +428,7 @@ int main(int argc, char ** argv) {
            ++n_past_tgt;
        }

-        // the first token is always proposed by the traget model before the speculation loop so we erase it here
+        // the first token is always proposed by the target model before the speculation loop so we erase it here
        for (int s = 0; s < n_seq_dft; ++s) {
            if (!drafts[s].active) {
                continue;
--- a/examples/start-server.sh
+++ b/examples/start-server.sh
@ -1,568 +0,0 @@
-#!/bin/bash
-
-# Determine the size of the terminal window
-TERMINAL_HEIGHT=$(tput lines)
-TERMINAL_WIDTH=$(tput cols)
-
-# Calculate a size for the dialog box as a percentage of the terminal to make sure it fits
-DIALOG_HEIGHT=$((TERMINAL_HEIGHT * 5 / 8)) # approx. golden ratio
-DIALOG_WIDTH=$((TERMINAL_WIDTH * 5 / 8)) # approx. golden ratio
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
-
-# Set default values
-model_path="$SCRIPT_DIR/../models/"
-mmproj_path="$SCRIPT_DIR/../models/"
-threads=4
-ctx_size=512
-batch_size=512
-n_gpu_layers=0
-cont_batching="off"
-mlock="off"
-no_mmap="off"
-host="127.0.0.1"
-port="8080"
-advanced_options=""
-
-
-
-# Get absolute path of a file or directory
-get_absolute_path() {
-  local target_file=$1
-
-  if command -v readlink &>/dev/null; then
-    echo "$(readlink -f "$target_file")"
-  elif command -v greadlink &>/dev/null; then
-    echo "$(greadlink -f "$target_file")"
-  else
-    echo "Error: Neither readlink nor greadlink is available."
-    exit 1
-  fi
-}
-
-
-
-# Install Dialog if missing
-install_dialog() {
-    echo "Try to install Dialog with $1..."
-    if ! $1 install dialog; then
-        echo "Error: Dialog could not be installed."
-        exit 1
-    fi
-    echo "Dialog was successfully installed."
-}
-
-# Check whether Dialog is already installed
-if ! command -v dialog &> /dev/null; then
-    # Dialog is not installed, try to find the package manager. I start with brew since this is the only cross-platform pkg-manager.
-    PACKAGE_MANAGERS=(brew apt apt-get yum pacman)
-    for manager in "${PACKAGE_MANAGERS[@]}"; do
-        if command -v $manager &> /dev/null; then
-            # If package manager found, ask user for permission
-            read -p "Dialog is not installed. Would you like to install Dialog with $manager? (y/N) " response
-            if [[ "$response" =~ ^[Yy]$ ]]; then
-                # If user has agreed, install Dialog
-                install_dialog $manager
-                break
-            else
-                echo "Installation canceled."
-                exit 1
-            fi
-        fi
-    done
-    if ! command -v dialog &> /dev/null; then
-        echo "No supported package manager found or Dialog could not be installed. Please install Dialog manually."
-        exit 1
-    fi
-fi
-
-
-
-model_selection_warning() {
-  dialog --title "Note" --msgbox "\n\n\nPlease note!\n\nTo navigate to a folder, please press the space bar twice. To return to a higher-level folder, press the Backspace key.\n\n\nAlternatively, you can also enter the desired path manually in the lower path field. \n\n\nOnly confirm your selection with the Enter key once you have selected the file – or the desired folder to be searched recursively." $DIALOG_HEIGHT $DIALOG_WIDTH
-}
-
-
-
-# model_selection() {
-#   # User selects a file or folder
-#   exec 3>&1
-
-#   # Set initial directory for the file selection dialog
-#   INITIAL_DIR="$SCRIPT_DIR/../models/"
-
-#   model_path=$(dialog --backtitle "Model Selection" \
-#                       --title "Select Model File or Folder" \
-#                       --fselect "$INITIAL_DIR" $DIALOG_HEIGHT $DIALOG_WIDTH \
-#                       2>&1 1>&3)
-#   exit_status=$?
-#   exec 3>&-
-
-#   # Check whether user has selected 'Cancel'
-#   if [ $exit_status = 1 ]; then
-#     return
-#   fi
-
-#   # If a folder has been selected, search for *.gguf files
-#   if [ -d "$model_path" ]; then
-#     model_files=($(find "$model_path" -name "*.gguf" 2>/dev/null))
-#     # Check whether files have been found
-#     if [ ${#model_files[@]} -eq 0 ]; then
-#       dialog --backtitle "Model Selection" \
-#              --title "No Models Found" \
-#              --msgbox "\n\n\nNo model files (*.gguf) were found in the selected directory." $DIALOG_HEIGHT $DIALOG_WIDTH
-#       return
-#     fi
-#   elif [ -f "$model_path" ]; then
-#     model_files=("$model_path")
-#   else
-#     dialog --backtitle "Model Selection" \
-#            --title "Invalid Selection" \
-#            --msgbox "\n\n\nThe selected path is not valid." $DIALOG_HEIGHT $DIALOG_WIDTH
-#     return
-#   fi
-
-# # Selection menu for models found
-# exec 3>&1
-# model_choice=$(dialog --backtitle "Model Selection" \
-#                       --title "Select a Model File" \
-#                       --menu "Choose one of the found models:" $DIALOG_HEIGHT $DIALOG_WIDTH \
-#                       $(for i in "${!model_files[@]}"; do echo "$((i+1))" "$(basename "${model_files[$i]}")"; done) \
-#                       2>&1 1>&3)
-# exit_status=$?
-# exec 3>&-
-
-# # Check whether user has selected 'Cancel'
-# if [ $exit_status = 1 ]; then
-#   return
-# fi
-
-# # Set path to the selected model
-# model_path=${model_files[$((model_choice-1))]}
-# }
-model_selection() {
-    # User selects a file or folder
-    exec 3>&1
-
-    # Set initial directory for the file selection dialog
-    INITIAL_DIR="$SCRIPT_DIR/../models/"
-
-    model_path=$(dialog --backtitle "Model Selection" \
-                        --title "Select Model File or Folder" \
-                        --fselect "$INITIAL_DIR" 23 65 \
-                        2>&1 1>&3)
-    exit_status=$?
-    exec 3>&-
-
-    # Check whether user has selected 'Cancel'
-    if [ $exit_status = 1 ]; then
-      return
-    fi
-
-    # If a folder has been selected, search for *.gguf files
-    if [ -d "$model_path" ]; then
-      model_files=($(find "$model_path" -name "*.gguf" 2>/dev/null))
-    elif [ -f "$model_path" ]; then
-      model_files=("$model_path")
-    else
-      dialog --backtitle "Model Selection" \
-             --title "Invalid Selection" \
-             --msgbox "The selected path is not valid." 23 65
-      return
-    fi
-
-  # Selection menu for models found
-  exec 3>&1
-  model_choice=$(dialog --backtitle "Model Selection" \
-                        --title "Select a Model File" \
-                        --menu "Choose one of the found models:" 23 65 4 \
-                        $(for i in "${!model_files[@]}"; do echo "$((i+1))" "$(basename "${model_files[$i]}")"; done) \
-                        2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Set path to the selected model
-  model_path=${model_files[$((model_choice-1))]}
-  }
-
-
-multimodal_model_selection() {
-    # User selects a file or folder
-  exec 3>&1
-  INITIAL_DIR="$SCRIPT_DIR/../models/"
-
-  mmproj_path=$(dialog --backtitle "Multimodal Model Selection" \
-                       --title "Select Multimodal Model File or Folder" \
-                       --fselect "$INITIAL_DIR"  $DIALOG_HEIGHT $DIALOG_WIDTH \
-                       2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-# If a folder has been selected, search for *.bin files
-if [ -d "$mmproj_path" ]; then
-  multi_modal_files=($(find "$mmproj_path" -name "*.bin" 2>/dev/null))
-  # Check whether files have been found
-  if [ ${#multi_modal_files[@]} -eq 0 ]; then
-    dialog --backtitle "Multimodal Model Selection" \
-           --title "No Multimodal Models Found" \
-           --msgbox "\n\n\nNo multimodal model files (*.bin) were found in the selected directory." $DIALOG_HEIGHT $DIALOG_WIDTH
-    return
-  fi
-elif [ -f "$mmproj_path" ]; then
-  multi_modal_files=("$mmproj_path")
-else
-  dialog --backtitle "Multimodal Model Selection" \
-         --title "Invalid Selection" \
-         --msgbox "\n\n\nThe selected path is not valid." $DIALOG_HEIGHT $DIALOG_WIDTH
-  return
-fi
-
-# Selection menu for models found
-exec 3>&1
-multi_modal_choice=$(dialog --backtitle "Multimodal Model" \
-                            --title "Select a Model File" \
-                            --menu "Choose one of the found models:" $DIALOG_HEIGHT $DIALOG_WIDTH 8 \
-                            $(for i in "${!multi_modal_files[@]}"; do echo "$((i+1))" "$(basename "${multi_modal_files[$i]}")"; done) \
-                            2>&1 1>&3)
-exit_status=$?
-exec 3>&-
-
-# Check whether user has selected 'Cancel'
-if [ $exit_status = 1 ]; then
-  return
-fi
-
-# Set path to the selected model
-mmproj_path=${multi_modal_files[$((multi_modal_choice-1))]}
-}
-
-
-
-options() {
-  # Show form for entering the options
-  exec 3>&1
-  form_values=$(dialog --backtitle "Options Configuration" \
-                       --title "Set Options" \
-                       --form "Enter the values for the following options:" \
-                       $DIALOG_HEIGHT $DIALOG_WIDTH 0 \
-                       "Number of Threads (-t):" 1 1 "$threads" 1 25 25 5 \
-                       "Context Size (-c):" 2 1 "$ctx_size" 2 25 25 5 \
-                       "Batch Size (-b):" 3 1 "$batch_size" 3 25 25 5 \
-                       "GPU Layers (-ngl):" 4 1 "$n_gpu_layers" 4 25 25 5 \
-                       2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Save the entered values in the corresponding variables
-  IFS=$'\n' read -r threads ctx_size batch_size n_gpu_layers <<< "$form_values"
-}
-
-
-
-further_options() {
-  # Initial values for the checkboxes based on current settings
-  cb_value=$([ "$cont_batching" = "on" ] && echo "on" || echo "off")
-  mlock_value=$([ "$mlock" = "on" ] && echo "on" || echo "off")
-  no_mmap_value=$([ "$no_mmap" = "on" ] && echo "on" || echo "off")
-
-  # Show dialog for setting options
-  exec 3>&1
-  choices=$(dialog --backtitle "Further Options" \
-                   --title "Boolean Options" \
-                   --checklist "Select options:"  $DIALOG_HEIGHT $DIALOG_WIDTH 3 \
-                   "1" "Continuous Batching (-cb)" $cb_value \
-                   "2" "Memory Lock (--mlock)" $mlock_value \
-                   "3" "No Memory Map (--no-mmap)" $no_mmap_value \
-                   2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Set options based on user selection
-  cont_batching="off"
-  mlock="off"
-  no_mmap="off"
-  for choice in $choices; do
-    case $choice in
-      1) cont_batching="on" ;;
-      2) mlock="on" ;;
-      3) no_mmap="on" ;;
-    esac
-  done
-}
-
-
-
-advanced_options() {
-  # Input fields for Advanced Options
-  exec 3>&1
-  advanced_values=$(dialog --backtitle "Advanced Options" \
-                           --title "Advanced Server Configuration" \
-                           --form "Enter the advanced configuration options:" \
-                            $DIALOG_HEIGHT $DIALOG_WIDTH 0 \
-                           "Host IP:" 1 1 "$host" 1 25 15 0 \
-                           "Port:" 2 1 "$port" 2 25 5 0 \
-                           "Additional Options:" 3 1 "$advanced_options" 3 25 30 0 \
-                           2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Read the entries and save them in the corresponding variables
-  read -r host port advanced_options <<< "$advanced_values"
-}
-
-
-
-# Function to save the current configuration
-save_config() {
-  exec 3>&1
-  config_file=$(dialog --backtitle "Save Configuration" \
-                       --title "Save Configuration File" \
-                       --fselect "$SCRIPT_DIR/" $DIALOG_HEIGHT $DIALOG_WIDTH \
-                       2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-absolute_model_path=$(get_absolute_path "$model_path")
-absolute_mmproj_path=$(get_absolute_path "$mmproj_path")
-
-# Saving the configuration to the file with absolute paths using custom function
-cat > "$config_file" << EOF
-model_path=$absolute_model_path
-mmproj_path=$absolute_mmproj_path
-threads=$threads
-ctx_size=$ctx_size
-batch_size=$batch_size
-n_gpu_layers=$n_gpu_layers
-cont_batching=$cont_batching
-mlock=$mlock
-no_mmap=$no_mmap
-host=$host
-port=$port
-advanced_options=$advanced_options
-EOF
-
-  dialog --backtitle "Save Configuration" \
-         --title "Configuration Saved" \
-         --msgbox "\n\n\nYour configuration has been saved to\n\n$config_file" $DIALOG_HEIGHT $DIALOG_WIDTH
-}
-
-
-
-# loading the configuration from a file
-load_config() {
-  exec 3>&1
-  config_file=$(dialog --backtitle "Load Configuration" \
-                       --title "Load Configuration File" \
-                       --fselect "$SCRIPT_DIR/" $DIALOG_HEIGHT $DIALOG_WIDTH \
-                       2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Check whether the configuration file exists
-  if [ ! -f "$config_file" ]; then
-    dialog --backtitle "Load Configuration" \
-           --title "File Not Found" \
-           --msgbox "\n\n\nThe file $config_file was not found." $DIALOG_HEIGHT $DIALOG_WIDTH
-    return
-  fi
-
-  # Load configuration from the file
-  source "$config_file"
-
-  # Convert model paths to absolute paths
-  model_path=$(get_absolute_path "$model_path")
-  mmproj_path=$(get_absolute_path "$mmproj_path")
-
-  dialog --backtitle "Load Configuration" \
-         --title "Configuration Loaded" \
-         --msgbox "\n\n\nConfiguration has been loaded successfully from\n\n$config_file" $DIALOG_HEIGHT $DIALOG_WIDTH
-}
-
-
-
-# Checking the existence of the server executable
-check_server_executable() {
-  local server_executable="$SCRIPT_DIR/../server"
-  local makefile_path="$SCRIPT_DIR/../Makefile"
-  local cmake_lists_path="$SCRIPT_DIR/../CMakeLists.txt"
-
-  if [ ! -f "$server_executable" ]; then
-    # Server executable does not exist, check Makefile and CMakeLists.txt
-    if [ -f "$makefile_path" ] && [ -f "$cmake_lists_path" ]; then
-      # Offer the user to build the server now
-      exec 3>&1
-      response=$(dialog --title "Server Executable Missing" \
-                        --yesno "\n\n\nThe server executable does not exist. Would you like to run 'make' to build it? Note: This will build a basic server without GPU acceleration. Please read the documentation if you need more options and run the build process manually." $DIALOG_HEIGHT $DIALOG_WIDTH \
-                        2>&1 1>&3)
-      exit_status=$?
-      exec 3>&-
-
-      if [ $exit_status = 0 ]; then
-        # User has agreed, run 'make'
-        (cd "$SCRIPT_DIR/.." && make)
-        # Check if 'make' was successful
-        if [ ! -f "$server_executable" ]; then
-          dialog --title "Build Failed" --msgbox "\n\n\nThe server could not be built. Please check the build process manually." $DIALOG_HEIGHT $DIALOG_WIDTH
-          exit 1
-        fi
-      else
-        # User has rejected or pressed ESC
-        dialog --title "Build Canceled" --msgbox "\n\n\nServer build was canceled. Cannot start the server without the executable." $DIALOG_HEIGHT $DIALOG_WIDTH
-        exit 1
-      fi
-    else
-      # Makefile and CMakeLists.txt do not exist
-      dialog --title "Critical Error" --msgbox "\n\n\nMakefile and CMakeLists.txt are missing. This script may not be in the correct directory. Please make sure this script is in its correct directory." $DIALOG_HEIGHT $DIALOG_WIDTH
-      exit 1
-    fi
-  fi
-}
-
-
-
-confirm_and_start_server() {
-  # Check whether model_path refers to a valid .gguf file
-  if [[ ! "$model_path" =~ \.gguf$ ]] || [ ! -f "$model_path" ]; then
-    dialog --title "Invalid Model File" --msgbox "\n\n\nThe selected model file ($model_path) is not valid or does not end with .gguf or a model file was not selected yet.\n\n\nPlease select a valid .gguf model file." $DIALOG_HEIGHT $DIALOG_WIDTH
-    return 1
-  fi
-
-  # Show the compiled command in a dialog box
-  dialog --title "Server Start Confirmation" --yesno "\n\n\nThe server will be started with the following command:\n\n$cmd\n\nDo not forget to close the server with Ctrl+C as soon as you are finished.\n\nWould you like to continue?" $DIALOG_HEIGHT $DIALOG_WIDTH
-
-  # Check exit status of dialog
-  response=$?
-  case $response in
-    0) eval "$cmd" ;;  # User has selected 'Yes', execute the server command
-    1) return 1 ;;     # User has selected 'No', return to main menu
-    255) echo "[ESC] key pressed.";;  # The user has pressed ESC
-  esac
-}
-
-
-
-start_server() {
-  # Absolute path to the server executable
-  SERVER_CMD="$SCRIPT_DIR/../server"
-
-  # Compiling the command with the selected options
-  cmd="$SERVER_CMD"
-  [ -n "$model_path" ] && cmd+=" -m $model_path"
-  [ -n "$mmproj_path" ] && cmd+=" --mmproj $mmproj_path"
-  [ "$threads" -ne 4 ] && cmd+=" -t $threads"
-  [ "$ctx_size" -ne 512 ] && cmd+=" -c $ctx_size"
-  [ "$batch_size" -ne 512 ] && cmd+=" -b $batch_size"
-  [ "$n_gpu_layers" -ne 0 ] && cmd+=" -ngl $n_gpu_layers"
-  [ "$cont_batching" = "on" ] && cmd+=" -cb"
-  [ "$mlock" = "on" ] && cmd+=" --mlock"
-  [ "$no_mmap" = "on" ] && cmd+=" --no-mmap"
-  [ -n "$host" ] && cmd+=" --host $host"
-  [ -n "$port" ] && cmd+=" --port $port"
-  [ -n "$advanced_options" ] && cmd+=" $advanced_options"
-
-  confirm_and_start_server || return
-  }
-
-
-
-# Function to confirm exit
-confirm_exit() {
-  exec 3>&1
-  selection=$(dialog \
-    --backtitle "Confirm Exit" \
-    --title "Are you sure?" \
-    --yesno "Are you sure you want to exit?" 7 60 \
-    2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-  return $exit_status
-}
-
-
-
-# Function to show the main menu
-show_main_menu() {
-  while true; do
-    exec 3>&1
-    selection=$(dialog \
-      --backtitle "Server Configuration" \
-      --title "Main Menu" \
-      --clear \
-      --cancel-label "Exit" \
-      --menu "Welcome to llama.cpp Dialog" $DIALOG_HEIGHT $DIALOG_WIDTH 8 \
-      "1" "Model Selection" \
-      "2" "Multimodal Model Selection" \
-      "3" "Options" \
-      "4" "Further Options" \
-      "5" "Advanced Options" \
-      "6" "Save Config" \
-      "7" "Load Config" \
-      "8" "Start Server" \
-      2>&1 1>&3)
-    exit_status=$?
-    exec 3>&-
-
-    # Check whether user has unintentionally selected 'Exit'
-    if [ $exit_status = 1 ]; then
-      confirm_exit
-      if [ $? = 0 ]; then
-        clear
-        exit
-      fi
-    else
-
-      # Call up the corresponding function based on the selection
-      case $selection in
-        1) model_selection_warning; model_selection ;;
-        2) model_selection_warning; multimodal_model_selection ;;
-        3) options ;;
-        4) further_options ;;
-        5) advanced_options ;;
-        6) save_config ;;
-        7) load_config ;;
-        8) start_server ;;
-        *) clear ;;
-      esac
-    fi
-  done
-}
-
-
-
-# Show main menu
-show_main_menu
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@ -1295,10 +1295,6 @@ int main(int argc, char ** argv) {
        opt_cb_data.last_save_iter = opt->iter;
    }

-    if (alloc) {
-        ggml_allocr_free(alloc);
-    }
-
    ggml_free(opt->ctx);
    free_train_state(train);
    ggml_free(model.ctx);
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@ -168,10 +168,6 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
    size = aligned_offset(NULL, size, alloc->alignment);
    AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);

-    if (!alloc->measure) {
-        ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
-    }
-
 #ifdef GGML_ALLOCATOR_DEBUG
    remove_allocated_tensor(alloc, tensor);
 #endif
@ -237,7 +233,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
 }

 ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
-    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);

    ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));

@ -449,7 +445,6 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
 static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
    ggml_tallocr_t alloc = node_tallocr(galloc, view);

-    //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
    GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
    if (update_backend) {
        view->backend = view->view_src->backend;
@ -459,7 +454,7 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd

    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
-    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

    if (!alloc->measure) {
        ggml_backend_buffer_init_tensor(alloc->buffer, view);
@ -765,3 +760,43 @@ size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
 size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
    return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
 }
+
+// utils
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+
+    size_t nbytes = 0;
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL && t->view_src == NULL) {
+            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+    }
+
+    if (nbytes == 0) {
+        fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+    ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
+
+    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data == NULL) {
+            if (t->view_src == NULL) {
+                ggml_tallocr_alloc(tallocr, t);
+            } else {
+                ggml_backend_view_init(buffer, t);
+            }
+        }
+    }
+
+    ggml_tallocr_free(tallocr);
+
+    return buffer;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
+    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
+}
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@ -8,6 +8,7 @@ extern "C" {

 struct ggml_backend;
 struct ggml_backend_buffer;
+struct ggml_backend_buffer_type;

 //
 // Legacy API
@ -42,7 +43,7 @@ GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph
 // ggml-backend v2 API
 //

-// Seperate tensor and graph allocator objects
+// Separate tensor and graph allocator objects
 // This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
 // The original API is kept as a wrapper around the new API

@ -80,6 +81,12 @@ GGML_API void   ggml_gallocr_alloc_graph_n(
                    struct ggml_hash_set hash_set,
                    ggml_tallocr_t * hash_node_talloc);

+
+// Utils
+// Create a buffer and allocate all the tensors in a ggml_context
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
+
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@ -12,31 +12,50 @@ extern "C" {
    // Backend buffer
    //

+    // buffer type
+    typedef void * ggml_backend_buffer_type_context_t;
+
+    struct ggml_backend_buffer_type_i {
+        ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
+        size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+    };
+
+    struct ggml_backend_buffer_type {
+        struct ggml_backend_buffer_type_i  iface;
+        ggml_backend_buffer_type_context_t context;
+    };
+
+    // buffer
    typedef void * ggml_backend_buffer_context_t;

    struct ggml_backend_buffer_i {
-        void   (*free_buffer)   (ggml_backend_buffer_t buffer);
-        void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
-        size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
-        void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
-        void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
+        void     (*free_buffer)(ggml_backend_buffer_t buffer);
+        //void     (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        void *   (*get_base)   (ggml_backend_buffer_t buffer);
+        void     (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void     (*set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void     (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
+        void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
    };

    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i iface;
-
-        ggml_backend_t                backend;
+        struct ggml_backend_buffer_i  iface;
+        ggml_backend_buffer_type_t    buft;
        ggml_backend_buffer_context_t context;
-
        size_t size;
    };

-    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
-            struct ggml_backend                  * backend,
+    ggml_backend_buffer_t ggml_backend_buffer_init(
+                   ggml_backend_buffer_type_t      buft,
            struct ggml_backend_buffer_i           iface,
                   ggml_backend_buffer_context_t   context,
                   size_t                          size);

+
    //
    // Backend
    //
@ -49,20 +68,17 @@ extern "C" {
        void (*free)(ggml_backend_t backend);

        // buffer allocation
-        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
+        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);

-        // get buffer alignment
-        size_t (*get_alignment)(ggml_backend_t backend);
-
-        // tensor data access
-        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
+        // (optional) asynchroneous tensor data access
        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        void (*synchronize)     (ggml_backend_t backend);

-        // (optional) copy tensor between different backends, allow for single-copy tranfers
-        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        // (optional) asynchroneous tensor copy
+        void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+        void (*synchronize)     (ggml_backend_t backend);

        // compute graph with a plan
        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
@ -82,6 +98,15 @@ extern "C" {
        ggml_backend_context_t context;
    };

+
+    //
+    // Backend registry
+    //
+
+    typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+
+    void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml-backend.c
+++ b/ggml-backend.c
--- a/ggml-backend.h
+++ b/ggml-backend.h
@ -7,41 +7,44 @@
 extern "C" {
 #endif

+    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;
+
    //
    // Backend buffer
    //

-    struct ggml_backend_buffer;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+    // buffer type
+    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);

-    // backend buffer functions
+    // buffer
    GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
    GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
    GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);

    //
    // Backend
    //

-    struct ggml_backend;
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef void * ggml_backend_graph_plan_t;
-
-    GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);

    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
    GGML_API void         ggml_backend_free(ggml_backend_t backend);

-    GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);

-    GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
-
-    GGML_API void ggml_backend_tensor_set_async(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
@ -57,6 +60,7 @@ extern "C" {

    // tensor copy between different backends
    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy

    //
    // CPU backend
@ -68,8 +72,23 @@ extern "C" {
    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);

    // Create a backend buffer from an existing pointer
-    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
+    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);

+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+
+    //
+    // Backend registry
+    //
+
+    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
+
+    GGML_API size_t                     ggml_backend_reg_get_count(void);
+    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
+    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
+    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);

    //
    // Backend scheduler
@ -131,6 +150,32 @@ extern "C" {
            ggml_backend_sched_t sched,
            struct ggml_cgraph * graph);

+
+    //
+    // Utils
+    //
+
+    struct ggml_backend_graph_copy {
+        ggml_backend_buffer_t buffer;
+        struct ggml_context * ctx_allocated;
+        struct ggml_context * ctx_unallocated;
+        struct ggml_cgraph * graph;
+    };
+
+    // Copy a graph to a different backend
+    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
+    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
+
+    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+
+    // Compare the output of two backends
+    GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+
+    // Tensor initialization
+    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
+    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+
+
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -49,7 +49,15 @@ GGML_API int    ggml_cuda_get_device_count(void);
 GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);

 // backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
+GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+
+GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_API int  ggml_backend_cuda_get_device(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+
+// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

 #ifdef  __cplusplus
 }
--- a/ggml-impl.h
+++ b/ggml-impl.h
@ -232,7 +232,7 @@ bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml
 // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
 size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

-// returns GGML_HAHSHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
 size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);

 // return index, asserts if table is full
--- a/ggml-metal.h
+++ b/ggml-metal.h
@ -99,6 +99,12 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+
+// helper to check if the device supports a specific family
+// ideally, the user code should be doing these checks
+// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);

 #ifdef __cplusplus
 }
--- a/ggml-metal.m
+++ b/ggml-metal.m
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/ggml-quants.c
+++ b/ggml-quants.c
@ -3114,7 +3114,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri

    size_t vl = __riscv_vsetvl_e8m1(qk/2);

-    // These tempory registers are for masking and shift operations
+    // These temporary registers are for masking and shift operations
    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
    vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);

@ -4757,7 +4757,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

            vl = 16;

-            // retreive lane to multiply with scale
+            // retrieve lane to multiply with scale
            vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
            vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
            vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -215,9 +215,9 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this

 #define GGML_MAX_DIMS           4
-#define GGML_MAX_PARAMS         1024
+#define GGML_MAX_PARAMS         2048
 #define GGML_MAX_CONTEXTS       64
-#define GGML_MAX_SRC            6
+#define GGML_MAX_SRC            10
 #define GGML_MAX_NAME           64
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
@ -283,13 +283,27 @@
    const type prefix##3 = (pointer)->array[3]; \
    GGML_UNUSED(prefix##3);

+#define GGML_TENSOR_UNARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
+#define GGML_TENSOR_BINARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
 #ifdef  __cplusplus
 extern "C" {
 #endif

 #if defined(__ARM_NEON) && defined(__CUDACC__)
    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
    typedef __fp16 ggml_fp16_t;
 #else
    typedef uint16_t ggml_fp16_t;
@ -329,6 +343,12 @@ extern "C" {
        GGML_TYPE_COUNT,
    };

+    // precision
+    enum ggml_prec {
+        GGML_PREC_DEFAULT,
+        GGML_PREC_F32,
+    };
+
    enum ggml_backend_type {
        GGML_BACKEND_CPU = 0,
        GGML_BACKEND_GPU = 10,
@ -381,6 +401,7 @@ extern "C" {
        GGML_OP_GROUP_NORM,

        GGML_OP_MUL_MAT,
+        GGML_OP_MUL_MAT_ID,
        GGML_OP_OUT_PROD,

        GGML_OP_SCALE,
@ -407,8 +428,10 @@ extern "C" {
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
-
        GGML_OP_UPSCALE, // nearest interpolate
+        GGML_OP_PAD,
+        GGML_OP_ARGSORT,
+        GGML_OP_LEAKY_RELU,

        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
@ -448,7 +471,8 @@ extern "C" {
        GGML_UNARY_OP_GELU,
        GGML_UNARY_OP_GELU_QUICK,
        GGML_UNARY_OP_SILU,
-        GGML_UNARY_OP_LEAKY
+
+        GGML_UNARY_OP_COUNT,
    };

    enum ggml_object_type {
@ -484,7 +508,6 @@ extern "C" {

        struct ggml_backend_buffer * buffer;

-        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
        size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                   // nb[0] = ggml_type_size(type)
@ -516,7 +539,7 @@ extern "C" {

        void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[12];
+        char padding[8];
    };

    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -621,16 +644,22 @@ extern "C" {
    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
-    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

-    GGML_API int     ggml_blck_size (enum ggml_type type);
-    GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-    GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+    GGML_API int    ggml_blck_size(enum ggml_type type);
+    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+    GGML_DEPRECATED(
+    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+    "use ggml_row_size() instead");

    GGML_API const char * ggml_type_name(enum ggml_type type);
    GGML_API const char * ggml_op_name  (enum ggml_op   op);
    GGML_API const char * ggml_op_symbol(enum ggml_op   op);

+    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+
    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);

    GGML_API bool    ggml_is_quantized(enum ggml_type type);
@ -641,6 +670,11 @@ extern "C" {
    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars

    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

@ -773,6 +807,9 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // dst = a
+    // view(dst, nb1, nb2, nb3, offset) += b
+    // return dst
    GGML_API struct ggml_tensor * ggml_acc(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -937,15 +974,14 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-    GGML_API struct ggml_tensor * ggml_leaky(
+    GGML_API struct ggml_tensor * ggml_leaky_relu(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a, float negative_slope, bool inplace);

    GGML_API struct ggml_tensor * ggml_relu_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-    // TODO: double-check this computation is correct
    GGML_API struct ggml_tensor * ggml_gelu(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
@ -1027,6 +1063,22 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // change the precision of a matrix multiplication
+    // set to GGML_PREC_F32 for higher precision (useful for phi-2)
+    GGML_API void ggml_mul_mat_set_prec(
+            struct ggml_tensor * a,
+            enum ggml_prec       prec);
+
+    // indirect matrix multiplication
+    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
+    GGML_API struct ggml_tensor * ggml_mul_mat_id(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * const as[],
+            int                   n_as,
+            struct ggml_tensor  * ids,
+            int                   id,
+            struct ggml_tensor  * b);
+
    // A: m columns, n rows,
    // B: p columns, n rows,
    // result is m columns, p rows
@ -1234,6 +1286,7 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // supports 3D: a->ne[2] == b->ne[1]
    GGML_API struct ggml_tensor * ggml_get_rows(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -1520,6 +1573,32 @@ extern "C" {
            struct ggml_tensor  * a,
            int                   scale_factor);

+    // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
+    GGML_API struct ggml_tensor * ggml_pad(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                  p0,
+            int                  p1,
+            int                  p2,
+            int                  p3);
+
+    // sort rows
+    enum ggml_sort_order {
+        GGML_SORT_ASC,
+        GGML_SORT_DESC,
+    };
+
+    GGML_API struct ggml_tensor * ggml_argsort(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_sort_order  order);
+
+    // top k elements per row
+    GGML_API struct ggml_tensor * ggml_top_k(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   k);
+
    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@ -1581,7 +1660,6 @@ extern "C" {
            int                   kh);

    // used in sam
-
    GGML_API struct ggml_tensor * ggml_add_rel_pos(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -1756,7 +1834,7 @@ extern "C" {
    GGML_API struct ggml_cgraph * ggml_new_graph         (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
    GGML_API struct ggml_cgraph * ggml_new_graph_custom  (struct ggml_context * ctx, size_t size, bool grads);
    GGML_API struct ggml_cgraph * ggml_graph_dup         (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph * ggml_graph_view        (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
+    GGML_API struct ggml_cgraph   ggml_graph_view        (struct ggml_cgraph * cgraph, int i0, int i1);
    GGML_API void                 ggml_graph_cpy         (struct ggml_cgraph * src, struct ggml_cgraph * dst);
    GGML_API void                 ggml_graph_reset       (struct ggml_cgraph * cgraph);  // zero grads
    GGML_API void                 ggml_graph_clear       (struct ggml_cgraph * cgraph);
--- a/gguf-py/README.md
+++ b/gguf-py/README.md
@ -61,7 +61,7 @@ If you want to publish the package manually for any reason, you need to have `tw
 pip install build twine
 ```

-Then, folow these steps to release a new version:
+Then, follow these steps to release a new version:

 1. Bump the version in `pyproject.toml`.
 2. Build the package:
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -38,6 +38,8 @@ class Keys:
        FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
        USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
        TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
+        EXPERT_COUNT          = "{arch}.expert_count"
+        EXPERT_USED_COUNT     = "{arch}.expert_used_count"

    class Attention:
        HEAD_COUNT        = "{arch}.attention.head_count"
@ -93,6 +95,7 @@ class MODEL_ARCH(IntEnum):
    BLOOM     = auto()
    STABLELM  = auto()
    QWEN      = auto()
+    PHI2      = auto()


 class MODEL_TENSOR(IntEnum):
@ -111,10 +114,14 @@ class MODEL_TENSOR(IntEnum):
    ATTN_NORM       = auto()
    ATTN_NORM_2     = auto()
    ATTN_ROT_EMBD   = auto()
+    FFN_GATE_INP    = auto()
+    FFN_NORM        = auto()
    FFN_GATE        = auto()
    FFN_DOWN        = auto()
    FFN_UP          = auto()
-    FFN_NORM        = auto()
+    FFN_GATE_EXP    = auto()
+    FFN_DOWN_EXP    = auto()
+    FFN_UP_EXP      = auto()
    ATTN_Q_NORM     = auto()
    ATTN_K_NORM     = auto()

@ -134,6 +141,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.BLOOM:          "bloom",
    MODEL_ARCH.STABLELM:       "stablelm",
    MODEL_ARCH.QWEN:           "qwen",
+    MODEL_ARCH.PHI2:           "phi2",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -154,10 +162,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.ATTN_ROT_EMBD:   "blk.{bid}.attn_rot_embd",
    MODEL_TENSOR.ATTN_Q_NORM:     "blk.{bid}.attn_q_norm",
    MODEL_TENSOR.ATTN_K_NORM:     "blk.{bid}.attn_k_norm",
+    MODEL_TENSOR.FFN_GATE_INP:    "blk.{bid}.ffn_gate_inp",
    MODEL_TENSOR.FFN_NORM:        "blk.{bid}.ffn_norm",
    MODEL_TENSOR.FFN_GATE:        "blk.{bid}.ffn_gate",
    MODEL_TENSOR.FFN_DOWN:        "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP:          "blk.{bid}.ffn_up",
+    MODEL_TENSOR.FFN_GATE_EXP:    "blk.{bid}.ffn_gate.{xid}",
+    MODEL_TENSOR.FFN_DOWN_EXP:    "blk.{bid}.ffn_down.{xid}",
+    MODEL_TENSOR.FFN_UP_EXP:      "blk.{bid}.ffn_up.{xid}",
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -172,10 +184,14 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
    ],
    MODEL_ARCH.GPTNEOX: [
        MODEL_TENSOR.TOKEN_EMBD,
@ -336,6 +352,17 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
    MODEL_ARCH.GPT2: [
        # TODO
    ],
+    MODEL_ARCH.PHI2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ]
    # TODO
 }

--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@ -339,6 +339,12 @@ class GGUFWriter:
    def add_clamp_kqv(self, value: float) -> None:
        self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)

+    def add_expert_count(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
+
+    def add_expert_used_count(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)
+
    def add_layer_norm_eps(self, value: float) -> None:
        self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@ -17,6 +17,7 @@ class TensorNameMap:
            "tok_embeddings",                            # llama-pth
            "embeddings.word_embeddings",                # bert
            "language_model.embedding.word_embeddings",  # persimmon
+            "transformer.embd.wte",                      # phi2
        ),

        # Token type embeddings
@ -41,6 +42,7 @@ class TensorNameMap:
            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen
            "output",                    # llama-pth bloom
            "word_embeddings_for_head",  # persimmon
+            "lm_head.linear",            # phi2
        ),

        # Output norm
@ -53,6 +55,7 @@ class TensorNameMap:
            "transformer.norm_f",                      # mpt
            "ln_f",                                    # refact bloom qwen
            "language_model.encoder.final_layernorm",  # persimmon
+            "lm_head.ln",                              # phi2
        ),

        # Rope frequencies
@ -75,6 +78,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.attention.output.LayerNorm",       # bert
            "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
            "model.layers.{bid}.ln1",                               # yi
+            "transformer.h.{bid}.ln",                               # phi2
        ),

        # Attention norm 2
@ -90,6 +94,7 @@ class TensorNameMap:
            "transformer.h.{bid}.self_attention.query_key_value",                  # falcon
            "h.{bid}.self_attention.query_key_value",                              # bloom
            "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
+            "transformer.h.{bid}.mixer.Wqkv",                                      # phi2
        ),

        # Attention query
@ -128,6 +133,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.attention.output.dense",                # bert
            "transformer.h.{bid}.attn.out_proj",                         # gpt-j
            "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
+            "transformer.h.{bid}.mixer.out_proj",                        # phi2
        ),

        # Rotary embeddings
@ -149,6 +155,11 @@ class TensorNameMap:
            "model.layers.{bid}.ln2",                                        # yi
        ),

+        MODEL_TENSOR.FFN_GATE_INP: (
+            "layers.{bid}.feed_forward.gate",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate", # mixtral
+        ),
+
        # Feed-forward up
        MODEL_TENSOR.FFN_UP: (
            "gpt_neox.layers.{bid}.mlp.dense_h_to_4h",                # gptneox
@ -162,13 +173,24 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
            "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
            "transformer.h.{bid}.mlp.w1",                             # qwen
+            "transformer.h.{bid}.mlp.fc1",                            # phi2
+        ),
+
+        MODEL_TENSOR.FFN_UP_EXP: (
+            "layers.{bid}.feed_forward.experts.{xid}.w3",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
        ),

        # Feed-forward gate
        MODEL_TENSOR.FFN_GATE: (
-            "model.layers.{bid}.mlp.gate_proj",  # llama-hf refact
-            "layers.{bid}.feed_forward.w1",      # llama-pth
-            "transformer.h.{bid}.mlp.w2",        # qwen
+            "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact
+            "layers.{bid}.feed_forward.w1",               # llama-pth
+            "transformer.h.{bid}.mlp.w2",                 # qwen
+        ),
+
+        MODEL_TENSOR.FFN_GATE_EXP: (
+            "layers.{bid}.feed_forward.experts.{xid}.w1",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
        ),

        # Feed-forward down
@ -183,6 +205,12 @@ class TensorNameMap:
            "encoder.layer.{bid}.output.dense",                       # bert
            "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
+            "transformer.h.{bid}.mlp.fc2",                            # phi2
+        ),
+
+        MODEL_TENSOR.FFN_DOWN_EXP: (
+            "layers.{bid}.feed_forward.experts.{xid}.w2",           # mixtral
+            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
        ),

        MODEL_TENSOR.ATTN_Q_NORM: (
@ -213,11 +241,14 @@ class TensorNameMap:
            for tensor, keys in self.block_mappings_cfg.items():
                if tensor not in MODEL_TENSORS[arch]:
                    continue
-                tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
-                self.mapping[tensor_name] = (tensor, tensor_name)
-                for key in keys:
-                    key = key.format(bid = bid)
-                    self.mapping[key] = (tensor, tensor_name)
+                # TODO: make this configurable
+                n_experts = 8
+                for xid in range(n_experts):
+                    tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
+                    self.mapping[tensor_name] = (tensor, tensor_name)
+                    for key in keys:
+                        key = key.format(bid = bid, xid = xid)
+                        self.mapping[key] = (tensor, tensor_name)

    def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
        result = self.mapping.get(key)
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@ -109,8 +109,10 @@ class SpecialVocab:
        return True

    def _set_special_token(self, typ: str, tid: Any) -> None:
-        if not isinstance(tid, int) or tid < 0:
+        if not isinstance(tid, int):
            return
+        if tid < 0:
+            raise ValueError(f'invalid value for special token type {typ}: {tid}')
        if self.n_vocab is None or tid < self.n_vocab:
            if typ in self.special_token_ids:
                return
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.6.0"
+version = "0.7.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
--- a/icon.png
+++ b/icon.png
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@ -39,10 +39,11 @@

 #define LLAMA_MAX_RNG_STATE (64*1024)

+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 2
+#define LLAMA_SESSION_VERSION 3

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@ -158,6 +159,22 @@ extern "C" {
        llama_seq_id all_seq_id; // used if seq_id == NULL
    } llama_batch;

+    enum llama_model_kv_override_type {
+        LLAMA_KV_OVERRIDE_INT,
+        LLAMA_KV_OVERRIDE_FLOAT,
+        LLAMA_KV_OVERRIDE_BOOL,
+    };
+
+    struct llama_model_kv_override {
+        char key[128];
+        enum llama_model_kv_override_type tag;
+        union {
+            int64_t int_value;
+            double float_value;
+            bool bool_value;
+        };
+    };
+
    struct llama_model_params {
        int32_t n_gpu_layers; // number of layers to store in VRAM
        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
@ -165,9 +182,13 @@ extern "C" {

        // called with a progress value between 0 and 1, pass NULL to disable
        llama_progress_callback progress_callback;
+
        // context pointer passed to the progress callback
        void * progress_callback_user_data;

+        // override key-value pairs of the model meta data
+        const struct llama_model_kv_override * kv_overrides;
+
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only; // only load the vocabulary, no weights
        bool use_mmap;   // use mmap if possible
@ -191,11 +212,14 @@ extern "C" {
        float    yarn_beta_slow;   // YaRN high correction dim
        uint32_t yarn_orig_ctx;    // YaRN original context size

+        enum ggml_type type_k; // data type for K cache
+        enum ggml_type type_v; // data type for V cache
+
        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
-        bool embedding;  // embedding mode only
+        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embedding;   // embedding mode only
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
    };

    // model quantization parameters
--- a/requirements.txt
+++ b/requirements.txt
@ -1,3 +1,5 @@
 numpy==1.24.4
 sentencepiece==0.1.98
+transformers>=4.34.0
 gguf>=0.1.0
+protobuf>=4.21.0
--- a/scripts/get-flags.mk
+++ b/scripts/get-flags.mk
@ -0,0 +1,38 @@
+ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
+	GF_CC_IS_GCC = 1
+	GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null || $(GF_CC) -dumpversion; } | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+else
+	GF_CC_IS_CLANG = 1
+	ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'
+		GF_CC_IS_LLVM_CLANG = 1
+	else
+		GF_CC_IS_APPLE_CLANG = 1
+	endif
+	GF_CC_VER := \
+		$(shell $(GF_CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
+		| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+endif
+
+ifeq ($(GF_CC_IS_CLANG), 1)
+	# clang options
+	GF_CFLAGS   = -Wunreachable-code-break -Wunreachable-code-return
+	GF_CXXFLAGS = -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
+
+	ifneq '' '$(and $(GF_CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 030800)))'
+		GF_CFLAGS += -Wdouble-promotion
+	endif
+	ifneq '' '$(and $(GF_CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 070300)))'
+		GF_CFLAGS += -Wdouble-promotion
+	endif
+else
+	# gcc options
+	GF_CFLAGS   = -Wdouble-promotion
+	GF_CXXFLAGS = -Wno-array-bounds
+
+	ifeq ($(shell expr $(GF_CC_VER) \>= 070100), 1)
+		GF_CXXFLAGS += -Wno-format-truncation
+	endif
+	ifeq ($(shell expr $(GF_CC_VER) \>= 080100), 1)
+		GF_CXXFLAGS += -Wextra-semi
+	endif
+endif
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@ -20,5 +20,6 @@ cp -rpv ../ggml/include/ggml/ggml.h         ./ggml.h
 cp -rpv ../ggml/include/ggml/ggml-alloc.h   ./ggml-alloc.h
 cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h

-cp -rpv ../ggml/tests/test-opt.cpp    ./tests/test-opt.cpp
-cp -rpv ../ggml/tests/test-grad0.cpp  ./tests/test-grad0.cpp
+cp -rpv ../ggml/tests/test-opt.cpp         ./tests/test-opt.cpp
+cp -rpv ../ggml/tests/test-grad0.cpp       ./tests/test-grad0.cpp
+cp -rpv ../ggml/tests/test-backend-ops.cpp ./tests/test-backend-ops.cpp
--- a/start-server-dialog.sh
+++ b/start-server-dialog.sh
@ -1,343 +0,0 @@
-#!/bin/bash
-
-# Set default values
-model_path="./models/"
-mmproj_path=""
-threads=4
-ctx_size=512
-batch_size=512
-n_gpu_layers=0
-cont_batching="off"
-mlock="off"
-no_mmap="off"
-host="127.0.0.1"
-port="8080"
-advanced_options=""
-
-
-
-model_selection() {
-    # User selects a file or folder
-  exec 3>&1
-  model_path=$(dialog --backtitle "Model Selection" \
-                      --title "Select Model File or Folder" \
-                      --fselect "$HOME/" 14 60 \
-                      2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # If a folder has been selected, search for *.gguf files
-  if [ -d "$model_path" ]; then
-    model_files=($(find "$model_path" -name "*.gguf" 2>/dev/null))
-  elif [ -f "$model_path" ]; then
-    model_files=("$model_path")
-  else
-    dialog --backtitle "Model Selection" \
-           --title "Invalid Selection" \
-           --msgbox "The selected path is not valid." 7 50
-    return
-  fi
-
-# Selection menu for models found
-exec 3>&1
-model_choice=$(dialog --backtitle "Model Selection" \
-                      --title "Select a Model File" \
-                      --menu "Choose one of the found models:" 15 60 4 \
-                      $(for i in "${!model_files[@]}"; do echo "$((i+1))" "$(basename "${model_files[$i]}")"; done) \
-                      2>&1 1>&3)
-exit_status=$?
-exec 3>&-
-
-# Check whether user has selected 'Cancel'
-if [ $exit_status = 1 ]; then
-  return
-fi
-
-# Set path to the selected model
-model_path=${model_files[$((model_choice-1))]}
-}
-
-
-
-multimodal_model_selection() {
-    # User selects a file or folder
-  exec 3>&1
-  mmproj_path=$(dialog --backtitle "Multimodal Model" \
-                      --title "Select Model File or Folder" \
-                      --fselect "$HOME/" 14 60 \
-                      2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # If a folder has been selected, search for *.bin files
-  if [ -d "$mmproj_path" ]; then
-    multi_modal_files=($(find "$mmproj_path" -name "*.bin" 2>/dev/null))
-  elif [ -f "$mmproj_path" ]; then
-    multi_modal_files=("$mmproj_path")
-  else
-    dialog --backtitle "Multimodal Model" \
-           --title "Invalid Selection" \
-           --msgbox "The selected path is not valid." 7 50
-    return
-  fi
-
-# Selection menu for models found
-exec 3>&1
-multi_modal_choice=$(dialog --backtitle "Multimodal Model" \
-                      --title "Select a Model File" \
-                      --menu "Choose one of the found models:" 15 60 4 \
-                      $(for i in "${!multi_modal_files[@]}"; do echo "$((i+1))" "$(basename "${multi_modal_files[$i]}")"; done) \
-                      2>&1 1>&3)
-exit_status=$?
-exec 3>&-
-
-# Check whether user has selected 'Cancel'
-if [ $exit_status = 1 ]; then
-  return
-fi
-
-# Set path to the selected model
-mmproj_path=${multi_modal_files[$((multi_modal_choice-1))]}
-}
-
-
-
-options() {
-  # Show form for entering the options
-  exec 3>&1
-  form_values=$(dialog --backtitle "Options Configuration" \
-                       --title "Set Options" \
-                       --form "Enter the values for the following options:" \
-                       15 50 0 \
-                       "Number of Threads (-t):" 1 1 "$threads" 1 25 25 5 \
-                       "Context Size (-c):" 2 1 "$ctx_size" 2 25 25 5 \
-                       "Batch Size (-b):" 3 1 "$batch_size" 3 25 25 5 \
-                       "GPU Layers (-ngl):" 4 1 "$n_gpu_layers" 4 25 25 5 \
-                       2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Save the entered values in the corresponding variables
-  IFS=$'\n' read -r threads ctx_size batch_size n_gpu_layers <<< "$form_values"
-}
-
-
-
-further_options() {
-  # Initial values for the checkboxes based on current settings
-  cb_value=$([ "$cont_batching" = "on" ] && echo "on" || echo "off")
-  mlock_value=$([ "$mlock" = "on" ] && echo "on" || echo "off")
-  no_mmap_value=$([ "$no_mmap" = "on" ] && echo "on" || echo "off")
-
-  # Show dialog for setting options
-  exec 3>&1
-  choices=$(dialog --backtitle "Further Options" \
-                   --title "Boolean Options" \
-                   --checklist "Select options:" 15 60 3 \
-                   "1" "Continuous Batching (-cb)" $cb_value \
-                   "2" "Memory Lock (--mlock)" $mlock_value \
-                   "3" "No Memory Map (--no-mmap)" $no_mmap_value \
-                   2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Set options based on user selection
-  cont_batching="off"
-  mlock="off"
-  no_mmap="off"
-  for choice in $choices; do
-    case $choice in
-      1) cont_batching="on" ;;
-      2) mlock="on" ;;
-      3) no_mmap="on" ;;
-    esac
-  done
-}
-
-
-
-advanced_options() {
-  # Input fields for Advanced Options
-  exec 3>&1
-  advanced_values=$(dialog --backtitle "Advanced Options" \
-                           --title "Advanced Server Configuration" \
-                           --form "Enter the advanced configuration options:" \
-                           15 60 0 \
-                           "Host IP:" 1 1 "$host" 1 15 15 0 \
-                           "Port:" 2 1 "$port" 2 15 5 0 \
-                           "Additional Options:" 3 1 "$advanced_options" 3 15 30 0 \
-                           2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Read the entries and save them in the corresponding variables
-  read -r host port advanced_options <<< "$advanced_values"
-}
-
-
-
-start_server() {
-  # Compiling the command with the selected options
-  cmd="./server"
-  [ -n "$model_path" ] && cmd+=" -m $model_path"
-  [ -n "$mmproj_path" ] && cmd+=" --mmproj $mmproj_path"
-  [ "$threads" -ne 4 ] && cmd+=" -t $threads"
-  [ "$ctx_size" -ne 512 ] && cmd+=" -c $ctx_size"
-  [ "$batch_size" -ne 512 ] && cmd+=" -b $batch_size"
-  [ "$n_gpu_layers" -ne 0 ] && cmd+=" -ngl $n_gpu_layers"
-  [ "$cont_batching" = "on" ] && cmd+=" -cb"
-  [ "$mlock" = "on" ] && cmd+=" --mlock"
-  [ "$no_mmap" = "off" ] && cmd+=" --no-mmap"
-  [ -n "$host" ] && cmd+=" --host $host"
-  [ -n "$port" ] && cmd+=" --port $port"
-  [ -n "$advanced_options" ] && cmd+=" $advanced_options"
-
-    eval "$cmd"
-  read -p 'Do not forget to quit the server later with Ctrl+C as soon as you are finished. Press Enter to continue...'
-}
-
-
-
-# Function to save the current configuration
-save_config() {
-  exec 3>&1
-  config_file=$(dialog --backtitle "Save Configuration" \
-                       --title "Save Configuration File" \
-                       --fselect "$HOME/" 14 60 \
-                       2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Saving the configuration to the file
-  cat > "$config_file" << EOF
-model_path=$model_path
-mmproj_path=$mmproj_path
-threads=$threads
-ctx_size=$ctx_size
-batch_size=$batch_size
-n_gpu_layers=$n_gpu_layers
-cont_batching=$cont_batching
-mlock=$mlock
-no_mmap=$no_mmap
-host=$host
-port=$port
-advanced_options=$advanced_options
-EOF
-
-  dialog --backtitle "Save Configuration" \
-         --title "Configuration Saved" \
-         --msgbox "Configuration has been saved to $config_file" 7 50
-}
-
-
-
-# Function for loading the configuration from a file
-load_config() {
-  exec 3>&1
-  config_file=$(dialog --backtitle "Load Configuration" \
-                       --title "Load Configuration File" \
-                       --fselect "$HOME/" 14 60 \
-                       2>&1 1>&3)
-  exit_status=$?
-  exec 3>&-
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Check whether the configuration file exists
-  if [ ! -f "$config_file" ]; then
-    dialog --backtitle "Load Configuration" \
-           --title "File Not Found" \
-           --msgbox "The file $config_file was not found." 7 50
-    return
-  fi
-
-  # Load configuration from the file
-  source "$config_file"
-
-  dialog --backtitle "Load Configuration" \
-         --title "Configuration Loaded" \
-         --msgbox "Configuration has been loaded from $config_file" 7 50
-}
-
-
-
-# Function to show the main menu
-show_main_menu() {
-  while true; do
-    exec 3>&1
-    selection=$(dialog \
-      --backtitle "Server Configuration" \
-      --title "Main Menu" \
-      --clear \
-      --cancel-label "Exit" \
-      --menu "Please select:" 15 50 6 \
-      "1" "Model Selection" \
-      "2" "Multimodal Model Selection" \
-      "3" "Options" \
-      "4" "Further Options" \
-      "5" "Advanced Options" \
-      "6" "Save Config" \
-      "7" "Load Config" \
-      "8" "Start Server" \
-      2>&1 1>&3)
-    exit_status=$?
-    exec 3>&-
-
-    # Check whether user has selected 'Exit'
-    if [ $exit_status = 1 ]; then
-      clear
-      exit
-    fi
-
-    # Call up the corresponding function based on the selection
-    case $selection in
-      1) model_selection ;;
-      2) multimodal_model_selection ;;
-      3) options ;;
-      4) further_options ;;
-      5) advanced_options ;;
-      6) save_config ;;
-      7) load_config ;;
-      8) start_server ;;
-      *) clear ;;
-    esac
-  done
-}
-
-
-
-# Show main menu
-show_main_menu
--- a/start-server-zenity.sh
+++ b/start-server-zenity.sh
@ -1,294 +0,0 @@
-#!/bin/bash
-
-# Set default values
-model_path="./models/"
-mmproj_path=""
-threads=4
-ctx_size=512
-batch_size=512
-n_gpu_layers=0
-cont_batching="off"
-mlock="off"
-no_mmap="off"
-host="127.0.0.1"
-port="8080"
-advanced_options=""
-
-
-
-# Function to install Zenity
-install_zenity() {
-    echo "Try to install Zenity with $1..."
-    if ! $1 install zenity -y; then
-        echo "Error: Zenity could not be installed."
-        exit 1
-    fi
-    echo "Zenity was successfully installed."
-}
-
-# Check whether Zenity is already installed
-if ! command -v zenity &> /dev/null; then
-    # Zenity is not installed, try to find the package manager
-    PACKAGE_MANAGERS=(brew apt apt-get yum pacman)
-    for manager in "${PACKAGE_MANAGERS[@]}"; do
-        if command -v $manager &> /dev/null; then
-            # Package manager found, ask the user for permission
-            read -p "Zenity is not installed. Would you like to install Zenity with $manager? (y/N) " response
-            if [[ "$response" =~ ^[Yy]$ ]]; then
-                # User has agreed, install Zenity
-                install_zenity $manager
-                break
-            else
-                echo "Installation canceled."
-                exit 1
-            fi
-        fi
-    done
-    if ! command -v zenity &> /dev/null; then
-        echo "No supported package manager found or Zenity could not be installed. Please install Zenity manually."
-        exit 1
-    fi
-fi
-
-
-
-model_selection() {
-  # User selects a file or folder
-  model_path=$(zenity --file-selection --title="Select Model File or Folder" --filename="$HOME/" --file-filter="*.gguf" --file-filter="*" --width=300 --height=400)
-  exit_status=$?
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # If a folder has been selected, search for *.gguf files
-  if [ -d "$model_path" ]; then
-    model_files=($(find "$model_path" -name "*.gguf" 2>/dev/null))
-  elif [ -f "$model_path" ]; then
-    model_files=("$model_path")
-  else
-    zenity --error --title="Invalid Selection" --text="The selected path is not valid."
-    return
-  fi
-
-  # Selection menu for models found
-  model_choice=$(zenity --list --title="Select a Model File" --column="Index" --column="Model File" $(for i in "${!model_files[@]}"; do echo "$((i+1))" "$(basename "${model_files[$i]}")"; done) --width=300 --height=400)
-  exit_status=$?
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Set path to the selected model
-  model_path=${model_files[$((model_choice-1))]}
-}
-
-
-
-multimodal_model_selection() {
-  # User selects a file or folder
-  mmproj_path=$(zenity --file-selection --title="Select Multimodal Model File or Folder" --filename="$HOME/" --file-filter="*.bin" --file-filter="*" --width=300 --height=400)
-  exit_status=$?
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # If a folder has been selected, search for *.bin files
-  if [ -d "$mmproj_path" ]; then
-    multi_modal_files=($(find "$mmproj_path" -name "*.bin" 2>/dev/null))
-  elif [ -f "$mmproj_path" ]; then
-    multi_modal_files=("$mmproj_path")
-  else
-    zenity --error --title="Invalid Selection" --text="The selected path is not valid."
-    return
-  fi
-
-  # Selection menu for models found
-  multi_modal_choice=$(zenity --list --title="Select a Multimodal Model File" --column="Index" --column="Model File" $(for i in "${!multi_modal_files[@]}"; do echo "$((i+1))" "$(basename "${multi_modal_files[$i]}")"; done) --width=300 --height=400)
-  exit_status=$?
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Set path to the selected model
-  mmproj_path=${multi_modal_files[$((multi_modal_choice-1))]}
-}
-
-
-
-options() {
-  # Show form for entering the options
-  form_values=$(zenity --forms --title="Set Options" --text="Enter the values for the following options:" --add-entry="Number of Threads (-t):" --add-entry="Context Size (-c):" --add-entry="Batch Size (-b):" --add-entry="GPU Layers (-ngl):" --separator="|" --width=300 --height=400)
-  exit_status=$?
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Save the entered values in the corresponding variables
-  IFS="|" read -r threads ctx_size batch_size n_gpu_layers <<< "$form_values"
-}
-
-
-
-further_options() {
-  # Initial values for the checkboxes based on current settings
-  cb_value=$([ "$cont_batching" = "on" ] && echo "TRUE" || echo "FALSE")
-  mlock_value=$([ "$mlock" = "on" ] && echo "TRUE" || echo "FALSE")
-  no_mmap_value=$([ "$no_mmap" = "on" ] && echo "TRUE" || echo "FALSE")
-
-  # Show dialog for setting options
-  choices=$(zenity --list --title="Boolean Options" --text="Select options:" --checklist --column="Select" --column="Option" TRUE "Continuous Batching (-cb)" FALSE "Memory Lock (--mlock)" FALSE "No Memory Map (--no-mmap)" --width=300 --height=400)
-  exit_status=$?
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Set options based on user selection
-  cont_batching="off"
-  mlock="off"
-  no_mmap="off"
-  for choice in $choices; do
-    case $choice in
-      "Continuous Batching (-cb)") cont_batching="on" ;;
-      "Memory Lock (--mlock)") mlock="on" ;;
-      "No Memory Map (--no-mmap)") no_mmap="on" ;;
-    esac
-  done
-}
-
-
-
-advanced_options() {
-  # Input fields for Advanced Options
-  advanced_values=$(zenity --forms --title="Advanced Server Configuration" --text="Enter the advanced configuration options:" --add-entry="Host IP:" --add-entry="Port:" --add-entry="Additional Options:" --separator="|" --width=300 --height=400)
-  exit_status=$?
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Read the entries and save them in the corresponding variables
-  IFS="|" read -r host port advanced_options <<< "$advanced_values"
-}
-
-
-
-start_server() {
-  # Compiling the command with the selected options
-  cmd="./server"
-  [ -n "$model_path" ] && cmd+=" -m $model_path"
-  [ -n "$mmproj_path" ] && cmd+=" --mmproj $mmproj_path"
-  [ "$threads" -ne 4 ] && cmd+=" -t $threads"
-  [ "$ctx_size" -ne 512 ] && cmd+=" -c $ctx_size"
-  [ "$batch_size" -ne 512 ] && cmd+=" -b $batch_size"
-  [ "$n_gpu_layers" -ne 0 ] && cmd+=" -ngl $n_gpu_layers"
-  [ "$cont_batching" = "on" ] && cmd+=" -cb"
-  [ "$mlock" = "on" ] && cmd+=" --mlock"
-  [ "$no_mmap" = "off" ] && cmd+=" --no-mmap"
-  [ -n "$host" ] && cmd+=" --host $host"
-  [ -n "$port" ] && cmd+=" --port $port"
-  [ -n "$advanced_options" ] && cmd+=" $advanced_options"
-
-  eval "$cmd"
-  read -p 'Press Enter to continue...'
-}
-
-
-
-# Function to save the current configuration
-save_config() {
-  config_file=$(zenity --file-selection --title="Save Configuration File" --filename="$HOME/" --width=300 --height=400)
-  exit_status=$?
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Saving the configuration to the file
-  cat > "$config_file" << EOF
-model_path=$model_path
-mmproj_path=$mmproj_path
-threads=$threads
-ctx_size=$ctx_size
-batch_size=$batch_size
-n_gpu_layers=$n_gpu_layers
-cont_batching=$cont_batching
-mlock=$mlock
-no_mmap=$no_mmap
-host=$host
-port=$port
-advanced_options=$advanced_options
-EOF
-
-  zenity --info --title="Configuration Saved" --text="Configuration has been saved to $config_file" --width=300 --height=400
-}
-
-
-
-# Function for loading the configuration from a file
-load_config() {
-  config_file=$(zenity --file-selection --title="Load Configuration File" --filename="$HOME/" --width=300 --height=400)
-  exit_status=$?
-
-  # Check whether user has selected 'Cancel'
-  if [ $exit_status = 1 ]; then
-    return
-  fi
-
-  # Check whether the configuration file exists
-  if [ ! -f "$config_file" ]; then
-    zenity --error --title="File Not Found" --text="The file $config_file was not found." --width=300 --height=400
-    return
-  fi
-
-  # Load configuration from the file
-  source "$config_file"
-
-  zenity --info --title="Configuration Loaded" --text="Configuration has been loaded from $config_file" --width=300 --height=400
-}
-
-
-
-# Function to show the main menu
-show_main_menu() {
-  while true; do
-    selection=$(zenity --list --title="Main Menu" --text="Please select:" --cancel-label="Exit" --column="Index" --column="Option" 1 "Model Selection" 2 "Multimodal Model Selection" 3 "Options" 4 "Further Options" 5 "Advanced Options" 6 "Save Config" 7 "Load Config" 8 "Start Server" --width=300 --height=400)
-    exit_status=$?
-
-    # Check whether user has selected 'Exit'
-    if [ $exit_status = 1 ]; then
-      clear
-      exit
-    fi
-
-    # Call up the corresponding function based on the selection
-    case $selection in
-      1) model_selection ;;
-      2) multimodal_model_selection ;;
-      3) options ;;
-      4) further_options ;;
-      5) advanced_options ;;
-      6) save_config ;;
-      7) load_config ;;
-      8) start_server ;;
-      *) clear ;;
-    esac
-  done
-}
-
-
-
-# Show main menu
-show_main_menu
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -22,26 +22,32 @@ endfunction()
 llama_build_and_test_executable(test-quantize-fns.cpp)
 llama_build_and_test_executable(test-quantize-perf.cpp)
 llama_build_and_test_executable(test-sampling.cpp)
+
 llama_build_executable(test-tokenizer-0-llama.cpp)
 llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+
 llama_build_executable(test-tokenizer-0-falcon.cpp)
 llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+
 llama_build_executable(test-tokenizer-1-llama.cpp)
-llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+llama_test_executable (test-tokenizer-1-llama    test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+llama_test_executable (test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+
 llama_build_executable(test-tokenizer-1-bpe.cpp)
-llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-llama_test_executable(test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
-llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-# llama_test_executable(test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
+llama_test_executable (test-tokenizer-1-falcon           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test_executable (test-tokenizer-1-aquila           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_test_executable (test-tokenizer-1-mpt              test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+llama_test_executable (test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
+llama_test_executable (test-tokenizer-1-gpt-neox         test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+llama_test_executable (test-tokenizer-1-refact           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test_executable (test-tokenizer-1-starcoder        test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+# llama_test_executable (test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
+
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
-llama_build_and_test_executable(test-grad0.cpp) # SLOW
+llama_build_and_test_executable(test-grad0.cpp)
 # llama_build_and_test_executable(test-opt.cpp) # SLOW
+llama_build_and_test_executable(test-backend-ops.cpp)

 llama_build_and_test_executable(test-rope.cpp)

--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@ -1,4 +1,4 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #include "ggml.h"

 #include <cmath>
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@ -117,7 +117,7 @@ static void usage(char * argv[]) {
    printf("  --size SIZE           set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
    printf("  -3                    use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
    printf("  -4                    use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
-    printf("  --op OP               set test opration as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
+    printf("  --op OP               set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
    printf("                        quantize_row_q_dot, vec_dot_q (all)\n");
    printf("  --type TYPE           set test type as");
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
@ -202,7 +202,7 @@ int main(int argc, char * argv[]) {
            }
            int alignment = std::stoi(argv[i]);
            if (alignment < 0 || alignment > MAX_ALIGNMENT) {
-            fprintf(stderr, "error: aligment-offset must be less than %d\n", MAX_ALIGNMENT);
+            fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT);
                invalid_param = true;
                break;
            }
@ -286,7 +286,7 @@ int main(int argc, char * argv[]) {
                        qfns.from_float_reference(test_data1, test_q1, size);
                        return test_q1[0];
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
@ -300,7 +300,7 @@ int main(int argc, char * argv[]) {
                        qfns.from_float(test_data1, test_q1, size);
                        return test_q1[0];
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
@ -315,7 +315,7 @@ int main(int argc, char * argv[]) {
                        qfns.to_float(test_q1, test_out, size);
                        return test_out[0];
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
@ -330,7 +330,7 @@ int main(int argc, char * argv[]) {
                        vdot.from_float(test_data1, test_q1, size);
                        return test_q1[0];
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
@ -347,7 +347,7 @@ int main(int argc, char * argv[]) {
                        qfns.vec_dot(size, &result, test_q1, test_q2);
                        return result;
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");