Merge branch 'github' of https://gitlab.vinai.io/mlbooster/llama.cpp into feature/awq_pr

2023-12-20 11:21:16 +07:00 · 2023-12-20 11:21:16 +07:00 · 741b7fb59b
commit 741b7fb59b
parent 71c0a27fdf 328b83de23
51 changed files with 6069 additions and 1715 deletions
--- a/.editorconfig
+++ b/.editorconfig
@ -15,8 +15,14 @@ indent_size = 4
 [Makefile]
 indent_style = tab
 [scripts/*.mk]
 indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -291,7 +291,12 @@ if (LLAMA_CUBLAS)
        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
        if (LLAMA_STATIC)
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            if (WIN32)
                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
            else ()
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
            endif()
        else()
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()
@ -397,57 +402,102 @@ if (LLAMA_HIPBLAS)
    endif()
 endif()
-if (LLAMA_ALL_WARNINGS)
+function(get_flags CCID CCVER)
-    if (NOT MSVC)
+    set(C_FLAGS "")
-        set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+    set(CXX_FLAGS "")
        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
        set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
        set(host_cxx_flags "")
-        if (CMAKE_C_COMPILER_ID MATCHES "Clang")
+    if (CCID MATCHES "Clang")
-            set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
+        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
-            set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)
+        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
-            if (
+        if (
-                (CMAKE_C_COMPILER_ID STREQUAL "Clang"      AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
+            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
-                (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0)
+            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
-            )
+        )
-                set(c_flags ${c_flags} -Wdouble-promotion)
+            set(C_FLAGS ${C_FLAGS} -Wdouble-promotion)
-            endif()
+        endif()
-        elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+    elseif (CCID STREQUAL "GNU")
-            set(c_flags ${c_flags} -Wdouble-promotion)
+        set(C_FLAGS   -Wdouble-promotion)
-            set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)
+        set(CXX_FLAGS -Wno-array-bounds)
-
+
-            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
+        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-                set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
+            set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation)
-            endif()
+        endif()
-            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
+        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
-                set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
+            set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
            endif()
        endif()
    else()
        # todo : msvc
    endif()
-    set(c_flags   ${c_flags}   ${warning_flags})
+    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
-    set(cxx_flags ${cxx_flags} ${warning_flags})
+    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
-    add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+endfunction()
                        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
                        "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
        set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
        set(C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
                          -Werror=implicit-int -Werror=implicit-function-declaration)
        set(CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)
        set(C_FLAGS   ${WARNING_FLAGS} ${C_FLAGS})
        set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS})
        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
    else()
        # todo : msvc
        set(C_FLAGS   "")
        set(CXX_FLAGS "")
    endif()
 endif()
-if (NOT MSVC)
+if (LLAMA_CUBLAS)
-    set(cuda_flags -Wno-pedantic)
+    set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
-endif()
+    if (NOT MSVC)
-set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
+        set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic)
    endif()
-list(JOIN host_cxx_flags " " cuda_host_flags)  # pass host compiler flags as a single argument
+    if (LLAMA_ALL_WARNINGS AND NOT MSVC)
-if (NOT cuda_host_flags STREQUAL "")
+        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
-    set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
+        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
-endif()
+            set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER})
        endif()
-add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
+        execute_process(
            COMMAND ${NVCC_CMD} -Xcompiler --version
            OUTPUT_VARIABLE CUDA_CCFULLVER
            ERROR_QUIET
        )
        if (NOT CUDA_CCFULLVER MATCHES clang)
            set(CUDA_CCID "GNU")
            execute_process(
                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
                OUTPUT_VARIABLE CUDA_CCVER
                ERROR_QUIET
            )
        else()
            if (CUDA_CCFULLVER MATCHES Apple)
                set(CUDA_CCID "AppleClang")
            else()
                set(CUDA_CCID "Clang")
            endif()
            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
        endif()
        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
        get_flags(${CUDA_CCID} ${CUDA_CCVER})
        list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS)  # pass host compiler flags as a single argument
        if (NOT CUDA_CXX_FLAGS STREQUAL "")
            set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
        endif()
    endif()
    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
 endif()
 if (WIN32)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
@ -471,6 +521,7 @@ endif()
 # Probe the linker: run the C compiler with the configured linker flags plus
 # -Wl,-v and capture what it prints on stderr into `output` (stdout is
 # discarded).
 execute_process(
    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
    ERROR_VARIABLE output
    OUTPUT_QUIET
 )
 # Define HAVE_BUGGY_APPLE_LINKER when the captured text mentions dyld-1015.7.
 # The backslash is doubled on purpose: inside a quoted CMake argument "\."
 # is an identity escape that collapses to a bare ".", which the regex engine
 # would then treat as "any character" instead of a literal dot.
 if (output MATCHES "dyld-1015\\.7")
    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
@ -593,6 +644,11 @@ else()
    message(STATUS "Unknown architecture")
 endif()
 if (MINGW)
    # Target Windows 8 for PrefetchVirtualMemory
    add_compile_definitions(_WIN32_WINNT=0x602)
 endif()
 #
 # POSIX conformance
 #
--- a/152
+++ b/152
@ -26,20 +26,6 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 ifeq '' '$(findstring clang,$(shell $(CC) --version))'
 	CC_IS_GCC=1
 	CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
 else
 	CC_IS_CLANG=1
 	ifeq '' '$(findstring Apple,$(shell $(CC) --version))'
 		CC_IS_LLVM_CLANG=1
 	else
 		CC_IS_APPLE_CLANG=1
 	endif
 	CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
 				| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
 endif
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@ -121,12 +107,12 @@ MK_CXXFLAGS = -std=c++11 -fPIC
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
-MK_CFLAGS        += -Ofast
+MK_CFLAGS     += -Ofast
-MK_HOST_CXXFLAGS += -Ofast
+HOST_CXXFLAGS += -Ofast
-MK_CUDA_CXXFLAGS += -O3
+MK_NVCCFLAGS  += -O3
 else
-MK_CFLAGS        += -O3
+MK_CFLAGS     += -O3
-MK_CXXFLAGS      += -O3
+MK_CXXFLAGS   += -O3
 endif
 # clock_gettime came in POSIX.1b (1993)
@ -220,30 +206,6 @@ MK_CFLAGS    += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
 				-Werror=implicit-function-declaration
 MK_CXXFLAGS  += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
 ifeq ($(CC_IS_CLANG), 1)
 	# clang options
 	MK_CFLAGS        += -Wunreachable-code-break -Wunreachable-code-return
 	MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
 	ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
 		MK_CFLAGS += -Wdouble-promotion
 	endif
 	ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
 		MK_CFLAGS += -Wdouble-promotion
 	endif
 else
 	# gcc options
 	MK_CFLAGS        += -Wdouble-promotion
 	MK_HOST_CXXFLAGS += -Wno-array-bounds
 	ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
 		MK_HOST_CXXFLAGS += -Wno-format-truncation
 	endif
 	ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
 		MK_HOST_CXXFLAGS += -Wextra-semi
 	endif
 endif
 # this version of Apple ld64 is buggy
 ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
 	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@ -294,8 +256,8 @@ ifndef RISCV
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
-	MK_CFLAGS   += -march=native -mtune=native
+	MK_CFLAGS     += -march=native -mtune=native
-	MK_HOST_CXXFLAGS += -march=native -mtune=native
+	HOST_CXXFLAGS += -march=native -mtune=native
 	# Usage AVX-only
 	#MK_CFLAGS   += -mfma -mf16c -mavx
@ -306,12 +268,15 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	#MK_CXXFLAGS += -mssse3
 endif
 # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
 # https://github.com/ggerganov/llama.cpp/issues/2922
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
 	# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
 	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
 	# https://github.com/ggerganov/llama.cpp/issues/2922
 	MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
 	MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
 	# Target Windows 8 for PrefetchVirtualMemory
 	MK_CPPFLAGS += -D_WIN32_WINNT=0x602
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
@ -395,61 +360,64 @@ ifdef LLAMA_CUBLAS
 	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	MK_LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS         += ggml-cuda.o
-	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
+	MK_NVCCFLAGS  = --forward-unknown-to-host-compiler -use_fast_math
 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
 endif
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(LLAMA_CUDA_NVCC)
 else
 	NVCC = nvcc
 endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
-	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
-else ifdef CUDA_POWER_ARCH
+else ifndef CUDA_POWER_ARCH
-	NVCCFLAGS +=
+	MK_NVCCFLAGS += -arch=native
 else
 	NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
-	NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
+	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_FORCE_MMQ
-	NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
-	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
-	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
+	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
 ifdef LLAMA_CUDA_MMV_Y
-	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 else ifdef LLAMA_CUDA_DMMV_Y
-	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
+	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
 else
-	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
+	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_MMV_Y
 ifdef LLAMA_CUDA_F16
-	NVCCFLAGS += -DGGML_CUDA_F16
+	MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_F16
 ifdef LLAMA_CUDA_DMMV_F16
-	NVCCFLAGS += -DGGML_CUDA_F16
+	MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_DMMV_F16
 ifdef LLAMA_CUDA_KQUANTS_ITER
-	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else
-	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
+	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
 ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-	NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
+	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
 else
-	NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 #ifdef LLAMA_CUDA_CUBLAS
-#	NVCCFLAGS += -DGGML_CUDA_CUBLAS
+#	MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
 #endif # LLAMA_CUDA_CUBLAS
 ifdef LLAMA_CUDA_CCBIN
-	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) -c $< -o $@
+	$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endif # LLAMA_CUBLAS
 ifdef LLAMA_CLBLAST
@ -471,9 +439,15 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 endif # LLAMA_CLBLAST
 ifdef LLAMA_HIPBLAS
-	ROCM_PATH	?= /opt/rocm
+
-	HIPCC	    ?= $(ROCM_PATH)/bin/hipcc
+	ifeq ($(wildcard /opt/rocm),)
-	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+		ROCM_PATH	?= /usr
 		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
 	else
 		ROCM_PATH	?= /opt/rocm
 		GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	endif
 	HIPCC                   ?= $(ROCM_PATH)/bin/hipcc
 	LLAMA_CUDA_DMMV_X       ?= 32
 	LLAMA_CUDA_MMV_Y        ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
@ -511,16 +485,22 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
-# combine build flags with cmdline overrides
+GF_CC := $(CC)
-override CFLAGS        := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
+include scripts/get-flags.mk
 override CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
 override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
 override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
 override LDFLAGS       := $(MK_LDFLAGS) $(LDFLAGS)
-# save CXXFLAGS before we add host-only options
+# combine build flags with cmdline overrides
-NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
+override CFLAGS    := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
-override CXXFLAGS += $(HOST_CXXFLAGS)
+BASE_CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
 override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
 override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
 # identify CUDA host compiler
 ifdef LLAMA_CUBLAS
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(GF_CXXFLAGS)
 endif
 #
 # Print build information
@ -730,16 +710,16 @@ tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
 tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
--- a/README.md
+++ b/README.md
@ -10,10 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 ### Hot topics
- **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309**
+- Collecting Apple Silicon performance stats:
- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
+  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
 - Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
 - Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
 ----
@ -96,7 +97,18 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
 - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
 - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
 - [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586)
 - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
 - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
 **Multimodal models:**
 - [x] [Llava 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e)
 - [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
 - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
 **Bindings:**
--- a/common/common.cpp
+++ b/common/common.cpp
@ -658,6 +658,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        } else if (arg == "-h" || arg == "--help") {
            return false;
        } else if (arg == "--version") {
            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
        } else if (arg == "--in-prefix-bos") {
@ -796,6 +800,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("\n");
    printf("options:\n");
    printf("  -h, --help            show this help message and exit\n");
    printf("      --version         show version and build info\n");
    printf("  -i, --interactive     run in interactive mode\n");
    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
--- a/common/log.h
+++ b/common/log.h
@ -61,13 +61,13 @@
 //  #define LOG_TARGET stderr
 //  #include "log.h"
 //
-//  The log target can also be redirected to a diffrent function
+//  The log target can also be redirected to a different function
 //  like so:
 //
-//  #define LOG_TARGET log_handler_diffrent()
+//  #define LOG_TARGET log_handler_different()
 //  #include "log.h"
 //
-//  FILE* log_handler_diffrent()
+//  FILE* log_handler_different()
 //  {
 //      return stderr;
 //  }
@ -421,7 +421,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS
 // Disables logs entirely at runtime.
 //  Makes LOG() and LOG_TEE() produce no output,
-//  untill enabled back.
+//  until enabled back.
 #define log_disable() log_disable_impl()
 // INTERNAL, DO NOT USE
--- a/common/train.cpp
+++ b/common/train.cpp
@ -71,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)
 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
    float scale = 1.0f; // xavier
-    switch (tensor->n_dims) {
+    switch (ggml_n_dims(tensor)) {
        case 1:
            scale /= sqrtf((float) tensor->ne[0]);
            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
@ -119,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct
 }
 struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
-    switch (tensor->n_dims) {
+    switch (ggml_n_dims(tensor)) {
        case 1:
            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
@ -183,25 +183,27 @@ float fclamp(const float v, const float min, const float max) {
 }
 void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
    GGML_ASSERT(tensor->n_dims == 1);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == 1);
    GGML_ASSERT(tensor->ne[2] == 1);
    GGML_ASSERT(tensor->ne[3] == 1);
 }
 void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
    GGML_ASSERT(tensor->n_dims == 2);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == 1);
    GGML_ASSERT(tensor->ne[3] == 1);
 }
 void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
    GGML_ASSERT(tensor->n_dims == 3);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
    GGML_ASSERT(tensor->ne[3] == 1);
 }
 void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
    GGML_ASSERT(tensor->n_dims == 4);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
@ -225,8 +227,8 @@ int64_t get_example_targets_batch(
    bool                   sample_random_offsets
 ) {
    GGML_ASSERT(samples_count > 0);
-    GGML_ASSERT(tokens_input->n_dims  == 2);
+    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(target_probs->n_dims  == 3);
+    GGML_ASSERT(ggml_is_3d(target_probs));
    int64_t n_vocab  = target_probs->ne[0];
    int64_t n_tokens = tokens_input->ne[0];
    int64_t n_batch  = tokens_input->ne[1];
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -77,8 +77,18 @@ class Model:
            self.gguf_writer.add_embedding_length(n_embd)
        if (n_ff := self.hparams.get("intermediate_size")) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
-        if (n_head := self.hparams.get("num_attention_head")) is not None:
+        if (n_head := self.hparams.get("num_attention_heads")) is not None:
            self.gguf_writer.add_head_count(n_head)
        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
        if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
            self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
        if (n_experts := self.hparams.get("num_local_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
            self.gguf_writer.add_expert_used_count(n_experts_used)
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
    def write_tensors(self):
@ -170,6 +180,10 @@ class Model:
            return StableLMModel
        if model_architecture == "QWenLMHeadModel":
            return QwenModel
        if model_architecture == "MixtralForCausalLM":
            return MixtralModel
        if model_architecture == "PhiForCausalLM":
            return Phi2Model
        return Model
    def _is_model_safetensors(self) -> bool:
@ -207,6 +221,10 @@ class Model:
            return gguf.MODEL_ARCH.STABLELM
        if arch == "QWenLMHeadModel":
            return gguf.MODEL_ARCH.QWEN
        if arch == "MixtralForCausalLM":
            return gguf.MODEL_ARCH.LLAMA
        if arch == "PhiForCausalLM":
            return gguf.MODEL_ARCH.PHI2
        raise NotImplementedError(f'Architecture "{arch}" not supported!')
@ -841,6 +859,11 @@ class StableLMModel(Model):
        self.gguf_writer.add_layer_norm_eps(1e-5)
 class MixtralModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()
 class QwenModel(Model):
    @staticmethod
    def token_bytes_to_string(b):
@ -965,6 +988,24 @@ class QwenModel(Model):
            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
            self.gguf_writer.add_tensor(new_name, data)
 class Phi2Model(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["n_layer"]
        self.gguf_writer.add_name("Phi2")
        self.gguf_writer.add_context_length(self.hparams["n_positions"])
        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_head_count(self.hparams["n_head"])
        self.gguf_writer.add_head_count_kv(self.hparams["n_head"])
        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
        self.gguf_writer.add_file_type(self.ftype)
        self.gguf_writer.add_add_bos_token(False)
 ###### CONVERSION LOGIC ######
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@ -3,7 +3,6 @@ from __future__ import annotations
 import json
 import os
 import re
 import struct
 import sys
 from typing import Any, BinaryIO, Sequence
@ -11,43 +10,15 @@ from typing import Any, BinaryIO, Sequence
 import numpy as np
 import torch
 from pathlib import Path
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
 HF_SUBLAYER_TO_GGML = {
    "self_attn.q_proj": "attn_q",
    "self_attn.k_proj": "attn_k",
    "self_attn.v_proj": "attn_v",
    "self_attn.o_proj": "attn_output",
    "mlp.gate_proj": "ffn_gate",
    "mlp.down_proj": "ffn_down",
    "mlp.up_proj": "ffn_up",
    "input_layernorm": "attn_norm",
    "post_attention_layernorm": "ffn_norm",
 }
 def translate_tensor_name(t: str) -> str:
    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
    if match:
        nn = match.group(1)
        sub_layer = match.group(2)
        lora_type = match.group(3)
        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
        if sub_layer_renamed is None:
            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
            sys.exit(1)
        output_string = (
            f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
        )
        return output_string
    else:
        print(f"Error: unrecognized tensor {t}")
        sys.exit(1)
 def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(struct.pack("i", int(params["lora_alpha"])))
-def write_tensor_header(
+def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
 ) -> None:
    sname = name.encode("utf-8")
    fout.write(
        struct.pack(
@ -78,11 +47,12 @@ def write_tensor_header(
    fout.seek((fout.tell() + 31) & -32)
-if len(sys.argv) != 2:
+if len(sys.argv) < 2:
-    print(f"Usage: python {sys.argv[0]} <path>")
+    print(f"Usage: python {sys.argv[0]} <path> [arch]")
    print(
        "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
    )
    print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
    sys.exit(1)
 input_json = os.path.join(sys.argv[1], "adapter_config.json")
@ -90,6 +60,14 @@ input_model = os.path.join(sys.argv[1], "adapter_model.bin")
 output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
 model = torch.load(input_model, map_location="cpu")
 arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
 if arch_name not in gguf.MODEL_ARCH_NAMES.values():
    print(f"Error: unsupported architecture {arch_name}")
    sys.exit(1)
 arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
 name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
 with open(input_json, "r") as f:
    params = json.load(f)
@ -117,6 +95,7 @@ with open(output_path, "wb") as fout:
    write_file_header(fout, params)
    for k, v in model.items():
        orig_k = k
        if k.endswith(".default.weight"):
            k = k.replace(".default.weight", ".weight")
        if k in ["llama_proj.weight", "llama_proj.bias"]:
@ -129,7 +108,32 @@ with open(output_path, "wb") as fout:
            v = v.float()
        t = v.detach().numpy()
-        tname = translate_tensor_name(k)
+
        prefix = "base_model.model."
        if k.startswith(prefix):
            k = k[len(prefix) :]
        lora_suffixes = (".lora_A.weight", ".lora_B.weight")
        if k.endswith(lora_suffixes):
            suffix = k[-len(lora_suffixes[0]):]
            k = k[: -len(lora_suffixes[0])]
        else:
            print(f"Error: unrecognized tensor name {orig_k}")
            sys.exit(1)
        tname = name_map.get_name(k)
        if tname is None:
            print(f"Error: could not map tensor name {orig_k}")
            print(" Note: the arch parameter must be specified if the model is not llama")
            sys.exit(1)
        if suffix == ".lora_A.weight":
            tname += ".weight.loraA"
        elif suffix == ".lora_B.weight":
            tname += ".weight.loraB"
        else:
            assert False
        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
        write_tensor_header(fout, tname, t.shape, t.dtype)
        t.tofile(fout)
--- a/convert.py
+++ b/convert.py
@ -10,6 +10,7 @@ import itertools
 import json
 import math
 import mmap
 import os
 import pickle
 import re
 import signal
@ -18,15 +19,15 @@ import sys
 import time
 import zipfile
 from abc import ABCMeta, abstractmethod
 from collections import OrderedDict
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
+from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, TypeVar, cast
 import numpy as np
 from sentencepiece import SentencePieceProcessor
 import os
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@ -42,6 +43,7 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
 ARCH = gguf.MODEL_ARCH.LLAMA
 DEFAULT_CONCURRENCY = 8
 #
 # data types
 #
@ -62,10 +64,10 @@ class UnquantizedDataType(DataType):
    pass
-DT_F16  = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
+DT_F16  = UnquantizedDataType('F16',  dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
-DT_F32  = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
+DT_F32  = UnquantizedDataType('F32',  dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
-DT_I32  = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
+DT_I32  = UnquantizedDataType('I32',  dtype = np.dtype(np.int16),   valid_conversions = [])
-DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
+DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16),  valid_conversions = ['F32', 'F16', 'Q8_0'])
@dataclass(frozen=True)
@ -151,14 +153,16 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
@dataclass
 class Params:
-    n_vocab:    int
+    n_vocab:        int
-    n_embd:     int
+    n_embd:         int
-    n_layer:    int
+    n_layer:        int
-    n_ctx:      int
+    n_ctx:          int
-    n_ff:       int
+    n_ff:           int
-    n_head:     int
+    n_head:         int
-    n_head_kv:  int
+    n_head_kv:      int
-    f_norm_eps: float
+    n_experts:      int | None = None
    n_experts_used: int | None = None
    f_norm_eps:     float | None = None
    rope_scaling_type: gguf.RopeScalingType | None = None
    f_rope_freq_base: float | None = None
@ -233,6 +237,13 @@ class Params:
            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
        n_experts      = None
        n_experts_used = None
        if "num_local_experts" in config:
            n_experts = config["num_local_experts"]
            n_experts_used = config["num_experts_per_tok"]
        return Params(
            n_vocab           = config["vocab_size"],
            n_embd            = config["hidden_size"],
@ -241,6 +252,8 @@ class Params:
            n_ff              = config["intermediate_size"],
            n_head            = (n_head := config["num_attention_heads"]),
            n_head_kv         = config.get("num_key_value_heads", n_head),
            n_experts         = n_experts,
            n_experts_used    = n_experts_used,
            f_norm_eps        = config["rms_norm_eps"],
            f_rope_freq_base  = config.get("rope_theta"),
            rope_scaling_type = rope_scaling_type,
@ -255,8 +268,15 @@ class Params:
    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
        config = json.load(open(config_path))
        n_experts      = None
        n_experts_used = None
        f_rope_freq_base = None
        # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if config.get("rope_theta") == 1000000:
+        if config.get("moe"):
            # Mixtral
            n_ctx = 32768
        elif config.get("rope_theta") == 1000000:
            # CodeLlama
            n_ctx = 16384
        elif config["norm_eps"] == 1e-05:
@ -266,16 +286,27 @@ class Params:
            # LLaMA v1
            n_ctx = 2048
        if "layers.0.feed_forward.w1.weight" in model:
            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
        if config.get("moe"):
            n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
            n_experts      = config["moe"]["num_experts"]
            n_experts_used = config["moe"]["num_experts_per_tok"]
            f_rope_freq_base = 1e6
        return Params(
            n_vocab          = model["tok_embeddings.weight"].shape[0],
            n_embd           = config["dim"],
            n_layer          = config["n_layers"],
            n_ctx            = n_ctx,
-            n_ff             = model["layers.0.feed_forward.w1.weight"].shape[0],
+            n_ff             = n_ff,
            n_head           = (n_head := config["n_heads"]),
            n_head_kv        = config.get("n_kv_heads", n_head),
            n_experts        = n_experts,
            n_experts_used   = n_experts_used,
            f_norm_eps       = config["norm_eps"],
-            f_rope_freq_base = config.get("rope_theta"),
+            f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
        )
    @staticmethod
@ -297,127 +328,138 @@ class Params:
        return params
-#
+class VocabLoader:
-# vocab
+    def __init__(self, params: Params, fname_tokenizer: Path) -> None:
-#
+        try:
            from transformers import AutoTokenizer
        except ImportError as e:
            raise ImportError(
                "To use VocabLoader, please install the `transformers` package. "
                "You can install it with `pip install transformers`."
            ) from e
-class BpeVocab:
+        try:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
+            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True)
-        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
+        except ValueError:
-        added_tokens: dict[str, int]
+            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
-        if fname_added_tokens is not None:
+
-            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+        self.added_tokens_dict: OrderedDict[str, int] = OrderedDict()
-            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
+
        for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
            if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
                continue
            self.added_tokens_dict[tok] = tokidx
        self.unk_token_id: int = self.tokenizer.unk_token_id
        self.specials: dict[str, int] = {
            tok: self.tokenizer.get_vocab()[tok]
            for tok in self.tokenizer.all_special_tokens
        }
        self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
        self.vocab_size_base: int = self.tokenizer.vocab_size
        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
        self.fname_tokenizer: Path = fname_tokenizer
        vocab_file = "tokenizer.model"
        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
        if path_candidate is not None:
            self.spm = SentencePieceProcessor(str(path_candidate))
            print(self.spm.vocab_size(), self.vocab_size_base)
        else:
-            # Fall back to trying to find the added tokens in tokenizer.json
+            self.spm = None
            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
            if not tokenizer_json_file.is_file():
                added_tokens = {}
            else:
                tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
                added_tokens = dict(
                    (item['content'], item['id'])
                    for item in tokenizer_json.get('added_tokens', [])
                    # Added tokens here can be duplicates of the main vocabulary.
                    if item['content'] not in self.bpe_tokenizer)
-        vocab_size: int = len(self.bpe_tokenizer)
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
+        tokenizer = self.tokenizer
-        actual_ids      = sorted(added_tokens.values())
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.get_vocab().items()}
-        if expected_ids != actual_ids:
+        added_tokens_ids = set(self.added_tokens_dict.values())
            expected_end_id = vocab_size + len(actual_ids) - 1
            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        for i in range(self.vocab_size_base):
-        self.added_tokens_list    = [text for (text, idx) in items]
+            if i in added_tokens_ids:
-        self.vocab_size_base: int = vocab_size
+                continue
        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer      = fname_tokenizer
        self.fname_added_tokens   = fname_added_tokens
-    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+            text = reverse_vocab[i].encode("utf-8")
-        tokenizer = self.bpe_tokenizer
+            yield text, self.get_token_score(i), self.get_token_type(i)
        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
-        for i, _ in enumerate(tokenizer):
+    def get_token_type(self, token_id: int) -> gguf.TokenType:
-            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
+        toktype = gguf.TokenType.NORMAL
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        if self.spm is not None and token_id < self.spm.vocab_size():
-        for text in self.added_tokens_list:
+            if self.spm.is_unknown(token_id):
            score = -1000.0
            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        yield from self.bpe_tokens()
        yield from self.added_tokens()
    def __repr__(self) -> str:
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 class SentencePieceVocab:
    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
        else:
            added_tokens = {}
        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
        actual_new_ids   = sorted(new_tokens.keys())
        if expected_new_ids != actual_new_ids:
            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
        # Token pieces that were added to the base vocabulary.
        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
        self.vocab_size_base    = vocab_size
        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
        self.fname_tokenizer    = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens
    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.sentencepiece_tokenizer
        for i in range(tokenizer.vocab_size()):
            piece = tokenizer.id_to_piece(i)
            text: bytes = piece.encode("utf-8")
            score: float = tokenizer.get_score(i)
            toktype = gguf.TokenType.NORMAL
            if tokenizer.is_unknown(i):
                toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.is_control(i):
+            if self.spm.is_control(token_id):
                toktype = gguf.TokenType.CONTROL
            if self.spm.is_unused(token_id):
                toktype = gguf.TokenType.UNUSED
            if self.spm.is_byte(token_id):
                toktype = gguf.TokenType.BYTE
        else:
            if token_id == self.unk_token_id:
                toktype = gguf.TokenType.UNKNOWN
            if token_id in self.special_ids:
                toktype = gguf.TokenType.CONTROL
-            # NOTE: I think added_tokens are user defined.
+        return toktype
            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
-            if tokenizer.is_unused(i):
+    def get_token_score(self, token_id: int) -> float:
-                toktype = gguf.TokenType.UNUSED
+        if self.spm is not None and token_id < self.spm.vocab_size():
-            if tokenizer.is_byte(i):
+            return cast(float, self.spm.get_score(token_id))
-                toktype = gguf.TokenType.BYTE
+        return 0.0
            yield text, score, toktype
    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
+
-            score = -1000.0
+        for text in self.added_tokens_dict:
-            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+            if text in self.specials:
                toktype = self.get_token_type(self.specials[text])
                score = self.get_token_score(self.specials[text])
            else:
                toktype = gguf.TokenType.USER_DEFINED
                score = -1000.0
            yield text.encode("utf-8"), score, toktype
    def has_newline_token(self) -> bool:
        """Report whether the loaded tokenizer's vocab contains a newline token.

        Returns True when the vocab has either the '<0x0A>' entry or an entry
        that is a literal newline character; False otherwise.
        """
        vocab = self.tokenizer.vocab
        # '<0x0A>' is checked first, matching the original evaluation order.
        return any(candidate in vocab for candidate in ('<0x0A>', '\n'))
    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.sentencepiece_tokens()
+        yield from self.hf_tokens()
        yield from self.added_tokens()
    def get_vocab_type(self) -> str:
        """Pick the tokenizer model name ("llama" or "gpt2") from the vocab files found.

        Files are probed in a fixed order through find_vocab_file_path, starting
        from self.fname_tokenizer:

        1. "tokenizer.model" found -> "llama"
        2. "vocab.json" found      -> "gpt2"
        3. "tokenizer.json" found  -> "llama" if has_newline_token() is true,
                                      otherwise "gpt2"

        Raises:
            FileNotFoundError: none of the three files could be located.
        """
        # Every file name that has been probed so far; reported in the error below.
        path_candidates = []

        def probe(vocab_file: str):
            # Record the attempt before looking the file up, then return the hit (or None).
            path_candidates.append(vocab_file)
            return find_vocab_file_path(self.fname_tokenizer, vocab_file)

        if probe("tokenizer.model") is not None:
            return "llama"

        if probe("vocab.json") is not None:
            return "gpt2"

        if probe("tokenizer.json") is not None:
            # tokenizer.json alone is ambiguous; the presence of a newline token decides.
            return "llama" if self.has_newline_token() else "gpt2"

        raise FileNotFoundError(
            f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; "
            "if it's in another directory, pass the directory as --vocab-dir"
        )
    def __repr__(self) -> str:
-        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+        return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_dict)} added tokens>"
-Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
+Vocab: TypeAlias = 'VocabLoader'
 #
 # data loading
@ -585,7 +627,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
    if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
        # Transformers models put different tensors in different files, but
-        # don't split indivdual tensors between files.
+        # don't split individual tensors between files.
        model: LazyModel = {}
        for mp in models_plus:
            model.update(mp.model)
@ -678,7 +720,7 @@ class LazyUnpickler(pickle.Unpickler):
        return func(*args)
    CLASSES: dict[tuple[str, str], Any] = {
-        # getattr used here as a workaround for mypy not being smart enough to detrmine
+        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
@ -794,20 +836,27 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
            yield result
-def check_vocab_size(params: Params, vocab: Vocab) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
    if params.n_vocab != vocab.vocab_size:
-        assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
+        if params.n_vocab == vocab.vocab_size:
        if params.n_vocab == vocab.vocab_size_base:
            print("Ignoring added_tokens.json since model matches vocab size without it.")
-            vocab.added_tokens_list = []
+            vocab.added_tokens_dict = OrderedDict()
-            vocab.vocab_size = vocab.vocab_size_base
+            vocab.vocab_size = vocab.vocab_size
            return
        if pad_vocab and params.n_vocab > vocab.vocab_size:
            pad_count = params.n_vocab - vocab.vocab_size
            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
            for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
                vocab.added_tokens_dict[f'<dummy{i:05}>'] = -1
            vocab.vocab_size = params.n_vocab
            return
        msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
        if vocab.fname_added_tokens is not None:
            msg += f" combined with {vocab.fname_added_tokens}"
        msg += f" has {vocab.vocab_size})."
-        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
+        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
            msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
        if vocab.vocab_size < params.n_vocab:
            msg += " Possibly try using the --padvocab option."
        raise Exception(msg)
@ -832,7 +881,17 @@ class OutputFile:
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
        self.gguf.add_head_count          (params.n_head)
        self.gguf.add_head_count_kv       (params.n_head_kv)
-        self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)
+
        if params.n_experts:
            self.gguf.add_expert_count(params.n_experts)
        if params.n_experts_used:
            self.gguf.add_expert_used_count(params.n_experts_used)
        if params.f_norm_eps:
            self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
        else:
            raise ValueError('f_norm_eps is None')
        if params.f_rope_freq_base is not None:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)
@ -861,12 +920,8 @@ class OutputFile:
            scores.append(score)
            toktypes.append(toktype)
-        if isinstance(vocab, SentencePieceVocab):
+        vocab_type = vocab.get_vocab_type()
-            self.gguf.add_tokenizer_model("llama")
+        self.gguf.add_tokenizer_model(vocab_type)
        elif isinstance(vocab, BpeVocab):
            self.gguf.add_tokenizer_model("gpt2")
        else:
            raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
        self.gguf.add_token_list(tokens)
        self.gguf.add_token_scores(scores)
        self.gguf.add_token_types(toktypes)
@ -892,8 +947,12 @@ class OutputFile:
        self.gguf.close()
    @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+    def write_vocab_only(
-        check_vocab_size(params, vocab)
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
        of = OutputFile(fname_out, endianess=endianess)
@ -920,8 +979,13 @@ class OutputFile:
        return dt.quantize(arr)
    @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+    def write_all(
-        check_vocab_size(params, vocab)
+        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
        concurrency: int = DEFAULT_CONCURRENCY,
        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
        of = OutputFile(fname_out, endianess=endianess)
@ -956,7 +1020,7 @@ class OutputFile:
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
-    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type
+    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
        return GGMLFileType.AllF32
@ -1079,35 +1143,17 @@ def load_some_model(path: Path) -> ModelPlus:
    return model_plus
-def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
+def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
-    # Be extra-friendly and accept either a file or a directory.  Also, if it's
+    path2 = path / vocab_file
-    # a directory, it might be the model directory, and tokenizer.model might
+    # Use `.parent` instead of /.. to handle the symlink case better.
-    # be in the parent of that.
+    path3 = path.parent / vocab_file
    if path.is_dir():
        vocab_file = "tokenizer.model"
        if vocabtype == 'bpe':
            vocab_file = "vocab.json"
        path2 = path / vocab_file
        # Use `.parent` instead of /.. to handle the symlink case better.
        path3 = path.parent / vocab_file
        if path2.exists():
            path = path2
        elif path3.exists():
            path = path3
        else:
            raise FileNotFoundError(
                f"Could not find {vocab_file} in {path} or its parent; "
                "if it's in another directory, pass the directory as --vocab-dir")
-    print(f"Loading vocab file '{path}', type '{vocabtype}'")
+    if path2.exists():
        return path2
    if path3.exists():
        return path3
-    added_tokens_path = path.parent / "added_tokens.json"
+    return None
    if vocabtype == "bpe":
        return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
    elif vocabtype == "spm":
        return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
    else:
        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
 def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
@ -1146,11 +1192,11 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--outtype",     choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
    parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
-    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin, *.safetensors)")
+    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
    parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
    parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
    parser.add_argument("--bigendian",   action="store_true",    help="model is executed on big endian machine")
    parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
    args = parser.parse_args(args_in)
    if args.awq_path:
@ -1205,12 +1251,13 @@ def main(args_in: list[str] | None = None) -> None:
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        # FIXME: Try to respect vocab_dir somehow?
-        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
+        vocab = VocabLoader(params, args.vocab_dir or args.model)
        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
-                                          load_merges = args.vocabtype == 'bpe',
+                                          load_merges = True,
                                          n_vocab = vocab.vocab_size)
        outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
                                    endianess = endianess, pad_vocab = args.padvocab)
        print(f"Wrote {outfile}")
        return
@ -1218,12 +1265,15 @@ def main(args_in: list[str] | None = None) -> None:
        vocab = model_plus.vocab
    else:
        vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-        vocab = load_vocab(vocab_dir, args.vocabtype)
+        vocab = VocabLoader(params, vocab_dir)
    # FIXME: Try to respect vocab_dir somehow?
    print(f"Vocab info: {vocab}")
    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
-                                      load_merges = args.vocabtype == 'bpe',
+                                      load_merges = True,
                                      n_vocab = vocab.vocab_size)
    print(f"Special vocab info: {special_vocab}")
    model   = model_plus.model
    model   = convert_model_names(model, params)
    ftype   = pick_output_type(model, args.outtype)
@ -1233,7 +1283,8 @@ def main(args_in: list[str] | None = None) -> None:
    params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")
-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
                         concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
    print(f"Wrote {outfile}")
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@ -1258,9 +1258,9 @@ static struct ggml_tensor * forward_lora(
 }
 static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
-    assert(logits->n_dims == 2);
+    assert(ggml_is_matrix(logits));
-    assert(probs->n_dims == 2);
+    assert(ggml_is_matrix(probs));
-    assert(best_samples->n_dims == 1);
+    assert(ggml_is_vector(best_samples));
    assert(logits->ne[1] == best_samples->ne[0]);
    assert(logits->ne[0] == probs->ne[0]);
    assert(logits->ne[1] == probs->ne[1]);
@ -1292,9 +1292,9 @@ static void sample_softmax_batch(
    struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
    struct ggml_tensor * best_samples
 ) {
-    GGML_ASSERT(best_samples->n_dims == 2);
+    GGML_ASSERT(ggml_is_matrix(best_samples));
-    GGML_ASSERT(logits->n_dims == 3);
+    GGML_ASSERT(ggml_is_3d(logits));
-    GGML_ASSERT(probs->n_dims == 3);
+    GGML_ASSERT(ggml_is_3d(probs));
    int n_tokens = best_samples->ne[0];
    int n_batch  = best_samples->ne[1];
    int n_vocab  = logits->ne[0];
@ -1334,7 +1334,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }
 static void print_matrix(struct ggml_tensor * probs) {
-    assert(probs->n_dims == 2);
+    assert(ggml_is_matrix(probs));
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
@ -1386,8 +1386,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu
 static void get_example_targets_batch(
    struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
 ) {
-    GGML_ASSERT(tokens_input->n_dims == 2);
+    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(     targets->n_dims == 3);
+    GGML_ASSERT(ggml_is_3d(targets));
    int n_tokens = tokens_input->ne[0];
    int n_batch  = tokens_input->ne[1];
    GGML_ASSERT(n_tokens == targets->ne[1]);
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@ -129,13 +129,13 @@ int main(int argc, char ** argv)  {
    const ggml_type qtype = GGML_TYPE_Q4_1;
    size_t ctx_size = 0;
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
-    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
    ctx_size += 1024*1024*16;
    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }
 static void print_matrix(struct ggml_tensor * probs) {
-    assert(probs->n_dims == 2);
+    assert(ggml_is_matrix(probs));
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
 static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
    int ct;
-    switch (gg_weights->n_dims){
+    switch (ggml_n_dims(gg_weights)) {
        case 1:
            ct = 0;
            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@ -1110,7 +1110,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor,
        name = ggml_get_name(tensor);
    }
    uint32_t name_len = strlen(name);
-    uint32_t nd = tensor->n_dims;
+    uint32_t nd = ggml_n_dims(tensor);
    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
                       (uint32_t)tensor->ne[1],
                       (uint32_t)tensor->ne[2],
@ -1620,8 +1620,6 @@ int main(int argc, char ** argv) {
    opt->params.adam.gclip              = params.common.adam_gclip;
    opt->params.adam.eps_f              = params.common.adam_eps_f;
    ggml_allocr * alloc = NULL;
    printf("%s: init model\n", __func__);
    bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);
@ -1725,10 +1723,9 @@ int main(int argc, char ** argv) {
    // allocate input tensors
    mem_input_data.resize(max_input_size);
-    alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
+    ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
-    ggml_allocr_alloc(alloc, tokens_input);
+    ggml_allocr_alloc(alloc_inps, tokens_input);
-    ggml_allocr_alloc(alloc, target_probs);
+    ggml_allocr_alloc(alloc_inps, target_probs);
    ggml_allocr_free(alloc);
    // context for compute tensors without their data
    const size_t estimated_compute_size_wo_data = (
@ -1755,7 +1752,7 @@ int main(int argc, char ** argv) {
    // find best evaluation order
    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
        ctx_compute = ggml_init(ctx_compute_params);
-        alloc = ggml_allocr_new_measure(tensor_alignment);
+        ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);
        gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
        gf->order = (enum ggml_cgraph_eval_order) order;
        gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1788,7 +1785,7 @@ int main(int argc, char ** argv) {
    // allocate compute tensors
    mem_compute_data.resize(max_compute_size);
    ctx_compute = ggml_init(ctx_compute_params);
-    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
+    ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
    gf->order = best_order;
    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@ -1804,6 +1801,8 @@ int main(int argc, char ** argv) {
        params.common.use_checkpointing
    );
    ggml_allocr_free(alloc);
    ggml_allocr_free(alloc_inps);
    // tokenize data
    std::vector<llama_token> train_tokens;
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@ -195,7 +195,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
            // print first 10 elements
            const float * data = (const float *) cur->data;
--- a/examples/llama.swiftui/.gitignore
+++ b/examples/llama.swiftui/.gitignore
@ -1 +1,2 @@
 xcuserdata
 xcshareddata
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@ -6,16 +6,34 @@ enum LlamaError: Error {
    case couldNotInitializeContext
 }
 /// Empties `batch` by resetting its token count to zero.
 func llama_batch_clear(_ batch: inout llama_batch) {
    batch.n_tokens = 0
 }
 /// Writes one token into the next free slot of `batch` and advances its token count.
 ///
 /// - Parameters:
 ///   - batch: the batch to append to; the slot used is the current `n_tokens`.
 ///   - id: token stored in `batch.token`.
 ///   - pos: position stored in `batch.pos`.
 ///   - seq_ids: sequence ids copied into `batch.seq_id` for this slot.
 ///   - logits: stored in `batch.logits` as 1 when true, 0 when false.
 func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], _ logits: Bool) {
    // The slot index stays fixed until the count is bumped on the last line.
    let slot = Int(batch.n_tokens)
    batch.token[slot] = id
    batch.pos[slot] = pos
    batch.n_seq_id[slot] = Int32(seq_ids.count)
    for (k, seq_id) in seq_ids.enumerated() {
        batch.seq_id[slot]![k] = seq_id
    }
    batch.logits[slot] = logits ? 1 : 0
    batch.n_tokens += 1
 }
 actor LlamaContext {
    private var model: OpaquePointer
    private var context: OpaquePointer
    private var batch: llama_batch
    private var tokens_list: [llama_token]
    /// This variable is used to store temporarily invalid cchars
    private var temporary_invalid_cchars: [CChar]
-    var n_len: Int32 = 512
+    var n_len: Int32 = 64
    var n_cur: Int32 = 0
    var n_decode: Int32 = 0
    init(model: OpaquePointer, context: OpaquePointer) {
@ -27,25 +45,34 @@ actor LlamaContext {
    }
    // Releases the batch, the context and the model held by this actor, in that
    // order, and then calls llama_backend_free().
    deinit {
        llama_batch_free(batch)
        llama_free(context)
        llama_free_model(model)
        llama_backend_free()
    }
-    static func createContext(path: String) throws -> LlamaContext {
+    static func create_context(path: String) throws -> LlamaContext {
        llama_backend_init(false)
-        let model_params = llama_model_default_params()
+        var model_params = llama_model_default_params()
 #if targetEnvironment(simulator)
        model_params.n_gpu_layers = 0
        print("Running on simulator, force use n_gpu_layers = 0")
 #endif
        let model = llama_load_model_from_file(path, model_params)
        guard let model else {
            print("Could not load model at \(path)")
            throw LlamaError.couldNotInitializeContext
        }
        let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
        print("Using \(n_threads) threads")
        var ctx_params = llama_context_default_params()
-        ctx_params.seed = 1234
+        ctx_params.seed  = 1234
        ctx_params.n_ctx = 2048
-        ctx_params.n_threads = 8
+        ctx_params.n_threads       = UInt32(n_threads)
-        ctx_params.n_threads_batch = 8
+        ctx_params.n_threads_batch = UInt32(n_threads)
        let context = llama_new_context_with_model(model, ctx_params)
        guard let context else {
@ -56,6 +83,26 @@ actor LlamaContext {
        return LlamaContext(model: model, context: context)
    }
    /// Returns the description that `llama_model_desc` writes for the loaded model.
    ///
    /// The text is read as a NUL-terminated C string, so bytes outside the ASCII
    /// range are decoded as UTF-8 instead of being converted one `Int8` at a time
    /// (`UInt8(_:)` traps on negative values).
    func model_info() -> String {
        let capacity = 256
        // One extra, zero-filled byte beyond what the C function is told about, so
        // the buffer is NUL-terminated no matter what gets written into it.
        let result = UnsafeMutablePointer<Int8>.allocate(capacity: capacity + 1)
        result.initialize(repeating: Int8(0), count: capacity + 1)
        defer {
            result.deallocate()
        }
        // The returned count is intentionally not used to size the read: it is not
        // guaranteed here to be within `capacity` (presumably snprintf-like — TODO confirm).
        _ = llama_model_desc(model, result, capacity)
        return String(cString: result)
    }
    /// Number of tokens currently recorded in `batch`.
    func get_n_tokens() -> Int32 {
        batch.n_tokens
    }
@ -79,16 +126,11 @@ actor LlamaContext {
            print(String(cString: token_to_piece(token: id) + [0]))
        }
-        // batch = llama_batch_init(512, 0) // done in init()
+        llama_batch_clear(&batch)
        batch.n_tokens = Int32(tokens_list.count)
-        for i1 in 0..<batch.n_tokens {
+        for i1 in 0..<tokens_list.count {
            let i = Int(i1)
-            batch.token[i] = tokens_list[i]
+            llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
            batch.pos[i] = i1
            batch.n_seq_id[Int(i)] = 1
            batch.seq_id[Int(i)]![0] = 0
            batch.logits[i] = 0
        }
        batch.logits[Int(batch.n_tokens) - 1] = 1 // true
@ -141,18 +183,11 @@ actor LlamaContext {
        print(new_token_str)
        // tokens_list.append(new_token_id)
-        batch.n_tokens = 0
+        llama_batch_clear(&batch)
-
+        llama_batch_add(&batch, new_token_id, n_cur, [0], true)
        batch.token[Int(batch.n_tokens)] = new_token_id
        batch.pos[Int(batch.n_tokens)] = n_cur
        batch.n_seq_id[Int(batch.n_tokens)] = 1
        batch.seq_id[Int(batch.n_tokens)]![0] = 0
        batch.logits[Int(batch.n_tokens)] = 1 // true
        batch.n_tokens += 1
        n_decode += 1
-
+        n_cur    += 1
        n_cur += 1
        if llama_decode(context, batch) != 0 {
            print("failed to evaluate llama!")
@ -161,14 +196,111 @@ actor LlamaContext {
        return new_token_str
    }
    /// Measures prompt-processing and text-generation throughput and formats the
    /// result as a Markdown table.
    ///
    /// - Parameters:
    ///   - pp: number of tokens decoded in a single batch for the prompt-processing run.
    ///   - tg: number of decode steps in the text-generation run.
    ///   - pl: number of entries added per generation step, one per sequence id in `0..<pl`.
    ///   - nr: number of repetitions the speeds are averaged over.
    /// - Returns: a two-row Markdown table (one "pp" row, one "tg" row) with model
    ///   description, size, parameter count, backend and "average ± deviation" in tokens/s.
    func bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) -> String {
        // Running sums; the *_std variables accumulate squared speeds until the end.
        var pp_avg: Double = 0
        var tg_avg: Double = 0
        var pp_std: Double = 0
        var tg_std: Double = 0
        for _ in 0..<nr {
            // bench prompt processing
            llama_batch_clear(&batch)
            let n_tokens = pp
            for i in 0..<n_tokens {
                llama_batch_add(&batch, 0, Int32(i), [0], false)
            }
            // Request logits for the last token only; skip when the batch is empty so
            // index -1 is never written.
            if batch.n_tokens > 0 {
                batch.logits[Int(batch.n_tokens) - 1] = 1 // true
            }
            llama_kv_cache_clear(context)
            let t_pp_start = ggml_time_us()
            if llama_decode(context, batch) != 0 {
                print("llama_decode() failed during prompt")
            }
            let t_pp_end = ggml_time_us()
            // bench text generation
            llama_kv_cache_clear(context)
            let t_tg_start = ggml_time_us()
            for i in 0..<tg {
                llama_batch_clear(&batch)
                for j in 0..<pl {
                    llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
                }
                if llama_decode(context, batch) != 0 {
                    print("llama_decode() failed during text generation")
                }
            }
            let t_tg_end = ggml_time_us()
            llama_kv_cache_clear(context)
            // Timestamps are in microseconds; convert the spans to seconds.
            let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
            let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
            let speed_pp = Double(pp)    / t_pp
            let speed_tg = Double(pl*tg) / t_tg
            pp_avg += speed_pp
            tg_avg += speed_tg
            pp_std += speed_pp * speed_pp
            tg_std += speed_tg * speed_tg
            print("pp \(speed_pp) t/s, tg \(speed_tg) t/s")
        }
        pp_avg /= Double(nr)
        tg_avg /= Double(nr)
        if nr > 1 {
            // Sample variance from the sum of squares and the mean. Rounding can push
            // it slightly below zero when the runs are nearly identical, so clamp it
            // before taking the square root to avoid NaN.
            let pp_var = pp_std / Double(nr - 1) - pp_avg * pp_avg * Double(nr) / Double(nr - 1)
            let tg_var = tg_std / Double(nr - 1) - tg_avg * tg_avg * Double(nr) / Double(nr - 1)
            pp_std = sqrt(max(0.0, pp_var))
            tg_std = sqrt(max(0.0, tg_var))
        } else {
            pp_std = 0
            tg_std = 0
        }
        let model_desc     = model_info()
        let model_size     = String(format: "%.2f GiB", Double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0)
        let model_n_params = String(format: "%.2f B", Double(llama_model_n_params(model)) / 1e9)
        let backend        = "Metal"
        let pp_avg_str     = String(format: "%.2f", pp_avg)
        let tg_avg_str     = String(format: "%.2f", tg_avg)
        let pp_std_str     = String(format: "%.2f", pp_std)
        let tg_std_str     = String(format: "%.2f", tg_std)
        var result = ""
        result += String("| model | size | params | backend | test | t/s |\n")
        result += String("| --- | --- | --- | --- | --- | --- |\n")
        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | pp \(pp) | \(pp_avg_str) ± \(pp_std_str) |\n")
        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | tg \(tg) | \(tg_avg_str) ± \(tg_std_str) |\n")
        return result
    }
    /// Resets the actor's per-conversation state: clears the context's KV cache and
    /// empties both the token list and the buffer of pending invalid C chars.
    func clear() {
        llama_kv_cache_clear(context)
        temporary_invalid_cchars.removeAll()
        tokens_list.removeAll()
    }
    private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
        let utf8Count = text.utf8.count
-        let n_tokens = utf8Count + (add_bos ? 1 : 0)
+        let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
        let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
--- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
+++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
@ -1,481 +1,483 @@
 // !$*UTF8*$!
 {
-    archiveVersion = 1;
+	archiveVersion = 1;
-    classes = {
+	classes = {
-    };
+	};
-    objectVersion = 56;
+	objectVersion = 56;
-    objects = {
+	objects = {
 /* Begin PBXBuildFile section */
-        542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; };
+		542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
-        5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; };
+		5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
-        542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
+		542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
-        542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
+		542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
-        542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; };
+		542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
-        542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
+		542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
-        549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
+		549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
-        549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
+		549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 549479C52AC9E0F200E0F78B /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-fno-objc-arc -DGGML_SWIFT -DGGML_USE_METAL -O3"; }; };
-        8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
+		7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
-        8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
+		8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
-        8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
+		8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
-        8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
+		8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
-        8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
+		8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */; };
-        8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
+		8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
-        8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
+		8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
-        8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
+		8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
 		8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
 /* End PBXBuildFile section */
 /* Begin PBXFileReference section */
-        542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
+		542376062B0D9BEA008E6A1C /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../ggml-quants.h"; sourceTree = "<group>"; };
-        542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
+		542376072B0D9BFB008E6A1C /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../ggml-quants.c"; sourceTree = "<group>"; };
-        542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
+		542376092B0D9C40008E6A1C /* ggml-backend.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../ggml-backend.h"; sourceTree = "<group>"; };
-        5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
+		5423760A2B0D9C4B008E6A1C /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../ggml-backend.c"; sourceTree = "<group>"; };
-        542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
+		542EA09B2AC8723900A8AEE9 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../ggml.c; sourceTree = "<group>"; };
-        542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
+		542EA09C2AC8723900A8AEE9 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../ggml.h; sourceTree = "<group>"; };
-        542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
+		542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../ggml-alloc.h"; sourceTree = "<group>"; };
-        542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
+		542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../ggml-alloc.c"; sourceTree = "<group>"; };
-        542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
+		542EA0A12AC8729100A8AEE9 /* llama.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = llama.cpp; path = ../../llama.cpp; sourceTree = "<group>"; };
-        542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
+		542EA0A22AC8729100A8AEE9 /* llama.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = llama.h; path = ../../llama.h; sourceTree = "<group>"; };
-        549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
+		549479C52AC9E0F200E0F78B /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../ggml-metal.m"; sourceTree = "<group>"; };
-        549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
+		549479C62AC9E0F200E0F78B /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../ggml-metal.h"; sourceTree = "<group>"; };
-        549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
+		549479C82AC9E10B00E0F78B /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../ggml-metal.metal"; sourceTree = "<group>"; };
-        549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
+		549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
-        8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
+		7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
-        8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "bridging-header.h"; sourceTree = "<group>"; };
-        8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
+		8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
-        8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
+		8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
-        8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
-        8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
+		8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
-        8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
+		8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
-        8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */ = {isa = PBXFileReference; lastKnownFileType = file; path = "llama-2-7b-chat.Q2_K.gguf"; sourceTree = "<group>"; };
+		8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
-        8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
+		8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
-        8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
+		8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
-        8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
+		8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 /* Begin PBXFrameworksBuildPhase section */
-        8A1C83702AC328BD0096AF73 /* Frameworks */ = {
+		8A1C83702AC328BD0096AF73 /* Frameworks */ = {
-            isa = PBXFrameworksBuildPhase;
+			isa = PBXFrameworksBuildPhase;
-            buildActionMask = 2147483647;
+			buildActionMask = 2147483647;
-            files = (
+			files = (
-                549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
+				549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
-                8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
+				8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
-            );
+			);
-            runOnlyForDeploymentPostprocessing = 0;
+			runOnlyForDeploymentPostprocessing = 0;
-        };
+		};
 /* End PBXFrameworksBuildPhase section */
 /* Begin PBXGroup section */
-        8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
+		8A08D1F62AC7383900FE6CD4 /* llama.cpp */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
+				5423760A2B0D9C4B008E6A1C /* ggml-backend.c */,
-                542376092B0D9C40008E6A1C /* ggml-backend.h */,
+				542376092B0D9C40008E6A1C /* ggml-backend.h */,
-                542376062B0D9BEA008E6A1C /* ggml-quants.h */,
+				542376062B0D9BEA008E6A1C /* ggml-quants.h */,
-                542376072B0D9BFB008E6A1C /* ggml-quants.c */,
+				542376072B0D9BFB008E6A1C /* ggml-quants.c */,
-                549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
+				549479C82AC9E10B00E0F78B /* ggml-metal.metal */,
-                549479C62AC9E0F200E0F78B /* ggml-metal.h */,
+				549479C62AC9E0F200E0F78B /* ggml-metal.h */,
-                549479C52AC9E0F200E0F78B /* ggml-metal.m */,
+				549479C52AC9E0F200E0F78B /* ggml-metal.m */,
-                542EA09B2AC8723900A8AEE9 /* ggml.c */,
+				542EA09B2AC8723900A8AEE9 /* ggml.c */,
-                542EA09C2AC8723900A8AEE9 /* ggml.h */,
+				542EA09C2AC8723900A8AEE9 /* ggml.h */,
-                542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
+				542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */,
-                542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
+				542EA09E2AC8725700A8AEE9 /* ggml-alloc.h */,
-                542EA0A12AC8729100A8AEE9 /* llama.cpp */,
+				542EA0A12AC8729100A8AEE9 /* llama.cpp */,
-                542EA0A22AC8729100A8AEE9 /* llama.h */,
+				542EA0A22AC8729100A8AEE9 /* llama.h */,
-            );
+			);
-            name = llama.cpp;
+			name = llama.cpp;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A1C836A2AC328BD0096AF73 = {
+		8A1C836A2AC328BD0096AF73 = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
+				8A08D1F62AC7383900FE6CD4 /* llama.cpp */,
-                8A907F312AC7134E006146EA /* llama.cpp.swift */,
+				8A907F312AC7134E006146EA /* llama.cpp.swift */,
-                8A3F84232AC4C891005E2EE8 /* models */,
+				8A3F84232AC4C891005E2EE8 /* models */,
-                8A1C83752AC328BD0096AF73 /* llama.swiftui */,
+				8A1C83752AC328BD0096AF73 /* llama.swiftui */,
-                8A1C83742AC328BD0096AF73 /* Products */,
+				8A1C83742AC328BD0096AF73 /* Products */,
-                8A39BE082AC7601000BFEB40 /* Frameworks */,
+				8A39BE082AC7601000BFEB40 /* Frameworks */,
-            );
+			);
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A1C83742AC328BD0096AF73 /* Products */ = {
+		8A1C83742AC328BD0096AF73 /* Products */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
+				8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
-            );
+			);
-            name = Products;
+			name = Products;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
+		8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A3F84102AC4BD85005E2EE8 /* Resources */,
+				8A3F84102AC4BD85005E2EE8 /* Resources */,
-                8A9F7C4B2AC332DC008AE1EA /* Models */,
+				8A9F7C4B2AC332DC008AE1EA /* Models */,
-                8A9F7C4A2AC332BF008AE1EA /* UI */,
+				8A9F7C4A2AC332BF008AE1EA /* UI */,
-                8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
+				8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
-                8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
+				8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
-                8A1C837C2AC328BE0096AF73 /* Preview Content */,
+				8A1C837C2AC328BE0096AF73 /* Preview Content */,
-            );
+			);
-            path = llama.swiftui;
+			path = llama.swiftui;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
+		8A1C837C2AC328BE0096AF73 /* Preview Content */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
+				8A1C837D2AC328BE0096AF73 /* Preview Assets.xcassets */,
-            );
+			);
-            path = "Preview Content";
+			path = "Preview Content";
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A39BE082AC7601000BFEB40 /* Frameworks */ = {
+		8A39BE082AC7601000BFEB40 /* Frameworks */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                549479CA2AC9E16000E0F78B /* Metal.framework */,
+				549479CA2AC9E16000E0F78B /* Metal.framework */,
-                8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
+				8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
-            );
+			);
-            name = Frameworks;
+			name = Frameworks;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A3F84102AC4BD85005E2EE8 /* Resources */ = {
+		8A3F84102AC4BD85005E2EE8 /* Resources */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A3F84112AC4BD8C005E2EE8 /* models */,
+				8A3F84112AC4BD8C005E2EE8 /* models */,
-            );
+			);
-            path = Resources;
+			path = Resources;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A3F84112AC4BD8C005E2EE8 /* models */ = {
+		8A3F84112AC4BD8C005E2EE8 /* models */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A3F841F2AC4C824005E2EE8 /* llama-2-7b-chat.Q2_K.gguf */,
+			);
-            );
+			path = models;
-            path = models;
+			sourceTree = "<group>";
-            sourceTree = "<group>";
+		};
-        };
+		8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
-        8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
+			isa = PBXGroup;
-            isa = PBXGroup;
+			children = (
-            children = (
+				8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
-                8A08D20A2AC73B1500FE6CD4 /* bridging-header.h */,
+				8A907F322AC7134E006146EA /* LibLlama.swift */,
-                8A907F322AC7134E006146EA /* LibLlama.swift */,
+			);
-            );
+			path = llama.cpp.swift;
-            path = llama.cpp.swift;
+			sourceTree = "<group>";
-            sourceTree = "<group>";
+		};
-        };
+		8A9F7C4A2AC332BF008AE1EA /* UI */ = {
-        8A9F7C4A2AC332BF008AE1EA /* UI */ = {
+			isa = PBXGroup;
-            isa = PBXGroup;
+			children = (
-            children = (
+				7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
-                8A1C83782AC328BD0096AF73 /* ContentView.swift */,
+				8A1C83782AC328BD0096AF73 /* ContentView.swift */,
-            );
+			);
-            path = UI;
+			path = UI;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
-        8A9F7C4B2AC332DC008AE1EA /* Models */ = {
+		8A9F7C4B2AC332DC008AE1EA /* Models */ = {
-            isa = PBXGroup;
+			isa = PBXGroup;
-            children = (
+			children = (
-                8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
+				8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
-            );
+			);
-            path = Models;
+			path = Models;
-            sourceTree = "<group>";
+			sourceTree = "<group>";
-        };
+		};
 /* End PBXGroup section */
 /* Begin PBXNativeTarget section */
-        8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
+		8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
-            isa = PBXNativeTarget;
+			isa = PBXNativeTarget;
-            buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
+			buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
-            buildPhases = (
+			buildPhases = (
-                8A1C836F2AC328BD0096AF73 /* Sources */,
+				8A1C836F2AC328BD0096AF73 /* Sources */,
-                8A1C83702AC328BD0096AF73 /* Frameworks */,
+				8A1C83702AC328BD0096AF73 /* Frameworks */,
-                8A1C83712AC328BD0096AF73 /* Resources */,
+				8A1C83712AC328BD0096AF73 /* Resources */,
-            );
+			);
-            buildRules = (
+			buildRules = (
-            );
+			);
-            dependencies = (
+			dependencies = (
-            );
+			);
-            name = llama.swiftui;
+			name = llama.swiftui;
-            packageProductDependencies = (
+			packageProductDependencies = (
-            );
+			);
-            productName = llama.swiftui;
+			productName = llama.swiftui;
-            productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
+			productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
-            productType = "com.apple.product-type.application";
+			productType = "com.apple.product-type.application";
-        };
+		};
 /* End PBXNativeTarget section */
 /* Begin PBXProject section */
-        8A1C836B2AC328BD0096AF73 /* Project object */ = {
+		8A1C836B2AC328BD0096AF73 /* Project object */ = {
-            isa = PBXProject;
+			isa = PBXProject;
-            attributes = {
+			attributes = {
-                BuildIndependentTargetsInParallel = 1;
+				BuildIndependentTargetsInParallel = 1;
-                LastSwiftUpdateCheck = 1500;
+				LastSwiftUpdateCheck = 1500;
-                LastUpgradeCheck = 1500;
+				LastUpgradeCheck = 1500;
-                TargetAttributes = {
+				TargetAttributes = {
-                    8A1C83722AC328BD0096AF73 = {
+					8A1C83722AC328BD0096AF73 = {
-                        CreatedOnToolsVersion = 15.0;
+						CreatedOnToolsVersion = 15.0;
-                        LastSwiftMigration = 1500;
+						LastSwiftMigration = 1500;
-                    };
+					};
-                };
+				};
-            };
+			};
-            buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
+			buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
-            compatibilityVersion = "Xcode 14.0";
+			compatibilityVersion = "Xcode 14.0";
-            developmentRegion = en;
+			developmentRegion = en;
-            hasScannedForEncodings = 0;
+			hasScannedForEncodings = 0;
-            knownRegions = (
+			knownRegions = (
-                en,
+				en,
-                Base,
+				Base,
-            );
+			);
-            mainGroup = 8A1C836A2AC328BD0096AF73;
+			mainGroup = 8A1C836A2AC328BD0096AF73;
-            packageReferences = (
+			packageReferences = (
-            );
+			);
-            productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
+			productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
-            projectDirPath = "";
+			projectDirPath = "";
-            projectRoot = "";
+			projectRoot = "";
-            targets = (
+			targets = (
-                8A1C83722AC328BD0096AF73 /* llama.swiftui */,
+				8A1C83722AC328BD0096AF73 /* llama.swiftui */,
-            );
+			);
-        };
+		};
 /* End PBXProject section */
 /* Begin PBXResourcesBuildPhase section */
-        8A1C83712AC328BD0096AF73 /* Resources */ = {
+		8A1C83712AC328BD0096AF73 /* Resources */ = {
-            isa = PBXResourcesBuildPhase;
+			isa = PBXResourcesBuildPhase;
-            buildActionMask = 2147483647;
+			buildActionMask = 2147483647;
-            files = (
+			files = (
-                542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
+				542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
-                8A3F84242AC4C891005E2EE8 /* models in Resources */,
+				8A3F84242AC4C891005E2EE8 /* models in Resources */,
-                8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
+				8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
-                8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
+				8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
-            );
+			);
-            runOnlyForDeploymentPostprocessing = 0;
+			runOnlyForDeploymentPostprocessing = 0;
-        };
+		};
 /* End PBXResourcesBuildPhase section */
 /* Begin PBXSourcesBuildPhase section */
-        8A1C836F2AC328BD0096AF73 /* Sources */ = {
+		8A1C836F2AC328BD0096AF73 /* Sources */ = {
-            isa = PBXSourcesBuildPhase;
+			isa = PBXSourcesBuildPhase;
-            buildActionMask = 2147483647;
+			buildActionMask = 2147483647;
-            files = (
+			files = (
-                542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
+				542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
-                549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
+				549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
-                542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
+				542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
-                8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
+				8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
-                542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
+				542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
-                8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
+				8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
-                8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
+				8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
-                8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
+				8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
-                542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
+				7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */,
-                5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
+				542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */,
-            );
+				5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */,
-            runOnlyForDeploymentPostprocessing = 0;
+			);
-        };
+			runOnlyForDeploymentPostprocessing = 0;
 		};
 /* End PBXSourcesBuildPhase section */
 /* Begin XCBuildConfiguration section */
-        8A1C837F2AC328BE0096AF73 /* Debug */ = {
+		8A1C837F2AC328BE0096AF73 /* Debug */ = {
-            isa = XCBuildConfiguration;
+			isa = XCBuildConfiguration;
-            buildSettings = {
+			buildSettings = {
-                ALWAYS_SEARCH_USER_PATHS = NO;
+				ALWAYS_SEARCH_USER_PATHS = NO;
-                ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
+				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
-                CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NONNULL = YES;
-                CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-                CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
-                CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_MODULES = YES;
-                CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
-                CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
-                CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
-                CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
-                CLANG_WARN_COMMA = YES;
+				CLANG_WARN_COMMA = YES;
-                CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
-                CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
-                CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
-                CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
-                CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
-                CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
-                CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
-                CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
-                CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
-                CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
-                CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
-                CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
-                CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
+				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
-                CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
-                CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
-                CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
-                CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
-                CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
-                CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
-                COPY_PHASE_STRIP = NO;
+				COPY_PHASE_STRIP = NO;
-                DEBUG_INFORMATION_FORMAT = dwarf;
+				DEBUG_INFORMATION_FORMAT = dwarf;
-                ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
-                ENABLE_TESTABILITY = YES;
+				ENABLE_TESTABILITY = YES;
-                ENABLE_USER_SCRIPT_SANDBOXING = YES;
+				ENABLE_USER_SCRIPT_SANDBOXING = YES;
-                GCC_C_LANGUAGE_STANDARD = gnu17;
+				GCC_C_LANGUAGE_STANDARD = gnu17;
-                GCC_DYNAMIC_NO_PIC = NO;
+				GCC_DYNAMIC_NO_PIC = NO;
-                GCC_NO_COMMON_BLOCKS = YES;
+				GCC_NO_COMMON_BLOCKS = YES;
-                GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_OPTIMIZATION_LEVEL = 0;
-                GCC_PREPROCESSOR_DEFINITIONS = (
+				GCC_PREPROCESSOR_DEFINITIONS = (
-                    "DEBUG=1",
+					"DEBUG=1",
-                    "$(inherited)",
+					"$(inherited)",
-                );
+				);
-                GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
-                GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
-                GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
-                GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
-                GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_FUNCTION = YES;
-                GCC_WARN_UNUSED_VARIABLE = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
-                IPHONEOS_DEPLOYMENT_TARGET = 17.0;
+				IPHONEOS_DEPLOYMENT_TARGET = 17.0;
-                LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
+				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
-                MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
+				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
-                MTL_FAST_MATH = YES;
+				MTL_FAST_MATH = YES;
-                ONLY_ACTIVE_ARCH = YES;
+				ONLY_ACTIVE_ARCH = YES;
-                SDKROOT = iphoneos;
+				SDKROOT = iphoneos;
-                SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
+				SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
-                SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
-            };
+			};
-            name = Debug;
+			name = Debug;
-        };
+		};
-        8A1C83802AC328BE0096AF73 /* Release */ = {
+		8A1C83802AC328BE0096AF73 /* Release */ = {
-            isa = XCBuildConfiguration;
+			isa = XCBuildConfiguration;
-            buildSettings = {
+			buildSettings = {
-                ALWAYS_SEARCH_USER_PATHS = NO;
+				ALWAYS_SEARCH_USER_PATHS = NO;
-                ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
+				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
-                CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NONNULL = YES;
-                CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-                CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
-                CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_MODULES = YES;
-                CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
-                CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
-                CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
-                CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
-                CLANG_WARN_COMMA = YES;
+				CLANG_WARN_COMMA = YES;
-                CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
-                CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
-                CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
-                CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
-                CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
-                CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
-                CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
-                CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
-                CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
-                CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
-                CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
-                CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
-                CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
+				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
-                CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
-                CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
-                CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
-                CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
-                CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
-                CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
-                COPY_PHASE_STRIP = NO;
+				COPY_PHASE_STRIP = NO;
-                DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
-                ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_NS_ASSERTIONS = NO;
-                ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
-                ENABLE_USER_SCRIPT_SANDBOXING = YES;
+				ENABLE_USER_SCRIPT_SANDBOXING = YES;
-                GCC_C_LANGUAGE_STANDARD = gnu17;
+				GCC_C_LANGUAGE_STANDARD = gnu17;
-                GCC_NO_COMMON_BLOCKS = YES;
+				GCC_NO_COMMON_BLOCKS = YES;
-                GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
-                GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
-                GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
-                GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
-                GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_FUNCTION = YES;
-                GCC_WARN_UNUSED_VARIABLE = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
-                IPHONEOS_DEPLOYMENT_TARGET = 17.0;
+				IPHONEOS_DEPLOYMENT_TARGET = 17.0;
-                LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
+				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
-                MTL_ENABLE_DEBUG_INFO = NO;
+				MTL_ENABLE_DEBUG_INFO = NO;
-                MTL_FAST_MATH = YES;
+				MTL_FAST_MATH = YES;
-                SDKROOT = iphoneos;
+				SDKROOT = iphoneos;
-                SWIFT_COMPILATION_MODE = wholemodule;
+				SWIFT_COMPILATION_MODE = wholemodule;
-                VALIDATE_PRODUCT = YES;
+				VALIDATE_PRODUCT = YES;
-            };
+			};
-            name = Release;
+			name = Release;
-        };
+		};
-        8A1C83822AC328BE0096AF73 /* Debug */ = {
+		8A1C83822AC328BE0096AF73 /* Debug */ = {
-            isa = XCBuildConfiguration;
+			isa = XCBuildConfiguration;
-            buildSettings = {
+			buildSettings = {
-                ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
-                ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
+				ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
-                CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_MODULES = YES;
-                CODE_SIGN_STYLE = Automatic;
+				CODE_SIGN_STYLE = Automatic;
-                CURRENT_PROJECT_VERSION = 1;
+				CURRENT_PROJECT_VERSION = 1;
-                DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
+				DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
-                DEVELOPMENT_TEAM = STLSG3FG8Q;
+				DEVELOPMENT_TEAM = STLSG3FG8Q;
-                ENABLE_PREVIEWS = YES;
+				ENABLE_PREVIEWS = YES;
-                GENERATE_INFOPLIST_FILE = YES;
+				GENERATE_INFOPLIST_FILE = YES;
-                INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
+				INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
-                INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
+				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
-                INFOPLIST_KEY_UILaunchScreen_Generation = YES;
+				INFOPLIST_KEY_UILaunchScreen_Generation = YES;
-                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-                IPHONEOS_DEPLOYMENT_TARGET = 16.0;
+				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
-                LD_RUNPATH_SEARCH_PATHS = (
+				LD_RUNPATH_SEARCH_PATHS = (
-                    "$(inherited)",
+					"$(inherited)",
-                    "@executable_path/Frameworks",
+					"@executable_path/Frameworks",
-                );
+				);
-                MARKETING_VERSION = 1.0;
+				MARKETING_VERSION = 1.0;
-                PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
+				PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
-                PRODUCT_NAME = "$(TARGET_NAME)";
+				PRODUCT_NAME = "$(TARGET_NAME)";
-                SWIFT_EMIT_LOC_STRINGS = YES;
+				SWIFT_EMIT_LOC_STRINGS = YES;
-                SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
+				SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
-                SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
-                SWIFT_VERSION = 5.0;
+				SWIFT_VERSION = 5.0;
-                TARGETED_DEVICE_FAMILY = "1,2";
+				TARGETED_DEVICE_FAMILY = "1,2";
-            };
+			};
-            name = Debug;
+			name = Debug;
-        };
+		};
-        8A1C83832AC328BE0096AF73 /* Release */ = {
+		8A1C83832AC328BE0096AF73 /* Release */ = {
-            isa = XCBuildConfiguration;
+			isa = XCBuildConfiguration;
-            buildSettings = {
+			buildSettings = {
-                ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
-                ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
+				ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
-                CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_MODULES = YES;
-                CODE_SIGN_STYLE = Automatic;
+				CODE_SIGN_STYLE = Automatic;
-                CURRENT_PROJECT_VERSION = 1;
+				CURRENT_PROJECT_VERSION = 1;
-                DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
+				DEVELOPMENT_ASSET_PATHS = "\"llama.swiftui/Preview Content\"";
-                DEVELOPMENT_TEAM = STLSG3FG8Q;
+				DEVELOPMENT_TEAM = STLSG3FG8Q;
-                ENABLE_PREVIEWS = YES;
+				ENABLE_PREVIEWS = YES;
-                GENERATE_INFOPLIST_FILE = YES;
+				GENERATE_INFOPLIST_FILE = YES;
-                INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
+				INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
-                INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
+				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
-                INFOPLIST_KEY_UILaunchScreen_Generation = YES;
+				INFOPLIST_KEY_UILaunchScreen_Generation = YES;
-                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-                INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
+				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-                IPHONEOS_DEPLOYMENT_TARGET = 16.0;
+				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
-                LD_RUNPATH_SEARCH_PATHS = (
+				LD_RUNPATH_SEARCH_PATHS = (
-                    "$(inherited)",
+					"$(inherited)",
-                    "@executable_path/Frameworks",
+					"@executable_path/Frameworks",
-                );
+				);
-                MARKETING_VERSION = 1.0;
+				MARKETING_VERSION = 1.0;
-                PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
+				PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
-                PRODUCT_NAME = "$(TARGET_NAME)";
+				PRODUCT_NAME = "$(TARGET_NAME)";
-                SWIFT_EMIT_LOC_STRINGS = YES;
+				SWIFT_EMIT_LOC_STRINGS = YES;
-                SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
+				SWIFT_OBJC_BRIDGING_HEADER = "llama.cpp.swift/bridging-header.h";
-                SWIFT_VERSION = 5.0;
+				SWIFT_VERSION = 5.0;
-                TARGETED_DEVICE_FAMILY = "1,2";
+				TARGETED_DEVICE_FAMILY = "1,2";
-            };
+			};
-            name = Release;
+			name = Release;
-        };
+		};
 /* End XCBuildConfiguration section */
 /* Begin XCConfigurationList section */
-        8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
+		8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
-            isa = XCConfigurationList;
+			isa = XCConfigurationList;
-            buildConfigurations = (
+			buildConfigurations = (
-                8A1C837F2AC328BE0096AF73 /* Debug */,
+				8A1C837F2AC328BE0096AF73 /* Debug */,
-                8A1C83802AC328BE0096AF73 /* Release */,
+				8A1C83802AC328BE0096AF73 /* Release */,
-            );
+			);
-            defaultConfigurationIsVisible = 0;
+			defaultConfigurationIsVisible = 0;
-            defaultConfigurationName = Release;
+			defaultConfigurationName = Release;
-        };
+		};
-        8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
+		8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
-            isa = XCConfigurationList;
+			isa = XCConfigurationList;
-            buildConfigurations = (
+			buildConfigurations = (
-                8A1C83822AC328BE0096AF73 /* Debug */,
+				8A1C83822AC328BE0096AF73 /* Debug */,
-                8A1C83832AC328BE0096AF73 /* Release */,
+				8A1C83832AC328BE0096AF73 /* Release */,
-            );
+			);
-            defaultConfigurationIsVisible = 0;
+			defaultConfigurationIsVisible = 0;
-            defaultConfigurationName = Release;
+			defaultConfigurationName = Release;
-        };
+		};
 /* End XCConfigurationList section */
-    };
+	};
-    rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
+	rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
 }
--- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
+++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
@ -3,24 +3,26 @@ import Foundation
@MainActor
 class LlamaState: ObservableObject {
    @Published var messageLog = ""
    @Published var cacheCleared = false
    private var llamaContext: LlamaContext?
-    private var modelUrl: URL? {
+    private var defaultModelUrl: URL? {
-        Bundle.main.url(forResource: "q8_0", withExtension: "gguf", subdirectory: "models")
+        Bundle.main.url(forResource: "ggml-model", withExtension: "gguf", subdirectory: "models")
        // Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
    }
    init() {
        do {
-            try loadModel()
+            try loadModel(modelUrl: defaultModelUrl)
        } catch {
            messageLog += "Error!\n"
        }
    }
-    private func loadModel() throws {
+    func loadModel(modelUrl: URL?) throws {
        messageLog += "Loading model...\n"
        if let modelUrl {
-            llamaContext = try LlamaContext.createContext(path: modelUrl.path())
+            llamaContext = try LlamaContext.create_context(path: modelUrl.path())
            messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
        } else {
            messageLog += "Could not locate model\n"
@ -31,7 +33,7 @@ class LlamaState: ObservableObject {
        guard let llamaContext else {
            return
        }
-        messageLog += "Attempting to complete text...\n"
+
        await llamaContext.completion_init(text: text)
        messageLog += "\(text)"
@ -42,4 +44,42 @@ class LlamaState: ObservableObject {
        await llamaContext.clear()
        messageLog += "\n\ndone\n"
    }
    /// Benchmarks the loaded model and appends progress and results to `messageLog`.
    ///
    /// A small warm-up run is timed first; if it takes longer than five seconds
    /// the full benchmark is skipped. Does nothing when no model context is loaded.
    func bench() async {
        guard let context = llamaContext else {
            return
        }

        messageLog += "\nRunning benchmark...\nModel info: "
        let modelInfo = await context.model_info()
        messageLog += modelInfo + "\n"

        // Time a tiny warm-up pass to estimate how fast this device is.
        let warmupStart = DispatchTime.now().uptimeNanoseconds
        await context.bench(pp: 8, tg: 4, pl: 1) // heat up
        let warmupEnd = DispatchTime.now().uptimeNanoseconds

        let warmupSeconds = Double(warmupEnd - warmupStart) / 1_000_000_000.0
        messageLog += "Heat up time: \(warmupSeconds) seconds, please wait...\n"

        // if more than 5 seconds, then we're probably running on a slow device
        if warmupSeconds > 5.0 {
            messageLog += "Heat up time is too long, aborting benchmark\n"
            return
        }

        let report = await context.bench(pp: 512, tg: 128, pl: 1, nr: 3)
        messageLog += "\(report)\n"
    }
    /// Clears the model context and empties the message log.
    /// Does nothing when no model context is loaded.
    func clear() async {
        if let context = llamaContext {
            await context.clear()
            messageLog = ""
        }
    }
 }
--- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
+++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
@ -5,24 +5,132 @@ struct ContentView: View {
    @State private var multiLineText = ""
    /// Deletes every downloaded model (`*.gguf`) from the app's Documents directory.
    ///
    /// Failures are printed rather than thrown. A file that cannot be removed is
    /// reported individually and does not stop the remaining files from being deleted.
    private static func cleanupModelCaches() {
        let fileManager = FileManager.default
        let documentsUrl = fileManager.urls(for: .documentDirectory, in: .userDomainMask)[0]

        let fileURLs: [URL]
        do {
            fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil)
        } catch {
            print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)")
            return
        }

        for fileURL in fileURLs where fileURL.pathExtension == "gguf" {
            do {
                try fileManager.removeItem(at: fileURL)
            } catch {
                // Keep going: one undeletable file should not leave the other models behind.
                print("Error while removing file \(fileURL.path): \(error.localizedDescription)")
            }
        }
    }
    var body: some View {
        VStack {
-            ScrollView(.vertical) {
+            ScrollView(.vertical, showsIndicators: true) {
                Text(llamaState.messageLog)
                .font(.system(size: 12))
                .frame(maxWidth: .infinity, alignment: .leading)
                .padding()
                .onTapGesture {
                    UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
                }
            }
            TextEditor(text: $multiLineText)
-                .frame(height: 200)
+                .frame(height: 80)
                .padding()
                .border(Color.gray, width: 0.5)
-            Button(action: {
+
-                sendText()
+            HStack {
-            }) {
+                Button("Send") {
-                Text("Send")
+                    sendText()
-                    .padding()
+                }
-                    .background(Color.blue)
+                .padding(8)
-                    .foregroundColor(.white)
+                .background(Color.blue)
-                    .cornerRadius(8)
+                .foregroundColor(.white)
                .cornerRadius(8)
                Button("Bench") {
                    bench()
                }
                .padding(8)
                .background(Color.blue)
                .foregroundColor(.white)
                .cornerRadius(8)
                Button("Clear") {
                    clear()
                }
                .padding(8)
                .background(Color.blue)
                .foregroundColor(.white)
                .cornerRadius(8)
                Button("Copy") {
                    UIPasteboard.general.string = llamaState.messageLog
                }
                .padding(8)
                .background(Color.blue)
                .foregroundColor(.white)
                .cornerRadius(8)
            }
            VStack {
                DownloadButton(
                    llamaState: llamaState,
                    modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",
                    modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
                    filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
                )
                .font(.system(size: 12))
                .padding(.top, 4)
                .frame(maxWidth: .infinity, alignment: .leading)
                DownloadButton(
                    llamaState: llamaState,
                    modelName: "TinyLlama-1.1B (Q8_0, 1.1 GiB)",
                    modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
                    filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
                )
                .font(.system(size: 12))
                DownloadButton(
                    llamaState: llamaState,
                    modelName: "TinyLlama-1.1B (F16, 2.2 GiB)",
                    modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
                    filename: "tinyllama-1.1b-f16.gguf"
                )
                .font(.system(size: 12))
                .frame(maxWidth: .infinity, alignment: .leading)
                DownloadButton(
                    llamaState: llamaState,
                    modelName: "Phi-2.7B (Q4_0, 1.6 GiB)",
                    modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
                    filename: "phi-2-q4_0.gguf"
                )
                .font(.system(size: 12))
                DownloadButton(
                    llamaState: llamaState,
                    modelName: "Phi-2.7B (Q8_0, 2.8 GiB)",
                    modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
                    filename: "phi-2-q8_0.gguf"
                )
                .font(.system(size: 12))
                .frame(maxWidth: .infinity, alignment: .leading)
                DownloadButton(
                    llamaState: llamaState,
                    modelName: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)",
                    modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
                    filename: "mistral-7b-v0.1.Q4_0.gguf"
                )
                .font(.system(size: 12))
                Button("Clear downloaded models") {
                    ContentView.cleanupModelCaches()
                    llamaState.cacheCleared = true
                }
                .padding(8)
                .font(.system(size: 12))
            }
        }
        .padding()
@ -34,9 +142,20 @@ struct ContentView: View {
            multiLineText = ""
        }
    }
    /// Starts `llamaState.bench()` in a new `Task` so this synchronous function
    /// can be called from a button action.
    func bench() {
        Task {
            await llamaState.bench()
        }
    }
    /// Starts `llamaState.clear()` in a new `Task` so this synchronous function
    /// can be called from a button action.
    func clear() {
        Task {
            await llamaState.clear()
        }
    }
 }
-/*
+
-#Preview {
+//#Preview {
-    ContentView()
+//    ContentView()
-}
+//}
 */
--- a/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift
+++ b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift
@ -0,0 +1,122 @@
 import SwiftUI
 /// A button that downloads a model file into the app's Documents directory and,
 /// once the file is present, loads it into `llamaState` when tapped.
 ///
 /// The button shown depends on `status`:
 /// - "download":    tapping starts the download
 /// - "downloading": shows progress; tapping cancels the download
 /// - "downloaded":  tapping loads the model (or re-downloads it if the file is gone)
 struct DownloadButton: View {
    @ObservedObject private var llamaState: LlamaState
    private var modelName: String
    private var modelUrl: String
    private var filename: String

    // One of "download", "downloading" or "downloaded"; see the type comment.
    @State private var status: String
    @State private var downloadTask: URLSessionDownloadTask?
    @State private var progress = 0.0
    // Retains the KVO token so progress updates keep arriving while downloading.
    @State private var observation: NSKeyValueObservation?

    /// Location of `filename` inside the user's Documents directory.
    private static func getFileURL(filename: String) -> URL {
        FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
    }

    /// Sets `status` to "downloaded" or "download" depending on whether the model file exists on disk.
    private func checkFileExistenceAndUpdateStatus() {
        let fileURL = DownloadButton.getFileURL(filename: filename)
        status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
    }

    /// Re-derives `status` from disk on the main queue; used by download callbacks after a failure.
    private func resetStatusOnMainQueue() {
        DispatchQueue.main.async {
            checkFileExistenceAndUpdateStatus()
        }
    }

    init(llamaState: LlamaState, modelName: String, modelUrl: String, filename: String) {
        self.llamaState = llamaState
        self.modelName = modelName
        self.modelUrl = modelUrl
        self.filename = filename

        let fileURL = DownloadButton.getFileURL(filename: filename)
        status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
    }

    /// Starts downloading `modelUrl` to the Documents directory and tracks its progress.
    ///
    /// Every failure path puts `status` back to what is on disk so the button never
    /// stays stuck on "downloading".
    private func download() {
        // Validate the URL before switching to "downloading".
        guard let url = URL(string: modelUrl) else {
            print("Error: invalid model URL \(modelUrl)")
            return
        }

        status = "downloading"
        progress = 0.0
        print("Downloading model \(modelName) from \(modelUrl)")
        let fileURL = DownloadButton.getFileURL(filename: filename)

        downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
            if let error = error {
                // Whoever cancelled the task already updated `status`; resetting it here
                // could clobber a newer download that was started in the meantime.
                if (error as? URLError)?.code == .cancelled {
                    return
                }
                print("Error: \(error.localizedDescription)")
                resetStatusOnMainQueue()
                return
            }

            guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
                print("Server error!")
                resetStatusOnMainQueue()
                return
            }

            guard let temporaryURL = temporaryURL else {
                resetStatusOnMainQueue()
                return
            }

            do {
                // The temporary file is only valid inside this handler, so copy it synchronously.
                // copyItem fails if the destination exists, so remove any stale file first.
                if FileManager.default.fileExists(atPath: fileURL.path) {
                    try FileManager.default.removeItem(at: fileURL)
                }
                try FileManager.default.copyItem(at: temporaryURL, to: fileURL)
                print("Writing to \(filename) completed")

                // This handler does not run on the main thread; publish UI state from the main queue.
                DispatchQueue.main.async {
                    llamaState.cacheCleared = false
                    status = "downloaded"
                }
            } catch let err {
                print("Error: \(err.localizedDescription)")
                resetStatusOnMainQueue()
            }
        }

        observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
            // KVO callbacks can arrive off the main thread; hop to main before touching view state.
            let fraction = progress.fractionCompleted
            DispatchQueue.main.async {
                self.progress = fraction
            }
        }

        downloadTask?.resume()
    }

    var body: some View {
        VStack {
            if status == "download" {
                Button(action: download) {
                    Text("Download " + modelName)
                }
            } else if status == "downloading" {
                Button(action: {
                    downloadTask?.cancel()
                    status = "download"
                }) {
                    Text("\(modelName) (Downloading \(Int(progress * 100))%)")
                }
            } else if status == "downloaded" {
                Button(action: {
                    let fileURL = DownloadButton.getFileURL(filename: filename)
                    // The file may have been deleted since the status was computed.
                    if !FileManager.default.fileExists(atPath: fileURL.path) {
                        download()
                        return
                    }
                    do {
                        try llamaState.loadModel(modelUrl: fileURL)
                    } catch let err {
                        print("Error: \(err.localizedDescription)")
                    }
                }) {
                    Text("\(modelName) (Downloaded)")
                }
            } else {
                Text("Unknown status")
            }
        }
        .onDisappear() {
            downloadTask?.cancel()
        }
        .onChange(of: llamaState.cacheCleared) { newValue in
            // When the cache is cleared, stop any download and re-derive the status from disk.
            if newValue {
                downloadTask?.cancel()
                checkFileExistenceAndUpdateStatus()
            }
        }
    }
 }
 // #Preview {
 //    DownloadButton(
 //        llamaState: LlamaState(),
 //        modelName: "TheBloke / TinyLlama-1.1B-1T-OpenOrca-GGUF (Q4_0)",
 //        modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
 //        filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
 //    )
 // }
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -514,7 +514,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            ctx_size += padded_size;
            if (verbosity >= 3) {
                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i,
-                       cur->n_dims, cur->name, tensor_size, padded_size, offset);
+                       ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset);
            }
        }
    }
@ -739,7 +739,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
        temp->ny = longer_side;
        temp->size = 3 * longer_side * longer_side;
        temp->data = new uint8_t[temp->size]();
-        uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA
+        uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
        // fill with background color
        for (size_t i = 0; i < temp->size; i++) {
@ -962,7 +962,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
        }
        // quantize only 2D tensors
-        quantize &= (cur->n_dims == 2);
+        quantize &= (ggml_n_dims(cur) == 2);
        if (quantize) {
            new_type = type;
@ -1035,7 +1035,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            fout.put(0);
        }
-        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize,
+        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
               orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
    }
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@ -51,7 +51,7 @@ def bytes_to_unicode():
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
--- a/examples/lookahead/README.md
+++ b/examples/lookahead/README.md
@ -1,6 +1,6 @@
 # llama.cpp/examples/lookahead
-Demonstartion of lookahead decoding technique:
+Demonstration of lookahead decoding technique:
 https://lmsys.org/blog/2023-11-21-lookahead-decoding/
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -222,7 +222,7 @@ node index.js
    `content`: Set the text to process.
-    **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
+-   **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
    *Options:*
--- a/examples/server/json.hpp
+++ b/examples/server/json.hpp
@ -11227,7 +11227,7 @@ class binary_reader
                }
                if (is_ndarray) // ndarray dimensional vector can only contain integers, and can not embed another array
                {
-                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimentional vector is not allowed", "size"), nullptr));
+                    return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimensional vector is not allowed", "size"), nullptr));
                }
                std::vector<size_t> dim;
                if (JSON_HEDLEY_UNLIKELY(!get_ubjson_ndarray_size(dim)))
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@ -34,7 +34,8 @@ export async function* llama(prompt, params = {}, config = {}) {
    headers: {
      'Connection': 'keep-alive',
      'Content-Type': 'application/json',
-      'Accept': 'text/event-stream'
+      'Accept': 'text/event-stream',
      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
    },
    signal: controller.signal,
  });
@ -114,7 +115,7 @@ export async function* llama(prompt, params = {}, config = {}) {
  return content;
 }
-// Call llama, return an event target that you can subcribe to
+// Call llama, return an event target that you can subscribe to
 //
 // Example:
 //
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -223,7 +223,7 @@
      repeat_last_n: 256, // 0 = disable penalty, -1 = context size
      repeat_penalty: 1.18, // 1.0 = disabled
      top_k: 40, // <= 0 to use vocab size
-      top_p: 0.5, // 1.0 = disabled
+      top_p: 0.95, // 1.0 = disabled
      min_p: 0.05, // 0 = disabled
      tfs_z: 1.0, // 1.0 = disabled
      typical_p: 1.0, // 1.0 = disabled
@ -235,10 +235,11 @@
      grammar: '',
      n_probs: 0, // no completion_probabilities,
      image_data: [],
-      cache_prompt: true
+      cache_prompt: true,
      api_key: ''
    })
-    /* START: Support for storing prompt templates and parameters in borwser LocalStorage */
+    /* START: Support for storing prompt templates and parameters in the browser's LocalStorage */
    const local_storage_storageKey = "llamacpp_server_local_storage";
@ -282,7 +283,7 @@
    let importedTemplates = local_storage_getDataAsObject('user_templates')
    if (importedTemplates) {
-      // saved templates were successfuly imported.
+      // saved templates were successfully imported.
      console.log('Processing saved templates and updating default template')
      params.value = { ...params.value, image_data: [] };
@ -303,7 +304,7 @@
    }
    function userTemplateResetToDefault() {
-      console.log('Reseting themplate to default')
+      console.log('Resetting template to default')
      selectedUserTemplate.value.name = 'default';
      selectedUserTemplate.value.data = savedUserTemplates.value['default'];
    }
@ -762,7 +763,7 @@
          <fieldset class="two">
            ${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
-            ${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
+            ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
            ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
@ -790,6 +791,10 @@
            <fieldset>
              ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
            </fieldset>
            <fieldset>
              <label for="api_key">API Key</label>
              <input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
            </fieldset>
          </details>
        </form>
      `
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -10,7 +10,8 @@
 // crash the server in debug mode, otherwise send an http 500 error
 #define CPPHTTPLIB_NO_EXCEPTIONS 1
 #endif
-
+// increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 #include "httplib.h"
 #include "json.hpp"
@ -36,6 +37,7 @@ using json = nlohmann::json;
 struct server_params
 {
    std::string hostname = "127.0.0.1";
    std::string api_key;
    std::string public_path = "examples/server/public";
    int32_t port = 8080;
    int32_t read_timeout = 600;
@ -376,7 +378,6 @@ struct llama_client_slot
    int32_t num_prompt_tokens           = 0;
    int32_t num_prompt_tokens_processed = 0;
    int32_t multibyte_pending           = 0;
    json prompt;
    std::string generated_text;
@ -425,7 +426,6 @@ struct llama_client_slot
        stopped_word           = false;
        stopped_limit          = false;
        stopping_word          = "";
        multibyte_pending      = 0;
        n_past                 = 0;
        sent_count             = 0;
        sent_token_probs_index = 0;
@ -992,35 +992,36 @@ struct llama_server_context
        slot.generated_text += token_str;
        slot.has_next_token = true;
-        if (slot.multibyte_pending > 0)
+        // check if there is incomplete UTF-8 character at the end
        bool incomplete = false;
        for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i)
        {
-            slot.multibyte_pending -= token_str.size();
+            unsigned char c = slot.generated_text[slot.generated_text.size() - i];
-        }
+            if ((c & 0xC0) == 0x80)
-        else if (token_str.size() == 1)
+            {
-        {
+                // continuation byte: 10xxxxxx
-            const char c = token_str[0];
+                continue;
-            // 2-byte characters: 110xxxxx 10xxxxxx
+            }
            if ((c & 0xE0) == 0xC0)
            {
-                slot.multibyte_pending = 1;
+                // 2-byte character: 110xxxxx ...
-                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
+                incomplete = i < 2;
            }
            else if ((c & 0xF0) == 0xE0)
            {
-                slot.multibyte_pending = 2;
+                // 3-byte character: 1110xxxx ...
-                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                incomplete = i < 3;
            }
            else if ((c & 0xF8) == 0xF0)
            {
-                slot.multibyte_pending = 3;
+                // 4-byte character: 11110xxx ...
-            }
+                incomplete = i < 4;
            else
            {
                slot.multibyte_pending = 0;
            }
            // else 1-byte character or invalid byte
            break;
        }
-        if (slot.multibyte_pending == 0)
+        if (!incomplete)
        {
            size_t pos = std::min(slot.sent_count, slot.generated_text.size());
            const std::string str_test = slot.generated_text.substr(pos);
@ -1055,7 +1056,7 @@ struct llama_server_context
            }
        }
-        if (slot.multibyte_pending > 0 && !slot.has_next_token)
+        if (incomplete)
        {
            slot.has_next_token = true;
        }
@ -1954,6 +1955,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
    printf("  --port PORT           port to listen (default  (default: %d)\n", sparams.port);
    printf("  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
    printf("  --api-key API_KEY     optional api key to enhance server security. If set, requests must include this key for access.\n");
    printf("  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
    printf("  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
    printf("  -np N, --parallel N   number of slots for process requests (default: %d)\n", params.n_parallel);
@ -2003,6 +2005,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            sparams.public_path = argv[i];
        }
        else if (arg == "--api-key")
        {
            if (++i >= argc)
            {
                invalid_param = true;
                break;
            }
            sparams.api_key = argv[i];
        }
        else if (arg == "--timeout" || arg == "-to")
        {
            if (++i >= argc)
@ -2382,6 +2393,7 @@ json oaicompat_completion_params_parse(
    llama_params["__oaicompat"] = true;
    // Map OpenAI parameters to llama.cpp parameters
    llama_params["model"]             = json_value(body, "model", std::string("uknown"));
    llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
    llama_params["cache_prompt"]      = json_value(body, "cache_prompt", false);
    llama_params["temperature"]       = json_value(body, "temperature", 0.8);
@ -2402,7 +2414,7 @@ json oaicompat_completion_params_parse(
    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
    llama_params["tfs_z"]             = json_value(body, "tfs_z", 0.0);
-    if (llama_params.count("grammar") != 0) {
+    if (body.count("grammar") != 0) {
        llama_params["grammar"] = json_value(body, "grammar", json::object());
    }
@ -2633,6 +2645,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
 int main(int argc, char **argv)
 {
 #if SERVER_VERBOSE != 1
    log_disable();
 #endif
    // own arguments required by this example
    gpt_params params;
    server_params sparams;
@ -2669,6 +2684,32 @@ int main(int argc, char **argv)
    httplib::Server svr;
    // Middleware for API key validation.
    // Returns true when the request may proceed. Otherwise fills `res` with a
    // 401 response, logs a warning and returns false.
    auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool {
        // No key configured: every request is allowed through
        if (sparams.api_key.empty()) {
            return true;
        }

        // The key is expected as "Authorization: Bearer <key>"
        const std::string bearer_prefix = "Bearer ";
        const std::string authorization = req.get_header_value("Authorization");
        const bool authorized =
            authorization.substr(0, bearer_prefix.size()) == bearer_prefix &&
            authorization.substr(bearer_prefix.size()) == sparams.api_key;

        if (!authorized) {
            // Header missing, not a Bearer token, or key mismatch
            res.set_content("Unauthorized: Invalid API Key", "text/plain; charset=utf-8");
            res.status = 401; // Unauthorized
            LOG_WARNING("Unauthorized: Invalid API Key", {});
            return false;
        }

        return true;
    };
    svr.set_default_headers({{"Server", "llama.cpp"},
                             {"Access-Control-Allow-Origin", "*"},
                             {"Access-Control-Allow-Headers", "content-type"}});
@ -2676,28 +2717,28 @@ int main(int argc, char **argv)
    // this is only called if no index.html is found in the public --path
    svr.Get("/", [](const httplib::Request &, httplib::Response &res)
            {
-                res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html");
+                res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html; charset=utf-8");
                return false;
            });
    // this is only called if no index.js is found in the public --path
    svr.Get("/index.js", [](const httplib::Request &, httplib::Response &res)
            {
-                res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript");
+                res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript; charset=utf-8");
                return false;
            });
    // this is only called if no completion.js is found in the public --path
    svr.Get("/completion.js", [](const httplib::Request &, httplib::Response &res)
            {
-                res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
+                res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript; charset=utf-8");
                return false;
            });
    // this is only called if no json-schema-to-grammar.mjs is found in the public --path
    svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response &res)
            {
-                res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
+                res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8");
                return false;
            });
@ -2708,23 +2749,26 @@ int main(int argc, char **argv)
                    { "user_name",      llama.name_user.c_str() },
                    { "assistant_name", llama.name_assistant.c_str() }
                };
-                res.set_content(data.dump(), "application/json");
+                res.set_content(data.dump(), "application/json; charset=utf-8");
            });
-    svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
+    svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
            {
                if (!validate_api_key(req, res)) {
                    return;
                }
                json data = json::parse(req.body);
                const int task_id = llama.request_completion(data, false, false, -1);
                if (!json_value(data, "stream", false)) {
                    std::string completion_text;
                    task_result result = llama.next_result(task_id);
                    if (!result.error && result.stop) {
-                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
+                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
                    }
                    else
                    {
                        res.status = 404;
-                        res.set_content(result.result_json["content"], "text/plain");
+                        res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
                        return;
                    }
                } else {
@ -2795,12 +2839,15 @@ int main(int argc, char **argv)
                    }}
                };
-                res.set_content(models.dump(), "application/json");
+                res.set_content(models.dump(), "application/json; charset=utf-8");
            });
    // TODO: add mount point without "/v1" prefix -- how?
-    svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res)
+    svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
            {
                if (!validate_api_key(req, res)) {
                    return;
                }
                json data = oaicompat_completion_params_parse(json::parse(req.body));
                const int task_id = llama.request_completion(data, false, false, -1);
@ -2814,10 +2861,10 @@ int main(int argc, char **argv)
                        res.set_content(oaicompat_result.dump(-1, ' ', false,
                                            json::error_handler_t::replace),
-                                            "application/json");
+                                            "application/json; charset=utf-8");
                    } else {
                        res.status = 500;
-                        res.set_content(result.result_json["content"], "text/plain");
+                        res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
                        return;
                    }
                } else {
@ -2869,8 +2916,11 @@ int main(int argc, char **argv)
                }
            });
-    svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
+    svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
            {
                if (!validate_api_key(req, res)) {
                    return;
                }
                json data = json::parse(req.body);
                const int task_id = llama.request_completion(data, true, false, -1);
                if (!json_value(data, "stream", false)) {
@ -2878,12 +2928,12 @@ int main(int argc, char **argv)
                    task_result result = llama.next_result(task_id);
                    if (!result.error && result.stop)
                    {
-                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
+                        res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
                    }
                    else
                    {
                        res.status = 404;
-                        res.set_content(result.result_json["content"], "text/plain");
+                        res.set_content(result.result_json["content"], "text/plain; charset=utf-8");
                        return;
                    }
                } else {
@ -2932,11 +2982,11 @@ int main(int argc, char **argv)
    svr.Get("/model.json", [&llama](const httplib::Request &, httplib::Response &res)
            {
                const json data = llama.get_model_props();
-                return res.set_content(data.dump(), "application/json");
+                return res.set_content(data.dump(), "application/json; charset=utf-8");
            });
    svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response &res)
-                { return res.set_content("", "application/json"); });
+                { return res.set_content("", "application/json; charset=utf-8"); });
    svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res)
            {
@ -2947,7 +2997,7 @@ int main(int argc, char **argv)
                    tokens = llama.tokenize(body["content"], false);
                }
                const json data = format_tokenizer_response(tokens);
-                return res.set_content(data.dump(), "application/json");
+                return res.set_content(data.dump(), "application/json; charset=utf-8");
            });
    svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res)
@ -2961,7 +3011,7 @@ int main(int argc, char **argv)
                }
                const json data = format_detokenized_response(content);
-                return res.set_content(data.dump(), "application/json");
+                return res.set_content(data.dump(), "application/json; charset=utf-8");
            });
    svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res)
@ -2978,7 +3028,7 @@ int main(int argc, char **argv)
                }
                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1);
                task_result result = llama.next_result(task_id);
-                return res.set_content(result.result_json.dump(), "application/json");
+                return res.set_content(result.result_json.dump(), "application/json; charset=utf-8");
            });
    svr.set_logger(log_server_request);
@ -2999,19 +3049,23 @@ int main(int argc, char **argv)
                {
                    snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
                }
-                res.set_content(buf, "text/plain");
+                res.set_content(buf, "text/plain; charset=utf-8");
                res.status = 500;
            });
    svr.set_error_handler([](const httplib::Request &, httplib::Response &res)
            {
                if (res.status == 401)
                {
                    res.set_content("Unauthorized", "text/plain; charset=utf-8");
                }
                if (res.status == 400)
                {
-                    res.set_content("Invalid request", "text/plain");
+                    res.set_content("Invalid request", "text/plain; charset=utf-8");
                }
-                else if (res.status != 500)
+                else if (res.status == 404)
                {
-                    res.set_content("File Not Found", "text/plain");
+                    res.set_content("File Not Found", "text/plain; charset=utf-8");
                    res.status = 404;
                }
            });
@ -3032,11 +3086,15 @@ int main(int argc, char **argv)
    // to make it ctrl+clickable:
    LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
-    LOG_INFO("HTTP server listening", {
+    std::unordered_map<std::string, std::string> log_data;
-                                          {"hostname", sparams.hostname},
+    log_data["hostname"] = sparams.hostname;
-                                          {"port", sparams.port},
+    log_data["port"] = std::to_string(sparams.port);
                                      });
    if (!sparams.api_key.empty()) {
        // Log only a masked form of the key. For keys shorter than 4 characters `length() - 4`
        // would wrap around (size_t) and substr() would throw std::out_of_range, aborting startup;
        // a key of exactly 4 characters would be logged in full. Reveal the 4-character tail only
        // when the key is longer than that, otherwise reveal nothing.
        const size_t n_visible = sparams.api_key.length() > 4 ? 4 : 0;
        log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - n_visible);
    }
    LOG_INFO("HTTP server listening", log_data);
    // run the HTTP server in a thread - see comment below
    std::thread t([&]()
            {
--- a/examples/speculative/README.md
+++ b/examples/speculative/README.md
@ -1,6 +1,6 @@
 # llama.cpp/examples/speculative
-Demonstartion of speculative decoding and tree-based speculative decoding techniques
+Demonstration of speculative decoding and tree-based speculative decoding techniques
 More info:
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -428,7 +428,7 @@ int main(int argc, char ** argv) {
            ++n_past_tgt;
        }
-        // the first token is always proposed by the traget model before the speculation loop so we erase it here
+        // the first token is always proposed by the target model before the speculation loop so we erase it here
        for (int s = 0; s < n_seq_dft; ++s) {
            if (!drafts[s].active) {
                continue;
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@ -43,7 +43,7 @@ GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph
 // ggml-backend v2 API
 //
-// Seperate tensor and graph allocator objects
+// Separate tensor and graph allocator objects
 // This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
 // The original API is kept as a wrapper around the new API
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -66,9 +66,11 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(div_row);
    GGML_METAL_DECL_KERNEL(scale);
    GGML_METAL_DECL_KERNEL(scale_4);
-    GGML_METAL_DECL_KERNEL(silu);
+    GGML_METAL_DECL_KERNEL(tanh);
    GGML_METAL_DECL_KERNEL(relu);
    GGML_METAL_DECL_KERNEL(gelu);
    GGML_METAL_DECL_KERNEL(gelu_quick);
    GGML_METAL_DECL_KERNEL(silu);
    GGML_METAL_DECL_KERNEL(soft_max);
    GGML_METAL_DECL_KERNEL(soft_max_4);
    GGML_METAL_DECL_KERNEL(diag_mask_inf);
@ -86,6 +88,7 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(get_rows_q5_K);
    GGML_METAL_DECL_KERNEL(get_rows_q6_K);
    GGML_METAL_DECL_KERNEL(rms_norm);
    GGML_METAL_DECL_KERNEL(group_norm);
    GGML_METAL_DECL_KERNEL(norm);
    GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_f16_f16);
@ -102,6 +105,21 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32);
    //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16);
    GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32);
    //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_1row);
    //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32_l4);
    GGML_METAL_DECL_KERNEL(mul_mv_id_q4_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_id_q4_1_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_id_q5_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_id_q5_1_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_id_q8_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_id_q2_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_id_q3_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_id_q4_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
    GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
@ -130,8 +148,11 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(rope_f16);
    GGML_METAL_DECL_KERNEL(alibi_f32);
    GGML_METAL_DECL_KERNEL(im2col_f16);
    GGML_METAL_DECL_KERNEL(upscale_f32);
    GGML_METAL_DECL_KERNEL(pad_f32);
    GGML_METAL_DECL_KERNEL(argsort_f32_i32_asc);
    GGML_METAL_DECL_KERNEL(argsort_f32_i32_desc);
    GGML_METAL_DECL_KERNEL(leaky_relu_f32);
    GGML_METAL_DECL_KERNEL(cpy_f32_f16);
    GGML_METAL_DECL_KERNEL(cpy_f32_f32);
    GGML_METAL_DECL_KERNEL(cpy_f32_q8_0);
@ -140,6 +161,7 @@ struct ggml_metal_context {
    //GGML_METAL_DECL_KERNEL(cpy_f32_q5_0);
    //GGML_METAL_DECL_KERNEL(cpy_f32_q5_1);
    GGML_METAL_DECL_KERNEL(cpy_f16_f16);
    GGML_METAL_DECL_KERNEL(cpy_f16_f32);
    GGML_METAL_DECL_KERNEL(concat);
    GGML_METAL_DECL_KERNEL(sqr);
    GGML_METAL_DECL_KERNEL(sum_rows);
@ -177,6 +199,8 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
            ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
        } else {
            char* buffer2 = malloc(len+1);
            va_end(args);
            va_start(args, format);
            vsnprintf(buffer2, len+1, format, args);
            buffer2[len] = 0;
            ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
@ -316,9 +340,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(div_row);
        GGML_METAL_ADD_KERNEL(scale);
        GGML_METAL_ADD_KERNEL(scale_4);
-        GGML_METAL_ADD_KERNEL(silu);
+        GGML_METAL_ADD_KERNEL(tanh);
        GGML_METAL_ADD_KERNEL(relu);
        GGML_METAL_ADD_KERNEL(gelu);
        GGML_METAL_ADD_KERNEL(gelu_quick);
        GGML_METAL_ADD_KERNEL(silu);
        GGML_METAL_ADD_KERNEL(soft_max);
        GGML_METAL_ADD_KERNEL(soft_max_4);
        GGML_METAL_ADD_KERNEL(diag_mask_inf);
@ -336,6 +362,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(get_rows_q5_K);
        GGML_METAL_ADD_KERNEL(get_rows_q6_K);
        GGML_METAL_ADD_KERNEL(rms_norm);
        GGML_METAL_ADD_KERNEL(group_norm);
        GGML_METAL_ADD_KERNEL(norm);
        GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_f16_f16);
@ -352,6 +379,21 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32);
        //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16);
        GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32);
        //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_1row);
        //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32_l4);
        GGML_METAL_ADD_KERNEL(mul_mv_id_q4_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_id_q4_1_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_id_q5_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_id_q5_1_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_id_q8_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_id_q2_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_id_q3_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_id_q4_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32);
        if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
            GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
            GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
@ -382,8 +424,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(rope_f16);
        GGML_METAL_ADD_KERNEL(alibi_f32);
        GGML_METAL_ADD_KERNEL(im2col_f16);
        GGML_METAL_ADD_KERNEL(upscale_f32);
        GGML_METAL_ADD_KERNEL(pad_f32);
        GGML_METAL_ADD_KERNEL(argsort_f32_i32_asc);
        GGML_METAL_ADD_KERNEL(argsort_f32_i32_desc);
        GGML_METAL_ADD_KERNEL(leaky_relu_f32);
        GGML_METAL_ADD_KERNEL(cpy_f32_f16);
        GGML_METAL_ADD_KERNEL(cpy_f32_f32);
        GGML_METAL_ADD_KERNEL(cpy_f32_q8_0);
@ -392,6 +437,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        //GGML_METAL_ADD_KERNEL(cpy_f32_q5_0);
        //GGML_METAL_ADD_KERNEL(cpy_f32_q5_1);
        GGML_METAL_ADD_KERNEL(cpy_f16_f16);
        GGML_METAL_ADD_KERNEL(cpy_f16_f32);
        GGML_METAL_ADD_KERNEL(concat);
        GGML_METAL_ADD_KERNEL(sqr);
        GGML_METAL_ADD_KERNEL(sum_rows);
@ -416,9 +462,11 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(div_row);
    GGML_METAL_DEL_KERNEL(scale);
    GGML_METAL_DEL_KERNEL(scale_4);
-    GGML_METAL_DEL_KERNEL(silu);
+    GGML_METAL_DEL_KERNEL(tanh);
    GGML_METAL_DEL_KERNEL(relu);
    GGML_METAL_DEL_KERNEL(gelu);
    GGML_METAL_DEL_KERNEL(gelu_quick);
    GGML_METAL_DEL_KERNEL(silu);
    GGML_METAL_DEL_KERNEL(soft_max);
    GGML_METAL_DEL_KERNEL(soft_max_4);
    GGML_METAL_DEL_KERNEL(diag_mask_inf);
@ -436,6 +484,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(get_rows_q5_K);
    GGML_METAL_DEL_KERNEL(get_rows_q6_K);
    GGML_METAL_DEL_KERNEL(rms_norm);
    GGML_METAL_DEL_KERNEL(group_norm);
    GGML_METAL_DEL_KERNEL(norm);
    GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_f16_f16);
@ -452,6 +501,21 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32);
    //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16);
    GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32);
    //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_1row);
    //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32_l4);
    GGML_METAL_DEL_KERNEL(mul_mv_id_q4_0_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_id_q4_1_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_id_q5_0_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_id_q5_1_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_id_q8_0_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_id_q2_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_id_q3_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_id_q4_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32);
    GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32);
    if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
        GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
        GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
@ -482,8 +546,11 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(rope_f16);
    GGML_METAL_DEL_KERNEL(alibi_f32);
    GGML_METAL_DEL_KERNEL(im2col_f16);
    GGML_METAL_DEL_KERNEL(upscale_f32);
    GGML_METAL_DEL_KERNEL(pad_f32);
    GGML_METAL_DEL_KERNEL(argsort_f32_i32_asc);
    GGML_METAL_DEL_KERNEL(argsort_f32_i32_desc);
    GGML_METAL_DEL_KERNEL(leaky_relu_f32);
    GGML_METAL_DEL_KERNEL(cpy_f32_f16);
    GGML_METAL_DEL_KERNEL(cpy_f32_f32);
    GGML_METAL_DEL_KERNEL(cpy_f32_q8_0);
@ -492,6 +559,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    //GGML_METAL_DEL_KERNEL(cpy_f32_q5_0);
    //GGML_METAL_DEL_KERNEL(cpy_f32_q5_1);
    GGML_METAL_DEL_KERNEL(cpy_f16_f16);
    GGML_METAL_DEL_KERNEL(cpy_f16_f32);
    GGML_METAL_DEL_KERNEL(concat);
    GGML_METAL_DEL_KERNEL(sqr);
    GGML_METAL_DEL_KERNEL(sum_rows);
@ -793,9 +861,11 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
    switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                    return true;
                default:
                    return false;
@ -807,6 +877,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
        case GGML_OP_PERMUTE:
        case GGML_OP_CONCAT:
        case GGML_OP_ADD:
        case GGML_OP_ACC:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SCALE:
@ -814,21 +885,50 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
        case GGML_OP_SUM_ROWS:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_RMS_NORM:
        case GGML_OP_GROUP_NORM:
        case GGML_OP_NORM:
        case GGML_OP_ALIBI:
        case GGML_OP_ROPE:
        case GGML_OP_IM2COL:
        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
        case GGML_OP_ARGSORT:
-        case GGML_OP_DUP:
+        case GGML_OP_LEAKY_RELU:
        case GGML_OP_CPY:
        case GGML_OP_CONT:
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
            return true;
        // Copy-style ops (CPY / DUP / CONT) are reported as supported only for specific
        // (source type -> destination type) pairs; every other combination returns false.
        case GGML_OP_CPY:
        case GGML_OP_DUP:
        case GGML_OP_CONT:
            {
                // the source type is taken from the first operand, the destination type from the op itself
                switch (op->src[0]->type) {
                    case GGML_TYPE_F32:
                        // F32 source: destination may be F16, F32, Q8_0, Q4_0 or Q4_1
                        switch (op->type) {
                           case GGML_TYPE_F16:
                           case GGML_TYPE_F32:
                           case GGML_TYPE_Q8_0:
                           case GGML_TYPE_Q4_0:
                           case GGML_TYPE_Q4_1:
                                return true;
                           default:
                                return false;
                        }
                    case GGML_TYPE_F16:
                        // F16 source: destination may be F16 or F32 only
                        switch (op->type) {
                           case GGML_TYPE_F16:
                           case GGML_TYPE_F32:
                                return true;
                           default:
                                return false;
                        }
                    default:
                        // any other source type is not supported
                        return false;
                };
            }
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_GET_ROWS:
            {
-                return op->ne[0] % 4 == 0;
+                return op->ne[3] == 1;
            }
        default:
            return false;
@ -904,7 +1004,10 @@ void ggml_metal_graph_compute(
                        } break;
                }
-                GGML_ASSERT(ggml_metal_supports_op(dst));
+                if (!ggml_metal_supports_op(dst)) {
                    GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
                    GGML_ASSERT(!"unsupported op");
                }
                const int64_t  ne00 = src0 ? src0->ne[0] : 0;
                const int64_t  ne01 = src0 ? src0->ne[1] : 0;
@ -1001,34 +1104,39 @@ void ggml_metal_graph_compute(
                    case GGML_OP_MUL:
                    case GGML_OP_DIV:
                        {
-                            GGML_ASSERT(ggml_is_contiguous(src0));
+                            const size_t offs = 0;
                            GGML_ASSERT(ggml_is_contiguous(src1));
                            bool bcast_row = false;
                            int64_t nb = ne00;
-                            if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) {
+                            id<MTLComputePipelineState> pipeline = nil;
                            if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
                                GGML_ASSERT(ggml_is_contiguous(src0));
                                // src1 is a row
                                GGML_ASSERT(ne11 == 1);
                                nb = ne00 / 4;
                                switch (dst->op) {
-                                    case GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add_row]; break;
+                                    case GGML_OP_ADD: pipeline = ctx->pipeline_add_row; break;
-                                    case GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul_row]; break;
+                                    case GGML_OP_MUL: pipeline = ctx->pipeline_mul_row; break;
-                                    case GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div_row]; break;
+                                    case GGML_OP_DIV: pipeline = ctx->pipeline_div_row; break;
                                    default: GGML_ASSERT(false);
                                }
                                bcast_row = true;
                            } else {
                                switch (dst->op) {
-                                    case GGML_OP_ADD: [encoder setComputePipelineState:ctx->pipeline_add]; break;
+                                    case GGML_OP_ADD: pipeline = ctx->pipeline_add; break;
-                                    case GGML_OP_MUL: [encoder setComputePipelineState:ctx->pipeline_mul]; break;
+                                    case GGML_OP_MUL: pipeline = ctx->pipeline_mul; break;
-                                    case GGML_OP_DIV: [encoder setComputePipelineState:ctx->pipeline_div]; break;
+                                    case GGML_OP_DIV: pipeline = ctx->pipeline_div; break;
                                    default: GGML_ASSERT(false);
                                }
                            }
                            [encoder setComputePipelineState:pipeline];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
@ -1056,18 +1164,99 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                            [encoder setBytes:&nb   length:sizeof(nb)   atIndex:27];
+                            [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
                            [encoder setBytes:&nb   length:sizeof(nb)   atIndex:28];
                            if (bcast_row) {
                                const int64_t n = ggml_nelements(dst)/4;
                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                            } else {
-                                const int nth = MIN(1024, ne0);
+                                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                            }
                        } break;
                    case GGML_OP_ACC:
                        {
                            // Encodes GGML_OP_ACC using the generic `add` kernel. Only contiguous F32
                            // operands are handled; anything else trips the assertions below.
                            GGML_ASSERT(src0t == GGML_TYPE_F32);
                            GGML_ASSERT(src1t == GGML_TYPE_F32);
                            GGML_ASSERT(dstt  == GGML_TYPE_F32);

                            GGML_ASSERT(ggml_is_contiguous(src0));
                            GGML_ASSERT(ggml_is_contiguous(src1));

                            // op_params[0..2] are passed to the kernel in the slots where the nb1/nb2/nb3
                            // strides of src0 and dst normally go, op_params[3] goes in the `offs` slot,
                            // and op_params[4] is the in-place flag.
                            const size_t pnb1 = ((int32_t *) dst->op_params)[0];
                            const size_t pnb2 = ((int32_t *) dst->op_params)[1];
                            const size_t pnb3 = ((int32_t *) dst->op_params)[2];
                            const size_t offs = ((int32_t *) dst->op_params)[3];

                            const bool inplace = (bool) ((int32_t *) dst->op_params)[4];

                            if (!inplace) {
                                // run a separate kernel to cpy src->dst
                                // not sure how to avoid this
                                // TODO: make a simpler cpy_bytes kernel

                                // clamp to what the pipeline supports instead of assuming 1024
                                // (same approach as the ADD/MUL/DIV path)
                                const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00);

                                [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
                                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
                                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
                                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
                                [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
                                [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
                                [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
                                [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
                                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
                                [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
                                [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
                                [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
                                [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
                                [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
                                [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
                                [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
                                [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
                                [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
                                [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];

                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                            }

                            // the accumulation itself: `add` kernel with the op_params strides and offset
                            [encoder setComputePipelineState:ctx->pipeline_add];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
                            [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
                            [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
                            [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
                            [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
                            [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
                            [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
                            [encoder setBytes:&offs length:sizeof(offs) atIndex:27];

                            // clamp to the pipeline limit rather than a hard-coded 1024
                            const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne0);

                            // the grid is sized by src1's dimensions (ne11, ne12, ne13)
                            [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_SCALE:
                        {
                            GGML_ASSERT(ggml_is_contiguous(src0));
@ -1091,16 +1280,15 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_UNARY:
                        switch (ggml_get_unary_op(gf->nodes[i])) {
-                            case GGML_UNARY_OP_SILU:
+                            case GGML_UNARY_OP_TANH:
                                {
-                                    [encoder setComputePipelineState:ctx->pipeline_silu];
+                                    [encoder setComputePipelineState:ctx->pipeline_tanh];
                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                                    const int64_t n = ggml_nelements(dst);
                                    GGML_ASSERT(n % 4 == 0);
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                } break;
                            case GGML_UNARY_OP_RELU:
                                {
@ -1121,6 +1309,28 @@ void ggml_metal_graph_compute(
                                    const int64_t n = ggml_nelements(dst);
                                    GGML_ASSERT(n % 4 == 0);
                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                } break;
                            case GGML_UNARY_OP_GELU_QUICK:
                                {
                                    [encoder setComputePipelineState:ctx->pipeline_gelu_quick];
                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                                    const int64_t n = ggml_nelements(dst);
                                    GGML_ASSERT(n % 4 == 0);
                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                } break;
                            case GGML_UNARY_OP_SILU:
                                {
                                    [encoder setComputePipelineState:ctx->pipeline_silu];
                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                                    const int64_t n = ggml_nelements(dst);
                                    GGML_ASSERT(n % 4 == 0);
                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                                } break;
                            default:
@ -1193,7 +1403,11 @@ void ggml_metal_graph_compute(
                            const float scale = ((float *) dst->op_params)[0];
                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
+                            if (id_src1) {
                                [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
                            } else {
                                [encoder setBuffer:id_src0 offset:offs_src0   atIndex:1];
                            }
                            [encoder setBuffer:id_dst  offset:offs_dst    atIndex:2];
                            [encoder setBytes:&ne00  length:sizeof(ne00)  atIndex:3];
                            [encoder setBytes:&ne01  length:sizeof(ne01)  atIndex:4];
@ -1444,7 +1658,7 @@ void ggml_metal_graph_compute(
                                else if (src0t == GGML_TYPE_Q6_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                } else {
-                                    int64_t ny = (ne11 + nrows - 1)/nrows;
+                                    const int64_t ny = (ne11 + nrows - 1)/nrows;
                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                            }
@ -1456,7 +1670,7 @@ void ggml_metal_graph_compute(
                            GGML_ASSERT(src0t == GGML_TYPE_I32);
-                            const int n_as = ne00;
+                            const int n_as = ((int32_t *) dst->op_params)[1];
                            // TODO: make this more general
                            GGML_ASSERT(n_as <= 8);
@ -1488,14 +1702,22 @@ void ggml_metal_graph_compute(
                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                            // to the matrix-vector kernel
-                            int ne11_mm_min = 0;
+                            int ne11_mm_min = 1;
                            const int idx = ((int32_t *) dst->op_params)[0];
                            // batch size
                            GGML_ASSERT(ne01 == ne11);
                            const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                            // !!!
-                                ne11 > ne11_mm_min) {
+                            // TODO: for now, always use mat-vec kernels until we figure out how to improve the
                            //       indirect matrix multiplication
                            // !!!
                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && _ne1 > ne11_mm_min) {
                                switch (src2->type) {
                                    case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32];  break;
                                    case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32];  break;
@ -1514,19 +1736,22 @@ void ggml_metal_graph_compute(
                                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
                                [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
                                [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                                [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:3];
+                                [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:3];
-                                [encoder setBytes:&ne22    length:sizeof(ne22) atIndex:4];
+                                [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
-                                [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:5];
+                                [encoder setBytes:&ne22    length:sizeof(ne22) atIndex:5];
-                                [encoder setBytes:&nb22    length:sizeof(nb22) atIndex:6];
+                                [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
-                                [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:7];
+                                [encoder setBytes:&nb22    length:sizeof(nb22) atIndex:7];
-                                [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:8];
+                                [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:8];
-                                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:9];
+                                [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:9];
-                                [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:10];
+                                [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:10];
-                                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:11];
+                                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:11];
-                                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:12];
+                                [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:12];
-                                [encoder setBytes:&r2      length:sizeof(r2)   atIndex:13];
+                                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
-                                [encoder setBytes:&r3      length:sizeof(r3)   atIndex:14];
+                                [encoder setBytes:&_ne1    length:sizeof(_ne1) atIndex:14];
-                                [encoder setBytes:&idx     length:sizeof(idx)  atIndex:15];
+                                [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:15];
                                [encoder setBytes:&r2      length:sizeof(r2)   atIndex:16];
                                [encoder setBytes:&r3      length:sizeof(r3)   atIndex:17];
                                [encoder setBytes:&idx     length:sizeof(idx)  atIndex:18];
                                // TODO: how to make this an array? read Metal docs
                                for (int j = 0; j < n_as; ++j) {
                                    struct ggml_tensor * src_cur = dst->src[2 + j];
@ -1534,11 +1759,157 @@ void ggml_metal_graph_compute(
                                    size_t offs_src_cur = 0;
                                    id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
-                                    [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:16 + j];
+                                    [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
                                }
                                [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne21 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+
                                // TODO: processing one row at a time (ne11 -> 1) is not efficient
                                [encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                            } else {
                                int nth0 = 32;
                                int nth1 = 1;
                                int nrows = 1;
                                //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
                                // use custom matrix x vector kernel
                                switch (src2t) {
                                    case GGML_TYPE_F32:
                                        {
                                            GGML_ASSERT(src1t == GGML_TYPE_F32);
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f32_f32];
                                        } break;
                                    case GGML_TYPE_F16:
                                        {
                                            GGML_ASSERT(src1t == GGML_TYPE_F32);
                                            nth0 = 32;
                                            nth1 = 1;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_f16_f32];
                                        } break;
                                    case GGML_TYPE_Q4_0:
                                        {
                                            nth0 = 8;
                                            nth1 = 8;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_0_f32];
                                        } break;
                                    case GGML_TYPE_Q4_1:
                                        {
                                            nth0 = 8;
                                            nth1 = 8;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_1_f32];
                                        } break;
                                    case GGML_TYPE_Q5_0:
                                        {
                                            nth0 = 8;
                                            nth1 = 8;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_0_f32];
                                        } break;
                                    case GGML_TYPE_Q5_1:
                                        {
                                            nth0 = 8;
                                            nth1 = 8;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_1_f32];
                                        } break;
                                    case GGML_TYPE_Q8_0:
                                        {
                                            nth0 = 8;
                                            nth1 = 8;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q8_0_f32];
                                        } break;
                                    case GGML_TYPE_Q2_K:
                                        {
                                            nth0 = 2;
                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q2_K_f32];
                                        } break;
                                    case GGML_TYPE_Q3_K:
                                        {
                                            nth0 = 2;
                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q3_K_f32];
                                        } break;
                                    case GGML_TYPE_Q4_K:
                                        {
                                            nth0 = 4; //1;
                                            nth1 = 8; //32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q4_K_f32];
                                        } break;
                                    case GGML_TYPE_Q5_K:
                                        {
                                            nth0 = 2;
                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q5_K_f32];
                                        } break;
                                    case GGML_TYPE_Q6_K:
                                        {
                                            nth0 = 2;
                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q6_K_f32];
                                        } break;
                                    default:
                                        {
                                            // No mul_mv_id pipeline is selected above for this src2 type.
                                            // The enclosing switch dispatches on src2t, so report that value;
                                            // src0t is asserted to be GGML_TYPE_I32 for this op and would make
                                            // the message misleading.
                                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
                                            GGML_ASSERT(false && "not implemented");
                                        }
                                };
                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
                                [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
                                [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
                                [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6];
                                [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7];
                                [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8];
                                [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9];
                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
                                [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11];
                                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
                                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:17];
                                [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18];
                                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:19];
                                [encoder setBytes:&r2   length:sizeof(r2)   atIndex:20];
                                [encoder setBytes:&r3   length:sizeof(r3)   atIndex:21];
                                [encoder setBytes:&idx  length:sizeof(idx)  atIndex:22];
                                // TODO: how to make this an array? read Metal docs
                                for (int j = 0; j < n_as; ++j) {
                                    struct ggml_tensor * src_cur = dst->src[2 + j];
                                    size_t offs_src_cur = 0;
                                    id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
                                    [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
                                }
                                if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
                                    src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
                                    src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                                else if (src2t == GGML_TYPE_Q4_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                                else if (src2t == GGML_TYPE_Q3_K) {
 #ifdef GGML_QKK_64
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 #else
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 #endif
                                }
                                else if (src2t == GGML_TYPE_Q5_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                                else if (src2t == GGML_TYPE_Q6_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                } else {
                                    const int64_t ny = (_ne1 + nrows - 1)/nrows;
                                    [encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                            }
                        } break;
                    case GGML_OP_GET_ROWS:
@ -1559,16 +1930,19 @@ void ggml_metal_graph_compute(
                                default: GGML_ASSERT(false && "not implemented");
                            }
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                            [encoder setBuffer:id_dst      offset:offs_dst  atIndex:2];
                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
-                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:5];
+                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
                            [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
                            [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
                            [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:9];
                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:10];
-                            const int64_t n = ggml_nelements(src1);
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_RMS_NORM:
                        {
@ -1595,6 +1969,38 @@ void ggml_metal_graph_compute(
                            [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_GROUP_NORM:
                        {
                            GGML_ASSERT(ne00 % 4 == 0);
                            //float eps;
                            //memcpy(&eps, dst->op_params, sizeof(float));
                            const float eps = 1e-6f; // TODO: temporarily hardcoded
                            const int32_t n_groups = ((int32_t *) dst->op_params)[0];
                            int nth = 32; // SIMD width
                            //while (nth < ne00/4 && nth < 1024) {
                            //    nth *= 2;
                            //}
                            [encoder setComputePipelineState:ctx->pipeline_group_norm];
                            [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
                            [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
                            [encoder setBytes:&ne00     length:sizeof( int64_t) atIndex:2];
                            [encoder setBytes:&ne01     length:sizeof( int64_t) atIndex:3];
                            [encoder setBytes:&ne02     length:sizeof( int64_t) atIndex:4];
                            [encoder setBytes:&nb00     length:sizeof(uint64_t) atIndex:5];
                            [encoder setBytes:&nb01     length:sizeof(uint64_t) atIndex:6];
                            [encoder setBytes:&nb02     length:sizeof(uint64_t) atIndex:7];
                            [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
                            [encoder setBytes:&eps      length:sizeof(   float) atIndex:9];
                            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
                            [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_NORM:
                        {
                            float eps;
@ -1764,6 +2170,65 @@ void ggml_metal_graph_compute(
                            [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
                        } break;
                    case GGML_OP_UPSCALE:
                        {
                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
                            const int sf = dst->op_params[0];
                            [encoder setComputePipelineState:ctx->pipeline_upscale_f32];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
                            [encoder setBytes:&sf   length:sizeof(sf)   atIndex:18];
                            const int nth = MIN(1024, ne0);
                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_PAD:
                        {
                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
                            [encoder setComputePipelineState:ctx->pipeline_pad_f32];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
                            const int nth = MIN(1024, ne0);
                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_ARGSORT:
                        {
                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
@ -1785,6 +2250,22 @@ void ggml_metal_graph_compute(
                            [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)];
                        } break;
                    case GGML_OP_LEAKY_RELU:
                        {
                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
                            float slope;
                            memcpy(&slope, dst->op_params, sizeof(float));
                            [encoder setComputePipelineState:ctx->pipeline_leaky_relu_f32];
                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
                            [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
                            [encoder setBytes:&slope length:sizeof(slope) atIndex:2];
                            const int64_t n = ggml_nelements(dst);
                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
                    case GGML_OP_DUP:
                    case GGML_OP_CPY:
                    case GGML_OP_CONT:
@ -1813,7 +2294,7 @@ void ggml_metal_graph_compute(
                                    {
                                        switch (dstt) {
                                            case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break;
-                                            case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break;
+                                            case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f32]; break;
                                            default: GGML_ASSERT(false && "not implemented");
                                        };
                                    } break;
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/ggml-quants.c
+++ b/ggml-quants.c
@ -3114,7 +3114,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
    size_t vl = __riscv_vsetvl_e8m1(qk/2);
-    // These tempory registers are for masking and shift operations
+    // These temporary registers are for masking and shift operations
    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
    vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
@ -4757,7 +4757,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
            vl = 16;
-            // retreive lane to multiply with scale
+            // retrieve lane to multiply with scale
            vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
            vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
            vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -215,9 +215,9 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 #define GGML_MAX_DIMS           4
-#define GGML_MAX_PARAMS         1024
+#define GGML_MAX_PARAMS         2048
 #define GGML_MAX_CONTEXTS       64
-#define GGML_MAX_SRC            6
+#define GGML_MAX_SRC            10
 #define GGML_MAX_NAME           64
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
@ -303,7 +303,7 @@ extern "C" {
 #if defined(__ARM_NEON) && defined(__CUDACC__)
    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
    typedef __fp16 ggml_fp16_t;
 #else
    typedef uint16_t ggml_fp16_t;
@ -343,6 +343,12 @@ extern "C" {
        GGML_TYPE_COUNT,
    };
    // precision
    // Selector for the numeric precision of an operation. Presumably the value passed to
    // ggml_mul_mat_set_prec() (declared later in this header) - confirm against its full signature.
    enum ggml_prec {
        GGML_PREC_DEFAULT, // first enumerator (value 0): no explicit precision override
        GGML_PREC_F32,     // higher precision; the ggml_mul_mat_set_prec() note says this is useful for phi-2
    };
    enum ggml_backend_type {
        GGML_BACKEND_CPU = 0,
        GGML_BACKEND_GPU = 10,
@ -423,7 +429,9 @@ extern "C" {
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_PAD,
        GGML_OP_ARGSORT,
        GGML_OP_LEAKY_RELU,
        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
@ -463,7 +471,6 @@ extern "C" {
        GGML_UNARY_OP_GELU,
        GGML_UNARY_OP_GELU_QUICK,
        GGML_UNARY_OP_SILU,
        GGML_UNARY_OP_LEAKY,
        GGML_UNARY_OP_COUNT,
    };
@ -501,7 +508,6 @@ extern "C" {
        struct ggml_backend_buffer * buffer;
        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
        size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                   // nb[0] = ggml_type_size(type)
@ -533,7 +539,7 @@ extern "C" {
        void * extra; // extra things e.g. for ggml-cuda.cu
-        char padding[12];
+        char padding[8];
    };
    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -638,11 +644,14 @@ extern "C" {
    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
-    GGML_API int     ggml_blck_size (enum ggml_type type);
+    GGML_API int    ggml_blck_size(enum ggml_type type);
-    GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
+    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
-    GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
    GGML_DEPRECATED(
    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
    "use ggml_row_size() instead");
    GGML_API const char * ggml_type_name(enum ggml_type type);
    GGML_API const char * ggml_op_name  (enum ggml_op   op);
@ -661,6 +670,11 @@ extern "C" {
    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@ -793,6 +807,9 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    // dst = a
    // view(dst, nb1, nb2, nb3, offset) += b
    // return dst
    GGML_API struct ggml_tensor * ggml_acc(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -957,15 +974,14 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
-    GGML_API struct ggml_tensor * ggml_leaky(
+    GGML_API struct ggml_tensor * ggml_leaky_relu(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a, float negative_slope, bool inplace);
    GGML_API struct ggml_tensor * ggml_relu_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // TODO: double-check this computation is correct
    GGML_API struct ggml_tensor * ggml_gelu(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
@ -1047,11 +1063,18 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    // change the precision of a matrix multiplication
    // set to GGML_PREC_F32 for higher precision (useful for phi-2)
    GGML_API void ggml_mul_mat_set_prec(
            struct ggml_tensor * a,
            enum ggml_prec       prec);
    // indirect matrix multiplication
    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
    GGML_API struct ggml_tensor * ggml_mul_mat_id(
            struct ggml_context * ctx,
-            struct ggml_tensor  * as[],
+            struct ggml_tensor  * const as[],
            int                   n_as,
            struct ggml_tensor  * ids,
            int                   id,
            struct ggml_tensor  * b);
@ -1263,6 +1286,7 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // supports 3D: a->ne[2] == b->ne[1]
    GGML_API struct ggml_tensor * ggml_get_rows(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -1549,6 +1573,15 @@ extern "C" {
            struct ggml_tensor  * a,
            int                   scale_factor);
    // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
    GGML_API struct ggml_tensor * ggml_pad(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                  p0,
            int                  p1,
            int                  p2,
            int                  p3);
    // sort rows
    enum ggml_sort_order {
        GGML_SORT_ASC,
--- a/gguf-py/README.md
+++ b/gguf-py/README.md
@ -61,7 +61,7 @@ If you want to publish the package manually for any reason, you need to have `tw
 pip install build twine
 ```
-Then, folow these steps to release a new version:
+Then, follow these steps to release a new version:
 1. Bump the version in `pyproject.toml`.
 2. Build the package:
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -38,6 +38,8 @@ class Keys:
        FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
        USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
        TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
        EXPERT_COUNT          = "{arch}.expert_count"
        EXPERT_USED_COUNT     = "{arch}.expert_used_count"
    class Attention:
        HEAD_COUNT        = "{arch}.attention.head_count"
@ -93,6 +95,7 @@ class MODEL_ARCH(IntEnum):
    BLOOM     = auto()
    STABLELM  = auto()
    QWEN      = auto()
    PHI2      = auto()
 class MODEL_TENSOR(IntEnum):
@ -111,11 +114,16 @@ class MODEL_TENSOR(IntEnum):
    ATTN_NORM       = auto()
    ATTN_NORM_2     = auto()
    ATTN_ROT_EMBD   = auto()
    FFN_GATE_INP    = auto()
    FFN_NORM        = auto()
    FFN_GATE        = auto()
    FFN_DOWN        = auto()
    FFN_UP          = auto()
    FFN_ACT         = auto()
    FFN_NORM        = auto()
    FFN_GATE_EXP    = auto()
    FFN_DOWN_EXP    = auto()
    FFN_UP_EXP      = auto()
    ATTN_Q_NORM     = auto()
    ATTN_K_NORM     = auto()
@ -135,6 +143,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.BLOOM:          "bloom",
    MODEL_ARCH.STABLELM:       "stablelm",
    MODEL_ARCH.QWEN:           "qwen",
    MODEL_ARCH.PHI2:           "phi2",
 }
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -155,11 +164,15 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.ATTN_ROT_EMBD:   "blk.{bid}.attn_rot_embd",
    MODEL_TENSOR.ATTN_Q_NORM:     "blk.{bid}.attn_q_norm",
    MODEL_TENSOR.ATTN_K_NORM:     "blk.{bid}.attn_k_norm",
    MODEL_TENSOR.FFN_GATE_INP:    "blk.{bid}.ffn_gate_inp",
    MODEL_TENSOR.FFN_NORM:        "blk.{bid}.ffn_norm",
    MODEL_TENSOR.FFN_GATE:        "blk.{bid}.ffn_gate",
    MODEL_TENSOR.FFN_DOWN:        "blk.{bid}.ffn_down",
    MODEL_TENSOR.FFN_UP:          "blk.{bid}.ffn_up",
    MODEL_TENSOR.FFN_ACT:         "blk.{bid}.ffn",
    MODEL_TENSOR.FFN_GATE_EXP:    "blk.{bid}.ffn_gate.{xid}",
    MODEL_TENSOR.FFN_DOWN_EXP:    "blk.{bid}.ffn_down.{xid}",
    MODEL_TENSOR.FFN_UP_EXP:      "blk.{bid}.ffn_up.{xid}",
 }
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -174,10 +187,14 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.ATTN_V,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.ATTN_ROT_EMBD,
        MODEL_TENSOR.FFN_GATE_INP,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
        MODEL_TENSOR.FFN_GATE_EXP,
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
    ],
    MODEL_ARCH.GPTNEOX: [
        MODEL_TENSOR.TOKEN_EMBD,
@ -339,6 +356,17 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
    MODEL_ARCH.GPT2: [
        # TODO
    ],
    MODEL_ARCH.PHI2: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_QKV,
        MODEL_TENSOR.ATTN_OUT,
        MODEL_TENSOR.FFN_NORM,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ]
    # TODO
 }
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@ -339,6 +339,12 @@ class GGUFWriter:
    def add_clamp_kqv(self, value: float) -> None:
        """Write `value` as a float32 under the arch-formatted Keys.Attention.CLAMP_KQV key."""
        clamp_key = Keys.Attention.CLAMP_KQV.format(arch=self.arch)
        self.add_float32(clamp_key, value)
    def add_expert_count(self, count: int) -> None:
        """Write `count` as a uint32 under the arch-formatted Keys.LLM.EXPERT_COUNT key."""
        expert_count_key = Keys.LLM.EXPERT_COUNT.format(arch=self.arch)
        self.add_uint32(expert_count_key, count)
    def add_expert_used_count(self, count: int) -> None:
        """Write `count` as a uint32 under the arch-formatted Keys.LLM.EXPERT_USED_COUNT key."""
        used_count_key = Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch)
        self.add_uint32(used_count_key, count)
    def add_layer_norm_eps(self, value: float) -> None:
        """Write `value` as a float32 under the arch-formatted Keys.Attention.LAYERNORM_EPS key."""
        eps_key = Keys.Attention.LAYERNORM_EPS.format(arch=self.arch)
        self.add_float32(eps_key, value)
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@ -17,6 +17,7 @@ class TensorNameMap:
            "tok_embeddings",                            # llama-pth
            "embeddings.word_embeddings",                # bert
            "language_model.embedding.word_embeddings",  # persimmon
            "transformer.embd.wte",                      # phi2
        ),
        # Token type embeddings
@ -41,6 +42,7 @@ class TensorNameMap:
            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen
            "output",                    # llama-pth bloom
            "word_embeddings_for_head",  # persimmon
            "lm_head.linear",            # phi2
        ),
        # Output norm
@ -53,6 +55,7 @@ class TensorNameMap:
            "transformer.norm_f",                      # mpt
            "ln_f",                                    # refact bloom qwen
            "language_model.encoder.final_layernorm",  # persimmon
            "lm_head.ln",                              # phi2
        ),
        # Rope frequencies
@ -75,6 +78,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.attention.output.LayerNorm",       # bert
            "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
            "model.layers.{bid}.ln1",                               # yi
            "transformer.h.{bid}.ln",                               # phi2
        ),
        # Attention norm 2
@ -90,6 +94,7 @@ class TensorNameMap:
            "transformer.h.{bid}.self_attention.query_key_value",                  # falcon
            "h.{bid}.self_attention.query_key_value",                              # bloom
            "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
            "transformer.h.{bid}.mixer.Wqkv",                                      # phi2
        ),
        # Attention query
@ -128,6 +133,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.attention.output.dense",                # bert
            "transformer.h.{bid}.attn.out_proj",                         # gpt-j
            "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
            "transformer.h.{bid}.mixer.out_proj",                        # phi2
        ),
        # Rotary embeddings
@ -149,6 +155,11 @@ class TensorNameMap:
            "model.layers.{bid}.ln2",                                        # yi
        ),
        MODEL_TENSOR.FFN_GATE_INP: (
            "layers.{bid}.feed_forward.gate",           # mixtral
            "model.layers.{bid}.block_sparse_moe.gate", # mixtral
        ),
        # Feed-forward up
        MODEL_TENSOR.FFN_UP: (
            "gpt_neox.layers.{bid}.mlp.dense_h_to_4h",                # gptneox
@ -162,6 +173,12 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.fc_in",                          # gpt-j
            "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
            "transformer.h.{bid}.mlp.w1",                             # qwen
            "transformer.h.{bid}.mlp.fc1",                            # phi2
        ),
        MODEL_TENSOR.FFN_UP_EXP: (
            "layers.{bid}.feed_forward.experts.{xid}.w3",           # mixtral
            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
        ),
        # AWQ-activation gate
@ -171,9 +188,14 @@ class TensorNameMap:
        # Feed-forward gate
        MODEL_TENSOR.FFN_GATE: (
-            "model.layers.{bid}.mlp.gate_proj",  # llama-hf refact
+            "model.layers.{bid}.mlp.gate_proj",           # llama-hf refact
-            "layers.{bid}.feed_forward.w1",      # llama-pth
+            "layers.{bid}.feed_forward.w1",               # llama-pth
-            "transformer.h.{bid}.mlp.w2",        # qwen
+            "transformer.h.{bid}.mlp.w2",                 # qwen
        ),
        MODEL_TENSOR.FFN_GATE_EXP: (
            "layers.{bid}.feed_forward.experts.{xid}.w1",           # mixtral
            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral
        ),
        # Feed-forward down
@ -188,6 +210,12 @@ class TensorNameMap:
            "encoder.layer.{bid}.output.dense",                       # bert
            "transformer.h.{bid}.mlp.fc_out",                         # gpt-j
            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
            "transformer.h.{bid}.mlp.fc2",                            # phi2
        ),
        MODEL_TENSOR.FFN_DOWN_EXP: (
            "layers.{bid}.feed_forward.experts.{xid}.w2",           # mixtral
            "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral
        ),
        MODEL_TENSOR.ATTN_Q_NORM: (
@ -218,11 +246,14 @@ class TensorNameMap:
            for tensor, keys in self.block_mappings_cfg.items():
                if tensor not in MODEL_TENSORS[arch]:
                    continue
-                tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
+                # TODO: make this configurable
-                self.mapping[tensor_name] = (tensor, tensor_name)
+                n_experts = 8
-                for key in keys:
+                for xid in range(n_experts):
-                    key = key.format(bid = bid)
+                    tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
-                    self.mapping[key] = (tensor, tensor_name)
+                    self.mapping[tensor_name] = (tensor, tensor_name)
                    for key in keys:
                        key = key.format(bid = bid, xid = xid)
                        self.mapping[key] = (tensor, tensor_name)
    def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
        result = self.mapping.get(key)
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@ -109,8 +109,10 @@ class SpecialVocab:
        return True
    def _set_special_token(self, typ: str, tid: Any) -> None:
-        if not isinstance(tid, int) or tid < 0:
+        if not isinstance(tid, int):
            return
        if tid < 0:
            raise ValueError(f'invalid value for special token type {typ}: {tid}')
        if self.n_vocab is None or tid < self.n_vocab:
            if typ in self.special_token_ids:
                return
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.6.0"
+version = "0.7.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@ -39,6 +39,7 @@
 #define LLAMA_MAX_RNG_STATE (64*1024)
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
@ -217,7 +218,7 @@ extern "C" {
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        bool logits_all;  // the llama_eval() call computes all logits, not just the last one
+        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
        bool embedding;   // embedding mode only
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
    };
--- a/requirements.txt
+++ b/requirements.txt
@ -1,3 +1,5 @@
 numpy==1.24.4
 sentencepiece==0.1.98
 transformers>=4.34.0
 gguf>=0.1.0
 protobuf>=4.21.0
--- a/scripts/get-flags.mk
+++ b/scripts/get-flags.mk
@ -0,0 +1,38 @@
 # Detect the compiler behind GF_CC and pick extra warning flags for it.
 #
 # Input:  GF_CC        - compiler command to probe (run with --version / -dump*version)
 # Output: GF_CFLAGS    - extra C warning flags
 #         GF_CXXFLAGS  - extra C++ warning flags
 #         GF_CC_VER    - version as six digits, two per component (e.g. 7.3.0 -> 070300)
 #         GF_CC_IS_GCC / GF_CC_IS_CLANG / GF_CC_IS_LLVM_CLANG / GF_CC_IS_APPLE_CLANG
 #
 # The compiler counts as clang when its --version output contains "clang"; otherwise it
 # is treated as gcc. Among clang builds, "Apple" in the --version output selects Apple clang.
 # For gcc, -dumpfullversion is tried first, with -dumpversion as the fallback when it fails.
 ifeq '' '$(findstring clang,$(shell $(GF_CC) --version))'
 	GF_CC_IS_GCC = 1
 	GF_CC_VER := $(shell { $(GF_CC) -dumpfullversion 2>/dev/null || $(GF_CC) -dumpversion; } | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
 else
 	GF_CC_IS_CLANG = 1
 	ifeq '' '$(findstring Apple,$(shell $(GF_CC) --version))'
 		GF_CC_IS_LLVM_CLANG = 1
 	else
 		GF_CC_IS_APPLE_CLANG = 1
 	endif
 	GF_CC_VER := \
 		$(shell $(GF_CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
 		| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
 endif
 # Flag selection. Version thresholds are checked by passing GF_CC_VER to expr:
 #   clang: -Wdouble-promotion only for LLVM clang >= 3.8.0 or Apple clang >= 7.3.0
 #   gcc:   -Wno-format-truncation for >= 7.1.0, -Wextra-semi for >= 8.1.0
 ifeq ($(GF_CC_IS_CLANG), 1)
 	# clang options
 	GF_CFLAGS   = -Wunreachable-code-break -Wunreachable-code-return
 	GF_CXXFLAGS = -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
 	ifneq '' '$(and $(GF_CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 030800)))'
 		GF_CFLAGS += -Wdouble-promotion
 	endif
 	ifneq '' '$(and $(GF_CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(GF_CC_VER) \>= 070300)))'
 		GF_CFLAGS += -Wdouble-promotion
 	endif
 else
 	# gcc options
 	GF_CFLAGS   = -Wdouble-promotion
 	GF_CXXFLAGS = -Wno-array-bounds
 	ifeq ($(shell expr $(GF_CC_VER) \>= 070100), 1)
 		GF_CXXFLAGS += -Wno-format-truncation
 	endif
 	ifeq ($(shell expr $(GF_CC_VER) \>= 080100), 1)
 		GF_CXXFLAGS += -Wextra-semi
 	endif
 endif
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -20,8 +20,6 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
    size_t size = ggml_nelements(tensor);
    std::vector<float> data(size);
    std::random_device rd;
 #if 0
    std::default_random_engine generator(rd());
    std::uniform_real_distribution<float> distribution(min, max);
@ -31,6 +29,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
    }
 #endif
    auto init_thread = [&](size_t start, size_t end) {
        std::random_device rd;
        std::default_random_engine generator(rd());
        std::uniform_real_distribution<float> distribution(min, max);
@ -51,11 +50,11 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
        t.join();
    }
-    if (tensor->type == GGML_TYPE_F32) {
+    if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
        ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
        GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
-        std::vector<uint8_t> dataq(ggml_type_size(tensor->type)*size/ggml_blck_size(tensor->type));
+        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
        int64_t hist[16];
        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist);
        ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
@ -71,23 +70,29 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
    std::vector<uint8_t> buf(ggml_nbytes(t));
    ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
    ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
    size_t bs = ggml_blck_size(t->type);
    std::vector<float> vq(ggml_blck_size(t->type));
    bool quantized = ggml_is_quantized(t->type);
    // access elements by index to avoid gaps in views
    for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
        for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
            for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
-                for (int64_t i0 = 0; i0 < t->ne[0]; i0++) {
+                for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) {
-                    size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0*t->nb[0];
+                    size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
                    float v;
                    if (t->type == GGML_TYPE_F16) {
-                        v = (float) ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]);
+                        tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
                    } else if (t->type == GGML_TYPE_F32) {
-                        v = *(float *) &buf[i];
+                        tv.push_back(*(float *) &buf[i]);
                    } else if (t->type == GGML_TYPE_I32) {
-                        v = *(int32_t *) &buf[i];
+                        tv.push_back((float)*(int32_t *) &buf[i]);
                    } else if (quantized) {
                        tt.to_float(&buf[i], vq.data(), bs);
                        tv.insert(tv.end(), vq.begin(), vq.end());
                    } else {
                        GGML_ASSERT(false);
                    }
                    tv.push_back(v);
                }
            }
        }
@ -230,9 +235,18 @@ static bool ggml_is_view_op(enum ggml_op op) {
    return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
 }
 // Run mode of a test_case. test_case::eval() sets MODE_TEST and
 // test_case::eval_perf() sets MODE_PERF; add_sentinel() is a no-op in MODE_PERF.
 enum test_mode {
    MODE_TEST,
    MODE_PERF,
 };
 struct test_case {
    virtual ~test_case() {}
    // Description string for tensor `t`; the default is whatever ggml_op_desc() reports.
    virtual std::string op_desc(ggml_tensor * t) {
        std::string desc = ggml_op_desc(t);
        return desc;
    }
    // Parameter string printed next to the op description; empty unless overridden.
    virtual std::string vars() {
        return std::string();
    }
@ -240,7 +254,7 @@ struct test_case {
    virtual ggml_tensor * build_graph(ggml_context * ctx) = 0;
    virtual double max_nmse_err() {
-        return 1e-6;
+        return 1e-7;
    }
    virtual void initialize_tensors(ggml_context * ctx) {
@ -260,7 +274,58 @@ struct test_case {
        return size;
    }
    ggml_cgraph * gf = nullptr;
    static const int sentinel_size = 1024;
    test_mode mode;
    std::vector<ggml_tensor *> sentinels;
    // Create one F32 1-D tensor of sentinel_size elements in `ctx`, name it
    // "sent_<index>" and record it in `sentinels`. Does nothing in MODE_PERF.
    void add_sentinel(ggml_context * ctx) {
        if (mode != MODE_PERF) {
            ggml_tensor * guard = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
            // index in the name is the number of sentinels recorded so far
            const size_t guard_index = sentinels.size();
            ggml_format_name(guard, "sent_%zu", guard_index);
            sentinels.push_back(guard);
        }
    }
    // Member overload that shadows ::ggml_new_tensor: forwards to the global function,
    // then appends a sentinel so overflows in the backend can be checked.
    ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) {
        ggml_tensor * const created = ::ggml_new_tensor(ctx, type, n_dims, ne);
        add_sentinel(ctx);
        return created;
    }
    // 1-D variant: forwards to ::ggml_new_tensor_1d, then adds a sentinel after the new tensor.
    ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) {
        ggml_tensor * const created = ::ggml_new_tensor_1d(ctx, type, ne0);
        add_sentinel(ctx);
        return created;
    }
    // 2-D variant: forwards to ::ggml_new_tensor_2d, then adds a sentinel after the new tensor.
    ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) {
        ggml_tensor * const created = ::ggml_new_tensor_2d(ctx, type, ne0, ne1);
        add_sentinel(ctx);
        return created;
    }
    // 3-D variant: forwards to ::ggml_new_tensor_3d, then adds a sentinel after the new tensor.
    ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) {
        ggml_tensor * const created = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2);
        add_sentinel(ctx);
        return created;
    }
    // 4-D variant: forwards to ::ggml_new_tensor_4d, then adds a sentinel after the new tensor.
    ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
        ggml_tensor * const created = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
        add_sentinel(ctx);
        return created;
    }
    bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) {
        mode = MODE_TEST;
        ggml_init_params params = {
            /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
            /* .mem_base = */ NULL,
@ -268,15 +333,20 @@ struct test_case {
        };
        ggml_context * ctx = ggml_init(params);
        gf = ggml_new_graph(ctx);
        // pre-graph sentinel
        add_sentinel(ctx);
        ggml_tensor * out = build_graph(ctx);
-        if (op_name != nullptr && strcmp(ggml_op_desc(out), op_name) != 0) {
+        if (op_name != nullptr && op_desc(out) != op_name) {
-            //printf("  %s: skipping\n", ggml_op_desc(out));
+            //printf("  %s: skipping\n", op_desc(out).c_str());
            ggml_free(ctx);
            return true;
        }
-        printf("  %s(%s): ", ggml_op_desc(out), vars().c_str());
+        printf("  %s(%s): ", op_desc(out).c_str(), vars().c_str());
        fflush(stdout);
        // check if backends support op
@ -288,13 +358,20 @@ struct test_case {
            }
        }
        // post-graph sentinel
        add_sentinel(ctx);
        // allocate
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
        // build graph
        ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, out);
        // add sentinels as graph nodes so that they are checked in the callback
        for (ggml_tensor * sentinel : sentinels) {
            gf->nodes[gf->n_nodes++] = sentinel;
        }
        // randomize tensors
        initialize_tensors(ctx);
@ -310,14 +387,29 @@ struct test_case {
        };
        auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
            callback_userdata * ud = (callback_userdata *) user_data;
            if (t1->op == GGML_OP_NONE) {
                // sentinels must be unchanged
                std::vector<uint8_t> t1_data(ggml_nbytes(t1));
                std::vector<uint8_t> t2_data(ggml_nbytes(t2));
                ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1));
                ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2));
                if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) {
                    printf("sentinel mismatch: %s ", t1->name);
                    ud->ok = false;
                    return true;
                }
            }
            std::vector<float> f1 = tensor_to_float(t1);
            std::vector<float> f2 = tensor_to_float(t2);
            callback_userdata * ud = (callback_userdata *) user_data;
            for (size_t i = 0; i < f1.size(); i++) {
                // check for nans
                if (std::isnan(f1[i]) || std::isnan(f2[i])) {
-                    printf("NaN at index %zu ", i);
+                    printf("[%s] NaN at index %zu (%f %f) ", ggml_op_desc(t1), i, f1[i], f2[i]);
                    ud->ok = false;
                    return true;
                }
@ -325,12 +417,12 @@ struct test_case {
                if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
                    if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
                        if (std::signbit(f1[i]) != std::signbit(f2[i])) {
-                            printf("inf sign mismatch: %f %f ", f1[i], f2[i]);
+                            printf("[%s] inf sign mismatch: %f %f ", ggml_op_desc(t1), f1[i], f2[i]);
                            ud->ok = false;
                            return true;
                        }
                    } else {
-                        printf("inf mismatch: %f %f ", f1[i], f2[i]);
+                        printf("[%s] inf mismatch: %f %f ", ggml_op_desc(t1), f1[i], f2[i]);
                        ud->ok = false;
                        return true;
                    }
@ -339,10 +431,17 @@ struct test_case {
            double err = nmse(f1.data(), f2.data(), f1.size());
            if (err > ud->max_err) {
-                printf("NMSE = %f ", err);
+                printf("[%s] NMSE = %f ", ggml_op_desc(t1), err);
                //for (int i = 0; i < f1.size(); i++) {
                //    printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
                //}
                //printf("\n");
                //exit(1);
                ud->ok = false;
            }
            return true;
            GGML_UNUSED(index);
        };
        ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
@ -361,6 +460,8 @@ struct test_case {
    }
    bool eval_perf(ggml_backend_t backend, const char * op_name) {
        mode = MODE_PERF;
        static const size_t graph_nodes = 8192;
        ggml_init_params params = {
@ -372,13 +473,13 @@ struct test_case {
        ggml_tensor * out = build_graph(ctx);
-        if (op_name != nullptr && strcmp(ggml_op_desc(out), op_name) != 0) {
+        if (op_name != nullptr && op_desc(out) != op_name) {
-            //printf("  %s: skipping\n", ggml_op_desc(out));
+            //printf("  %s: skipping\n", op_desc(out).c_str());
            ggml_free(ctx);
            return true;
        }
-        int len = printf("  %s(%s): ", ggml_op_desc(out), vars().c_str());
+        int len = printf("  %s(%s): ", op_desc(out).c_str(), vars().c_str());
        fflush(stdout);
        // check if backends support op
@ -430,8 +531,9 @@ struct test_case {
            return size;
        };
        for (int i = 0; i < gf->n_nodes; i++) {
-            if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out)
+            if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
                continue;
            }
            mem += tensor_op_size(gf->nodes[i]);
        }
@ -486,17 +588,22 @@ struct test_get_rows : public test_case {
    const int n; // cols
    const int m; // rows
    const int r; // rows to get
    const int b; // batch size
    const bool v; // view (non-contiguous src1)
    std::string vars() override {
-        return VARS_TO_STR4(type, n, m, r);
+        return VARS_TO_STR6(type, n, m, r, b, v);
    }
-    test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3)
+    test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
-        : type(type), n(n), m(m), r(r) {}
+        : type(type), n(n), m(m), r(r), b(b), v(v) {}
    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * in = ggml_new_tensor_2d(ctx, type, n, m);
+        ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
-        ggml_tensor * rows = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, r);
+        ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
        if (v) {
            rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
        }
        ggml_tensor * out = ggml_get_rows(ctx, in, rows);
        return out;
    }
@ -504,12 +611,13 @@ struct test_get_rows : public test_case {
    void initialize_tensors(ggml_context * ctx) override {
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            if (t->type == GGML_TYPE_I32) {
                if (ggml_is_view_op(t->op)) { continue; }
                // rows
-                std::vector<int> data(r);
+                std::vector<int> data(r*b);
-                for (int i = 0; i < r; i++) {
+                for (int i = 0; i < r*b; i++) {
                    data[i] = rand() % m;
                }
-                ggml_backend_tensor_set(t, data.data(), 0, r * sizeof(int));
+                ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
            } else {
                init_tensor_uniform(t);
            }
@ -770,11 +878,10 @@ struct test_mul_mat_id : public test_case {
    const int64_t m;
    const int64_t n;
    const int64_t k;
-    const std::array<int64_t, 2> bs; // dims 3 and 4
+    const bool v; // view (non-contiguous ids)
    const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
    std::string vars() override {
-        return VARS_TO_STR9(type_a, type_b, n_mats, id, m, n, k, bs, nr);
+        return VARS_TO_STR8(type_a, type_b, n_mats, id, m, n, k, v);
    }
    double max_nmse_err() override {
@ -782,7 +889,7 @@ struct test_mul_mat_id : public test_case {
    }
    size_t op_size(ggml_tensor * t) override {
-        size_t a = ggml_nbytes(t->src[2]) * n * nr[0] * nr[1];
+        size_t a = ggml_nbytes(t->src[2]) * n;
        size_t b = ggml_nbytes(t->src[1]) * m;
        size_t c  = ggml_nbytes(t);
        return a + b + c;
@ -792,35 +899,41 @@ struct test_mul_mat_id : public test_case {
    test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
            int n_mats = 2, int id = 0,
-            int64_t m = 32, int64_t n = 32, int64_t k = 32,
+            int64_t m = 32, int64_t n = 32, int64_t k = 32, bool v = false)
            std::array<int64_t, 2> bs = {10, 10},
            std::array<int64_t, 2> nr = {2, 2})
        : type_a(type_a), type_b(type_b), n_mats(n_mats), id(id),
-            m(m), n(n), k(k), bs(bs), nr(nr) {}
+            m(m), n(n), k(k), v(v) {}
    ggml_tensor * build_graph(ggml_context * ctx) override {
        // C^T = A * B^T: (k, m) * (k, n) => (m, n)
        std::vector<ggml_tensor *> mats;
        for (int i = 0; i < n_mats; i++) {
-            ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
+            ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m);
            mats.push_back(a);
        }
-        ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_mats);
+        ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
-        ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
+        if (v) {
-        ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), ids, id, b);
+            ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0);
        }
        ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n);
        ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, v ? id/2 : id, b);
        return out;
    }
    void initialize_tensors(ggml_context * ctx) override {
        std::random_device rd;
        std::default_random_engine rng(rd());
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            if (t->type == GGML_TYPE_I32) {
                if (ggml_is_view_op(t->op)) { continue; }
                // ids
-                std::vector<int> data(n_mats);
+                for (int64_t r = 0; r < ggml_nrows(t); r++) {
-                for (int i = 0; i < n_mats; i++) {
+                    std::vector<int32_t> data(t->ne[0]);
-                    data[i] = i;
+                    for (int i = 0; i < t->ne[0]; i++) {
                        data[i] = i % n_mats;
                    }
                    std::shuffle(data.begin(), data.end(), rng);
                    ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
                }
                std::shuffle(data.begin(), data.end(), std::default_random_engine(std::random_device()()));
                ggml_backend_tensor_set(t, data.data(), 0, n_mats * sizeof(int));
            } else {
                init_tensor_uniform(t);
            }
@ -1109,22 +1222,227 @@ struct test_sum_rows : public test_case {
    }
 };
-enum test_mode {
+// GGML_OP_UPSCALE
-    MODE_TEST,
+struct test_upscale : public test_case {
-    MODE_PERF,
+    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const int32_t scale_factor;
    std::string vars() override {
        return VARS_TO_STR3(type, ne, scale_factor);
    }
    test_upscale(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {512, 512, 3, 1},
            int32_t scale_factor = 2)
        : type(type), ne(ne), scale_factor(scale_factor) {}
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
        return out;
    }
 };
 // GGML_OP_GROUP_NORM
 struct test_group_norm : public test_case {
    // Element type used for the input tensor.
    const ggml_type type;
    // Extents of the 4-D input tensor.
    const std::array<int64_t, 4> ne;
    // Group count forwarded to ggml_group_norm.
    const int32_t num_groups;
    // Formats this case's parameters through VARS_TO_STR3.
    std::string vars() override {
        return VARS_TO_STR3(type, ne, num_groups);
    }
    test_group_norm(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {64, 64, 320, 1},
            int32_t num_groups = 32)
        : type(type), ne(ne), num_groups(num_groups) {}
    // Graph: a single new 4-D tensor passed through ggml_group_norm.
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * input = ggml_new_tensor(ctx, type, 4, ne.data());
        return ggml_group_norm(ctx, input, num_groups);
    }
 };
 // GGML_OP_ACC
 struct test_acc : public test_case {
    // Element type shared by both operand tensors.
    const ggml_type type;
    // Extents of the first tensor handed to ggml_acc.
    const std::array<int64_t, 4> ne_a;
    // Extents of the second tensor handed to ggml_acc.
    const std::array<int64_t, 4> ne_b;
    // Formats this case's parameters through VARS_TO_STR3.
    std::string vars() override {
        return VARS_TO_STR3(type, ne_a, ne_b);
    }
    test_acc(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne_a = {1024, 577, 1, 1},
            std::array<int64_t, 4> ne_b = {1024, 576, 1, 1})
        : type(type), ne_a(ne_a), ne_b(ne_b) {}
    // Graph: two new 4-D tensors combined by ggml_acc. The three stride
    // arguments come from the first tensor; the final argument is the
    // second tensor's nb[1].
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * dst = ggml_new_tensor(ctx, type, 4, ne_a.data());
        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne_b.data());
        return ggml_acc(ctx, dst, src, dst->nb[1], dst->nb[2], dst->nb[3], src->nb[1]);
    }
 };
 // GGML_OP_PAD
 struct test_pad : public test_case {
    // Element type used for the input tensor.
    const ggml_type type;
    // Extents of the 4-D input tensor.
    const std::array<int64_t, 4> ne_a;
    // First padding amount forwarded to ggml_pad.
    const int pad_0;
    // Second padding amount forwarded to ggml_pad.
    const int pad_1;
    // Formats this case's parameters through VARS_TO_STR4.
    std::string vars() override {
        return VARS_TO_STR4(type, ne_a, pad_0, pad_1);
    }
    test_pad(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne_a = {512, 512, 1, 1},
            int pad_0 = 1, int pad_1 = 1)
        : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1)  {}
    // Graph: a single new 4-D tensor passed through ggml_pad; the two
    // trailing padding arguments are fixed at 0.
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * input = ggml_new_tensor(ctx, type, 4, ne_a.data());
        return ggml_pad(ctx, input, pad_0, pad_1, 0, 0);
    }
 };
 // GGML_OP_LEAKY_RELU
 struct test_leaky_relu : public test_case {
    // Element type used for the input tensor.
    const ggml_type type;
    // Extents of the 4-D input tensor.
    const std::array<int64_t, 4> ne_a;
    // Slope value forwarded to ggml_leaky_relu.
    const float negative_slope;
    // Formats this case's parameters through VARS_TO_STR3.
    std::string vars() override {
        return VARS_TO_STR3(type, ne_a, negative_slope);
    }
    test_leaky_relu(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
            float negative_slope = 0.1f)
        : type(type), ne_a(ne_a), negative_slope(negative_slope)  {}
    // Graph: a single new 4-D tensor passed through ggml_leaky_relu.
    // NOTE(review): the trailing `true` is presumably the in-place flag;
    // confirm against the ggml_leaky_relu declaration.
    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * input = ggml_new_tensor(ctx, type, 4, ne_a.data());
        return ggml_leaky_relu(ctx, input, negative_slope, true);
    }
 };
 // Mixtral MOE
 struct test_moe : public test_case {
    // Number of expert matrix sets that are allocated.
    const int n_experts;
    // Number of selected experts evaluated per token (top-k size).
    const int n_experts_per_tok;
    // Second dimension of the input activation tensor.
    const int n_tokens;
    // First dimension of the input activation tensor.
    const int n_embd;
    // Second dimension of the up/gate matrices, first dimension of the down matrices.
    const int n_ff;
    // Always reports the fixed label "MOE", whatever tensor is passed in.
    std::string op_desc(ggml_tensor * t) override {
        GGML_UNUSED(t);
        return "MOE";
    }
    // Formats this case's parameters through VARS_TO_STR5.
    std::string vars() override {
        return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff);
    }
    test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336)
        : n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) {
    }
    // Builds the routing + expert feed-forward graph and returns the summed
    // expert output. Tensors are created in the same order as before so that
    // context iteration order is unchanged.
    ggml_tensor * build_graph(ggml_context * ctx) override {
        // routing matrix first, then one up/gate/down triple per expert
        ggml_tensor * router = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_experts);
        std::vector<ggml_tensor *> up_mats(n_experts);
        std::vector<ggml_tensor *> gate_mats(n_experts);
        std::vector<ggml_tensor *> down_mats(n_experts);
        for (int e = 0; e < n_experts; ++e) {
            up_mats[e]   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
            gate_mats[e] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
            down_mats[e] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
        }
        ggml_tensor * input  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
        ggml_tensor * logits = ggml_mul_mat(ctx, router, input);
        ggml_tensor * probs  = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd));
        // pick the top-k experts, then gather and normalise their probabilities
        ggml_tensor * picked   = ggml_top_k(ctx, probs, n_experts_per_tok);
        ggml_tensor * probs_3d = ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens);
        ggml_tensor * weights  = ggml_get_rows(ctx, probs_3d, picked);
        weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens);
        ggml_tensor * weights_total = ggml_sum_rows(ctx, weights);
        weights = ggml_div(ctx, weights, weights_total);
        // evaluate each selected slot and accumulate the weighted results
        ggml_tensor * moe_out = nullptr;
        for (int slot = 0; slot < n_experts_per_tok; ++slot) {
            ggml_tensor * up   = ggml_mul_mat_id(ctx, up_mats.data(), n_experts, picked, slot, input);
            ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_mats.data(), n_experts, picked, slot, input);
            gate = ggml_silu(ctx, gate);
            ggml_tensor * expert_out = ggml_mul(ctx, up, gate);
            expert_out = ggml_mul_mat_id(ctx, down_mats.data(), n_experts, picked, slot, expert_out);
            // column `slot` of the normalised weights, viewed as (1, n_tokens)
            ggml_tensor * slot_weight = ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], slot*weights->nb[0]);
            expert_out = ggml_mul(ctx, expert_out, slot_weight);
            moe_out = (slot == 0) ? expert_out : ggml_add(ctx, moe_out, expert_out);
        }
        return moe_out;
    }
 };
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
    std::vector<std::unique_ptr<test_case>> test_cases;
    const ggml_type all_types[] = {
        GGML_TYPE_F32, GGML_TYPE_F16,
        GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
        GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
        GGML_TYPE_Q8_0,
        GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
        GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
        GGML_TYPE_Q6_K
    };
    // unary ops
    for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) {
        test_cases.emplace_back(new test_unary((ggml_unary_op) op));
    }
-    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+    test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
-        test_cases.emplace_back(new test_get_rows(type, 10, 5, 3));
+    for (ggml_type type : all_types) {
-        test_cases.emplace_back(new test_get_rows(type, 16, 5, 3));
+        for (int b : {1, 7}) {
            for (bool v : {false, true}) {
                test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v));
            }
        }
    }
    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1}));
@ -1134,7 +1452,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2}));
    test_cases.emplace_back(new test_dup());
-    test_cases.emplace_back(new test_cpy());
+
    for (ggml_type type : all_types) {
       test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, type, {256, 10, 10, 1}));
    }
    test_cases.emplace_back(new test_cont());
    auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
@ -1144,6 +1466,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    };
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1});
@ -1170,8 +1493,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1});
    add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
+    //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1});
-    add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});
+    //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1});
    test_cases.emplace_back(new test_scale());
@ -1180,16 +1503,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
    }
    const ggml_type all_types[] = {
        GGML_TYPE_F32, GGML_TYPE_F16,
        GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
        GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
        GGML_TYPE_Q8_0,
        GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
        GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
        GGML_TYPE_Q6_K
    };
    for (ggml_type type_a : all_types) {
        for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
            // FIXME: CPU crashes on f16xf16
@ -1213,9 +1526,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    for (ggml_type type_a : all_types) {
        for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
-            for (int n_mats : {1, 2, 4}) {
+            for (int n_mats : {2, 4, 8}) {
                for (int id = 0; id < n_mats; id++) {
-                    test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, {1, 1}, {1, 1}));
+                    for (bool v : {false, true}) {
                        test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, v));
                    }
                }
            }
        }
@ -1240,6 +1555,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        test_cases.emplace_back(new test_rope(type, { 64,   8, 10, 1},  64, 2, 512)); // neox (falcon 40B)
        test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1},  64, 2, 512)); // neox (falcon 40B)
        test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  20, 2, 512)); // neox (stablelm)
        test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  32, 2, 512)); // neox (phi-2)
    }
    test_cases.emplace_back(new test_alibi());
@ -1247,10 +1563,22 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_concat());
    for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) {
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
    }
    test_cases.emplace_back(new test_sum_rows());
    test_cases.emplace_back(new test_upscale());
    test_cases.emplace_back(new test_group_norm());
    test_cases.emplace_back(new test_acc());
    test_cases.emplace_back(new test_pad());
    test_cases.emplace_back(new test_leaky_relu());
 #if !defined(__SANITIZE_THREAD__)
    // FIXME: these tests use too much memory with thread sanitizer
    test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
    //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
 #endif
    // run tests
    if (mode == MODE_TEST) {
@ -1267,14 +1595,17 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        ggml_backend_free(backend_cpu);
        return n_ok == test_cases.size();
-    } else if (mode == MODE_PERF) {
+    }
    if (mode == MODE_PERF) {
        for (auto & test : test_cases) {
            test->eval_perf(backend, op_name);
        }
        return true;
    } else {
        GGML_ASSERT(false);
    }
    GGML_ASSERT(false);
    return false;
 }
 static void usage(char ** argv) {
@ -1347,11 +1678,12 @@ int main(int argc, char ** argv) {
    }
    printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count());
    if (n_ok != ggml_backend_reg_get_count()) {
        printf("\033[1;31mFAIL\033[0m\n");
        return 1;
    } else {
        printf("\033[1;32mOK\033[0m\n");
        return 0;
    }
    printf("\033[1;32mOK\033[0m\n");
    return 0;
 }
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@ -1,4 +1,4 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #include "ggml.h"
 #include <cmath>
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@ -117,7 +117,7 @@ static void usage(char * argv[]) {
    printf("  --size SIZE           set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
    printf("  -3                    use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
    printf("  -4                    use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
-    printf("  --op OP               set test opration as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
+    printf("  --op OP               set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
    printf("                        quantize_row_q_dot, vec_dot_q (all)\n");
    printf("  --type TYPE           set test type as");
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
@ -202,7 +202,7 @@ int main(int argc, char * argv[]) {
            }
            int alignment = std::stoi(argv[i]);
            if (alignment < 0 || alignment > MAX_ALIGNMENT) {
-            fprintf(stderr, "error: aligment-offset must be less than %d\n", MAX_ALIGNMENT);
+            fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT);
                invalid_param = true;
                break;
            }
@ -286,7 +286,7 @@ int main(int argc, char * argv[]) {
                        qfns.from_float_reference(test_data1, test_q1, size);
                        return test_q1[0];
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
@ -300,7 +300,7 @@ int main(int argc, char * argv[]) {
                        qfns.from_float(test_data1, test_q1, size);
                        return test_q1[0];
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
@ -315,7 +315,7 @@ int main(int argc, char * argv[]) {
                        qfns.to_float(test_q1, test_out, size);
                        return test_out[0];
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
@ -330,7 +330,7 @@ int main(int argc, char * argv[]) {
                        vdot.from_float(test_data1, test_q1, size);
                        return test_q1[0];
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
@ -347,7 +347,7 @@ int main(int argc, char * argv[]) {
                        qfns.vec_dot(size, &result, test_q1, test_q2);
                        return result;
                    };
-                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");