Merge branch 'master' into stablelm-support

commit 8917767f56

71 changed files with 7311 additions and 6369 deletions
.github/ISSUE_TEMPLATE/bug.md (vendored, 2 changes)

--- a/.github/ISSUE_TEMPLATE/bug.md
+++ b/.github/ISSUE_TEMPLATE/bug.md
@@ -1,7 +1,7 @@
 ---
 name: Bug template
 about: Used to report bugs in llama.cpp
-labels: ["bug"]
+labels: ["bug-unconfirmed"]
 assignees: ''
 
 ---
.gitignore (vendored, 3 changes)

--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@
 .DS_Store
 .build/
 .cache/
+.ccls-cache/
 .direnv/
 .envrc
 .swiftpm
@@ -64,7 +65,7 @@ models-mnt
 /parallel
 /train-text-from-scratch
 /vdot
-build-info.h
+/common/build-info.cpp
 arm_neon.h
 compile_commands.json
 CMakeSettings.json
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,7 +44,7 @@ endif()
 
 # general
 option(LLAMA_STATIC "llama: static link libraries" OFF)
-option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
+option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
 option(LLAMA_LTO "llama: enable link time optimization" OFF)
 
 # debug
@@ -82,6 +82,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use CUDA" OFF)
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
+option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -93,46 +94,12 @@ option(LLAMA_CLBLAST "llama: use CLBlast"
 option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
-option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ON)
 
-#
-# Build info header
-#
-
-# Generate initial build-info.h
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
-
-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
-    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
-
-    # Is git submodule
-    if(NOT IS_DIRECTORY "${GIT_DIR}")
-        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
-        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
-        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
-    endif()
-
-    # Add a custom target for build-info.h
-    add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
-
-    # Add a custom command to rebuild build-info.h when .git/index changes
-    add_custom_command(
-        OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
-        COMMENT "Generating build details from Git"
-        COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        DEPENDS "${GIT_DIR}/index"
-        VERBATIM
-    )
-else()
-    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
-endif()
-
 #
 # Compile flags
 #
@@ -277,14 +244,9 @@ if (LLAMA_BLAS)
     endif()
 endif()
 
-if (LLAMA_K_QUANTS)
-    set(GGML_HEADERS_EXTRA k_quants.h)
-    set(GGML_SOURCES_EXTRA k_quants.c)
-    add_compile_definitions(GGML_USE_K_QUANTS)
 if (LLAMA_QKK_64)
     add_compile_definitions(GGML_QKK_64)
 endif()
-endif()
 
 if (LLAMA_CUBLAS)
     cmake_minimum_required(VERSION 3.17)
@@ -305,6 +267,9 @@ if (LLAMA_CUBLAS)
     if (LLAMA_CUDA_FORCE_DMMV)
         add_compile_definitions(GGML_CUDA_FORCE_DMMV)
     endif()
+    if (LLAMA_CUDA_FORCE_MMQ)
+        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
+    endif()
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -331,6 +296,7 @@ if (LLAMA_CUBLAS)
         set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
     else()
         set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+        #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
     endif()
 endif()
 message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -404,6 +370,9 @@ if (LLAMA_HIPBLAS)
     if (LLAMA_CUDA_FORCE_DMMV)
         target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
     endif()
+    if (LLAMA_CUDA_FORCE_MMQ)
+        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
+    endif()
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@@ -665,6 +634,8 @@ add_library(ggml OBJECT
             ggml-alloc.h
             ggml-backend.c
             ggml-backend.h
+            ggml-quants.c
+            ggml-quants.h
             ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
Makefile (88 changes)

--- a/Makefile
+++ b/Makefile
@@ -342,13 +342,9 @@ else
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
 
-ifndef LLAMA_NO_K_QUANTS
-	MK_CPPFLAGS += -DGGML_USE_K_QUANTS
-	OBJS += k_quants.o
 ifdef LLAMA_QKK_64
 	MK_CPPFLAGS += -DGGML_QKK_64
 endif
-endif
 
 ifndef LLAMA_NO_ACCELERATE
 	# Mac OS - include Accelerate framework.
@@ -397,6 +393,9 @@ endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
 	NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_FORCE_MMQ
+	NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
@@ -494,11 +493,6 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
 
-ifndef LLAMA_NO_K_QUANTS
-k_quants.o: k_quants.c k_quants.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_NO_K_QUANTS
-
 # combine build flags with cmdline overrides
 override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
 override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
@@ -539,13 +533,16 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o
+ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
 
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
-COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
 
 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -566,46 +563,46 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 
 #
 # Examples
 #
 
-main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+infill: examples/infill/infill.cpp ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
@@ -617,7 +614,7 @@ train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratc
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
@@ -626,19 +623,19 @@ llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip
 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 ifdef LLAMA_METAL
@@ -651,7 +648,7 @@ swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
 endif
 
-build-info.h: $(wildcard .git/index) scripts/build-info.sh
+common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh $(CC) > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
 		mv $@.tmp $@; \
@@ -659,13 +656,16 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 		rm $@.tmp; \
 	fi
 
+build-info.o: common/build-info.cpp
+	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
+
 #
 # Tests
 #
 
 tests: $(TEST_TARGETS)
 
-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 run-benchmark-matmult: benchmark-matmult
@@ -679,40 +679,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 tests/test-c.o: tests/test-c.c llama.h
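The common/build-info.cpp rule above regenerates the file from scripts/build-info.sh whenever .git/index changes, and build-info.o is what the example binaries now link against. For illustration only, a generated file might look like the following; the values are invented, and the field layout follows the build-info.cpp.in template added by this commit:

    // Hypothetical output of scripts/build-info.sh (values invented for
    // illustration; the shape matches common/build-info.cpp.in below).
    int LLAMA_BUILD_NUMBER = 0;
    char const *LLAMA_COMMIT = "8917767f56";
    char const *LLAMA_COMPILER = "cc (GCC) 13.2.0";
    char const *LLAMA_BUILD_TARGET = "x86_64-pc-linux-gnu";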
--- a/Package.swift
+++ b/Package.swift
@@ -42,13 +42,12 @@ let package = Package(
             "llama.cpp",
             "ggml-alloc.c",
             "ggml-backend.c",
-            "k_quants.c",
+            "ggml-quants.c",
         ] + additionalSources,
         resources: resources,
         publicHeadersPath: "spm-headers",
         cSettings: [
             .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-            .define("GGML_USE_K_QUANTS"),
             .define("GGML_USE_ACCELERATE")
             // NOTE: NEW_LAPACK will required iOS version 16.4+
             // We should consider add this in the future when we drop support for iOS 14
--- a/README.md
+++ b/README.md
@@ -2,7 +2,6 @@
 
 
 
-[](https://github.com/ggerganov/llama.cpp/actions)
 [](https://opensource.org/licenses/MIT)
 
 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -11,8 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
-- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436
-- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
+- ⚠️ **Upcoming change that might break functionality. Help with testing is needed:** https://github.com/ggerganov/llama.cpp/pull/3912
 
 ----
build.zig (45 changes)

--- a/build.zig
+++ b/build.zig
@@ -10,7 +10,6 @@ const Maker = struct {
     builder: *std.build.Builder,
     target: CrossTarget,
     optimize: Mode,
-    config_header: *ConfigHeader,
     enable_lto: bool,
 
     include_dirs: ArrayList([]const u8),
@@ -41,26 +40,24 @@ const Maker = struct {
         const commit_hash = try std.ChildProcess.exec(
             .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
         );
-        const config_header = builder.addConfigHeader(
-            .{ .style = .blank, .include_path = "build-info.h" },
-            .{
-                .BUILD_NUMBER = 0,
-                .BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
-                .BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
-                .BUILD_TARGET = try target.allocDescription(builder.allocator),
-            },
-        );
+        try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
+            \\int LLAMA_BUILD_NUMBER = {};
+            \\char const *LLAMA_COMMIT = "{s}";
+            \\char const *LLAMA_COMPILER = "Zig {s}";
+            \\char const *LLAMA_BUILD_TARGET = "{s}";
+            \\
+        , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
         var m = Maker{
             .builder = builder,
             .target = target,
             .optimize = builder.standardOptimizeOption(.{}),
-            .config_header = config_header,
             .enable_lto = false,
             .include_dirs = ArrayList([]const u8).init(builder.allocator),
             .cflags = ArrayList([]const u8).init(builder.allocator),
             .cxxflags = ArrayList([]const u8).init(builder.allocator),
             .objs = ArrayList(*Compile).init(builder.allocator),
         };
 
         try m.addCFlag("-std=c11");
         try m.addCxxFlag("-std=c++11");
         try m.addProjectInclude(&.{});
@@ -72,7 +69,7 @@ const Maker = struct {
         const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
         if (o.target.getAbi() != .msvc)
             o.defineCMacro("_GNU_SOURCE", null);
-        o.addConfigHeader(m.config_header);
+
         if (std.mem.endsWith(u8, src, ".c")) {
             o.addCSourceFiles(&.{src}, m.cflags.items);
             o.linkLibC();
@@ -85,7 +82,6 @@ const Maker = struct {
                 o.linkLibCpp();
             }
         }
-        o.addConfigHeader(m.config_header);
         for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
         o.want_lto = m.enable_lto;
         return o;
@@ -105,7 +101,6 @@ const Maker = struct {
             // linkLibCpp already add (libc++ + libunwind + libc)
             e.linkLibCpp();
         }
-        e.addConfigHeader(m.config_header);
         m.builder.installArtifact(e);
         e.want_lto = m.enable_lto;
         return e;
@@ -116,16 +111,12 @@ pub fn build(b: *std.build.Builder) !void {
     var make = try Maker.init(b);
    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
 
-    if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
-        try make.addFlag("-DGGML_USE_K_QUANTS");
-        const k_quants = make.obj("k_quants", "k_quants.c");
-        try make.objs.append(k_quants);
-    }
-
     const ggml = make.obj("ggml", "ggml.c");
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
     const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
+    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
     const llama = make.obj("llama", "llama.cpp");
+    const buildinfo = make.obj("common", "common/build-info.cpp");
     const common = make.obj("common", "common/common.cpp");
     const console = make.obj("console", "common/console.cpp");
     const sampling = make.obj("sampling", "common/sampling.cpp");
@@ -133,14 +124,14 @@ pub fn build(b: *std.build.Builder) !void {
     const train = make.obj("train", "common/train.cpp");
     const clip = make.obj("clip", "examples/llava/clip.cpp");
 
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,8 +1,46 @@
 # common
 
+# Build info header
+#
+
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+
+    # Is git submodule
+    if(NOT IS_DIRECTORY "${GIT_DIR}")
+        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
+        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
+        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+    endif()
+
+    set(GIT_INDEX "${GIT_DIR}/index")
+else()
+    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
+    set(GIT_INDEX "")
+endif()
+
+# Add a custom command to rebuild build-info.cpp when .git/index changes
+add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
+    COMMENT "Generating build details from Git"
+    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
+            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
+    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
+    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
+    VERBATIM
+)
+set(TARGET build_info)
+add_library(${TARGET} OBJECT build-info.cpp)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
 set(TARGET common)
 
-add_library(${TARGET} OBJECT
+add_library(${TARGET} STATIC
     common.h
     common.cpp
     sampling.h
@@ -21,4 +59,4 @@ endif()
 
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama)
+target_link_libraries(${TARGET} PRIVATE llama build_info)
common/build-info.cpp.in (new file, 4 additions)

--- /dev/null
+++ b/common/build-info.cpp.in
@@ -0,0 +1,4 @@
+int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
+char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
+char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
+char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
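This template replaces the macros from the removed build-info.h, turning build metadata into ordinary link-time symbols so that a change in Git state no longer forces a recompile of everything that included the header. A minimal consumer sketch follows; the extern declarations are an assumption for this example, since the diff only shows the definitions:

    // Sketch: reading the link-time build info symbols. The externs are
    // assumed here; in the tree they would live in a shared header.
    #include <cstdio>

    extern int LLAMA_BUILD_NUMBER;
    extern char const *LLAMA_COMMIT;

    void print_build_info() {
        // Same pattern as the dump_non_result_info_yaml() change below.
        std::printf("build_commit: %s\n", LLAMA_COMMIT);
        std::printf("build_number: %d\n", LLAMA_BUILD_NUMBER);
    }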
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,5 +1,4 @@
 #include "common.h"
-#include "build-info.h"
 #include "llama.h"
 
 #include <algorithm>
@@ -103,9 +102,24 @@ void process_escapes(std::string& input) {
 }
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    bool result = true;
+    try {
+        if (!gpt_params_parse_ex(argc, argv, params)) {
+            gpt_print_usage(argc, argv, gpt_params());
+            exit(0);
+        }
+    }
+    catch (const std::invalid_argument & ex) {
+        fprintf(stderr, "%s\n", ex.what());
+        gpt_print_usage(argc, argv, gpt_params());
+        exit(1);
+    }
+    return result;
+}
+
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
-    gpt_params default_params;
     const std::string arg_prefix = "--";
     llama_sampling_params & sparams = params.sparams;
 
@@ -204,12 +218,52 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.rope_freq_scale = std::stof(argv[i]);
+        } else if (arg == "--rope-scaling") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string value(argv[i]);
+            /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
+            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
+            else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
+            else { invalid_param = true; break; }
         } else if (arg == "--rope-scale") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.rope_freq_scale = 1.0f/std::stof(argv[i]);
+        } else if (arg == "--yarn-orig-ctx") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_orig_ctx = std::stoi(argv[i]);
+        } else if (arg == "--yarn-ext-factor") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_ext_factor = std::stof(argv[i]);
+        } else if (arg == "--yarn-attn-factor") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_attn_factor = std::stof(argv[i]);
+        } else if (arg == "--yarn-beta-fast") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_beta_fast = std::stof(argv[i]);
+        } else if (arg == "--yarn-beta-slow") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.yarn_beta_slow = std::stof(argv[i]);
         } else if (arg == "--memory-f32") {
             params.memory_f16 = false;
         } else if (arg == "--top-p") {
@@ -218,12 +272,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             sparams.top_p = std::stof(argv[i]);
+        } else if (arg == "--min-p") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.min_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             sparams.temp = std::stof(argv[i]);
+            sparams.temp = std::max(sparams.temp, 0.0f);
         } else if (arg == "--tfs") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -342,6 +403,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_sequences = std::stoi(argv[i]);
+        } else if (arg == "--p-accept" || arg == "-pa") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.p_accept = std::stof(argv[i]);
+        } else if (arg == "--p-split" || arg == "-ps") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.p_split = std::stof(argv[i]);
         } else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -547,11 +620,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
         } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, default_params);
-#ifndef LOG_DISABLE_LOGS
-            log_print_usage();
-#endif // LOG_DISABLE_LOGS
-            exit(0);
+            return false;
         } else if (arg == "--random-prompt") {
             params.random_prompt = true;
         } else if (arg == "--in-prefix-bos") {
@@ -610,22 +680,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            // End of Parse args for logging parameters
 #endif // LOG_DISABLE_LOGS
         } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, default_params);
-            exit(1);
+            throw std::invalid_argument("error: unknown argument: " + arg);
         }
     }
     if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, default_params);
-        exit(1);
+        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
     }
     if (params.prompt_cache_all &&
             (params.interactive || params.interactive_first ||
             params.instruct)) {
-        fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
-        gpt_print_usage(argc, argv, default_params);
-        exit(1);
+
+        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }
 
     if (params.escape) {
@@ -644,6 +709,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     const llama_sampling_params & sparams = params.sparams;
 
+    printf("\n");
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
     printf("options:\n");
@@ -678,6 +744,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
     printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
     printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
     printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
     printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
@@ -700,9 +767,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --cfg-negative-prompt-file FNAME\n");
     printf(" negative prompt file to use for guidance. (default: empty)\n");
     printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
-    printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
+    printf(" --rope-scaling {none,linear,yarn}\n");
+    printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
+    printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n");
     printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
-    printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
+    printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
+    printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n");
+    printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
+    printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
+    printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
+    printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
     printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     printf(" --no-penalize-nl do not penalize newline token\n");
     printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
@@ -716,6 +790,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
     printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
+    printf(" -pa N, --p-accept N speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
+    printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
     printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
     printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
     printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
@@ -743,7 +819,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
     printf(" --verbose-prompt print prompt before generation\n");
-    fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
+    printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
     printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
     printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
     printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
@@ -754,6 +830,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -ld LOGDIR, --logdir LOGDIR\n");
     printf(" path under which to save YAML logs (no logging if unset)\n");
     printf("\n");
+#ifndef LOG_DISABLE_LOGS
+    log_print_usage();
+#endif // LOG_DISABLE_LOGS
 }
 
 std::string get_system_info(const gpt_params & params) {
@@ -816,8 +895,14 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.f16_kv = params.memory_f16;
     cparams.logits_all = params.logits_all;
     cparams.embedding = params.embedding;
+    cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base = params.rope_freq_base;
     cparams.rope_freq_scale = params.rope_freq_scale;
+    cparams.yarn_ext_factor = params.yarn_ext_factor;
+    cparams.yarn_attn_factor = params.yarn_attn_factor;
+    cparams.yarn_beta_fast = params.yarn_beta_fast;
+    cparams.yarn_beta_slow = params.yarn_beta_slow;
+    cparams.yarn_orig_ctx = params.yarn_orig_ctx;
 
     return cparams;
 }
@@ -888,7 +973,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
-        llama_kv_cache_tokens_rm(lctx, -1, -1);
+        llama_kv_cache_clear(lctx);
         llama_reset_timings(lctx);
     }
 
@@ -1127,8 +1212,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
     const llama_sampling_params & sparams = params.sparams;
 
-    fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
-    fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
+    fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
+    fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
     fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
     fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
     fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
@@ -1274,6 +1359,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
     fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
@@ -9,6 +9,7 @@
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
 
+#include <cmath>
 #include <string>
 #include <vector>
 #include <random>
@@ -26,10 +27,16 @@
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
 
 #define print_build_info() do { \
-    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
-    fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
+    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
 } while(0)
 
+// build info
+extern int LLAMA_BUILD_NUMBER;
+extern char const *LLAMA_COMMIT;
+extern char const *LLAMA_COMPILER;
+extern char const *LLAMA_BUILD_TARGET;
+
 //
 // CLI argument parsing
 //
@@ -37,6 +44,7 @@ int32_t get_num_physical_cores();
 
 struct gpt_params {
     uint32_t seed = -1; // RNG seed
+
     int32_t n_threads = get_num_physical_cores();
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_predict = -1; // new tokens to predict
@@ -47,6 +55,8 @@ struct gpt_params {
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
+    float p_accept = 0.5f; // speculative decoding accept probability
+    float p_split = 0.1f; // speculative decoding split probability
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
@@ -54,6 +64,13 @@ struct gpt_params {
     int32_t n_beams = 0; // if non-zero then use beam search of given width.
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+    float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
+    float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+    float yarn_beta_fast = 32.0f; // YaRN low correction dim
+    float yarn_beta_slow = 1.0f; // YaRN high correction dim
+    int32_t yarn_orig_ctx = 0; // YaRN original context length
+    int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
+                                                               // pinging @cebtenzzre
 
     // // sampling parameters
     struct llama_sampling_params sparams;
@@ -110,6 +127,8 @@ struct gpt_params {
     std::string image = ""; // path to an image file
 };
 
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
+
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
 
 void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
87 common/log.h

@@ -97,6 +97,15 @@
 #define LOG_TEE_TARGET stderr
 #endif
 
+// Utility for synchronizing log configuration state
+// since std::optional was introduced only in c++17
+enum LogTriState
+{
+    LogTriStateSame,
+    LogTriStateFalse,
+    LogTriStateTrue
+};
+
 // Utility to obtain "pid" like unique process id and use it when creating log files.
 inline std::string log_get_pid()
 {
@@ -118,16 +127,26 @@ inline std::string log_get_pid()
 // invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
 // where the number is a runtime id of the current thread.
 
-#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
+#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
 
 // INTERNAL, DO NOT USE
-inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
+inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
 {
+    static bool _multilog = false;
+
+    if (multilog != LogTriStateSame)
+    {
+        _multilog = multilog == LogTriStateTrue;
+    }
+
     std::stringstream buf;
 
     buf << log_file_basename;
+    if (_multilog)
+    {
     buf << ".";
     buf << log_get_pid();
+    }
     buf << ".";
     buf << log_file_extension;
 
@@ -212,15 +231,6 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
 #define LOG_TEE_FLF_VAL ,""
 #endif
 
-// Utility for synchronizing log configuration state
-// since std::optional was introduced only in c++17
-enum LogTriState
-{
-    LogTriStateSame,
-    LogTriStateFalse,
-    LogTriStateTrue
-};
-
 // INTERNAL, DO NOT USE
 // USE LOG() INSTEAD
 //
@@ -314,16 +324,23 @@ enum LogTriState
 #endif
 
 // INTERNAL, DO NOT USE
-inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
+inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
 {
-    static bool _initialized{false};
-    static bool _disabled{(filename.empty() && target == nullptr)};
+    static bool _initialized = false;
+    static bool _append = false;
+    static bool _disabled = filename.empty() && target == nullptr;
     static std::string log_current_filename{filename};
     static FILE *log_current_target{target};
     static FILE *logfile = nullptr;
 
     if (change)
     {
+        if (append != LogTriStateSame)
+        {
+            _append = append == LogTriStateTrue;
+            return logfile;
+        }
+
         if (disable == LogTriStateTrue)
         {
             // Disable primary target
@@ -376,7 +393,7 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
             }
         }
 
-        logfile = fopen(filename.c_str(), "w");
+        logfile = fopen(filename.c_str(), _append ? "a" : "w");
     }
 
     if (!logfile)
@@ -397,9 +414,9 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
 }
 
 // INTERNAL, DO NOT USE
-inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
+inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
 {
-    return log_handler1_impl(change, disable, filename, target);
+    return log_handler1_impl(change, append, disable, filename, target);
 }
 
 // Disables logs entirely at runtime.
@@ -410,7 +427,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTri
 // INTERNAL, DO NOT USE
 inline FILE *log_disable_impl()
 {
-    return log_handler1_impl(true, LogTriStateTrue);
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
 }
 
 // Enables logs at runtime.
@@ -419,19 +436,31 @@ inline FILE *log_disable_impl()
 // INTERNAL, DO NOT USE
 inline FILE *log_enable_impl()
 {
-    return log_handler1_impl(true, LogTriStateFalse);
+    return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
 }
 
 // Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
 #define log_set_target(target) log_set_target_impl(target)
 
 // INTERNAL, DO NOT USE
-inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
-inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }
+inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
+inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
 
 // INTERNAL, DO NOT USE
 inline FILE *log_handler() { return log_handler1_impl(); }
 
+// Enable or disable creating separate log files for each run.
+// can ONLY be invoked BEFORE first log use.
+#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
+// Enable or disable append mode for log file.
+// can ONLY be invoked BEFORE first log use.
+#define log_append(enable) log_append_impl(enable)
+// INTERNAL, DO NOT USE
+inline FILE *log_append_impl(bool enable)
+{
+    return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
+}
+
 inline void log_test()
 {
     log_disable();
@@ -493,6 +522,18 @@ inline bool log_param_single_parse(const std::string & param)
         return true;
     }
 
+    if (param == "--log-new")
+    {
+        log_multilog(true);
+        return true;
+    }
+
+    if (param == "--log-append")
+    {
+        log_append(true);
+        return true;
+    }
+
     return false;
 }
 
@@ -522,7 +563,9 @@ inline void log_print_usage()
     printf(" --log-disable Disable trace logs\n");
     printf(" --log-enable Enable trace logs\n");
     printf(" --log-file Specify a log filename (without extension)\n");
-    printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
+    printf(" --log-new Create a separate new log file on start. "
+           "Each log file will have unique name: \"<name>.<ID>.log\"\n");
+    printf(" --log-append Don't truncate the old log file.\n");
 }
 
 #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
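The new log_multilog / log_append switches above are latched inside log_handler1_impl's statics, so per the "can ONLY be invoked BEFORE first log use" comments they must run before the first LOG call. A minimal usage sketch of the intended call order; the basename "llama" and this exact sequence are illustrative assumptions, not part of this diff:

    #include "log.h"

    int main() {
        log_multilog(true);   // opt in to one "<name>.<pid>.log" file per run
        log_append(true);     // reopen the log target with "a" instead of "w"
        log_set_target(log_filename_generator("llama", "log"));
        LOG("logging configured\n"); // the first log use locks in the settings above
        return 0;
    }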
@@ -39,6 +39,7 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
 void llama_sampling_reset(llama_sampling_context * ctx) {
     if (ctx->grammar != NULL) {
         llama_grammar_free(ctx->grammar);
+        ctx->grammar = NULL;
     }
 
     if (!ctx->parsed_grammar.rules.empty()) {
@@ -89,10 +90,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
 
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
+            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
             "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
             params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-            params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
+            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
             params.mirostat, params.mirostat_eta, params.mirostat_tau);
 
     return std::string(result);
@@ -110,6 +111,7 @@ llama_token llama_sampling_sample(
     const float temp = params.temp;
     const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
     const float top_p = params.top_p;
+    const float min_p = params.min_p;
     const float tfs_z = params.tfs_z;
     const float typical_p = params.typical_p;
     const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
@@ -167,8 +169,12 @@ llama_token llama_sampling_sample(
         llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
     }
 
-    if (temp <= 0) {
-        // greedy sampling
+    if (temp < 0.0) {
+        // greedy sampling, with probs
+        llama_sample_softmax(ctx_main, &cur_p);
+        id = cur_p.data[0].id;
+    } else if (temp == 0.0) {
+        // greedy sampling, no probs
         id = llama_sample_token_greedy(ctx_main, &cur_p);
     } else {
         if (mirostat == 1) {
@@ -186,6 +192,7 @@ llama_token llama_sampling_sample(
             llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
             llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
             llama_sample_top_p   (ctx_main, &cur_p, top_p,     min_keep);
+            llama_sample_min_p   (ctx_main, &cur_p, min_p,     min_keep);
             llama_sample_temp    (ctx_main, &cur_p, temp);
 
             id = llama_sample_token(ctx_main, &cur_p);
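The temperature dispatch above splits the old single `temp <= 0` greedy branch in two: a negative temperature now also runs a softmax first, so callers still get per-token probabilities before the arg-max is taken, while exactly zero keeps the cheaper probability-free greedy pick. A self-contained sketch of the same rule, using stand-in names rather than llama.cpp's internal API:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Returns the index of the greedily sampled token for the two regimes above.
    // probs_out is only filled when temp < 0, mirroring "greedy sampling, with probs".
    int greedy_dispatch(const std::vector<float> & logits, float temp, std::vector<float> * probs_out) {
        const int best = (int)(std::max_element(logits.begin(), logits.end()) - logits.begin());
        if (temp < 0.0f && probs_out) {
            // softmax over the logits so the caller can report per-token probabilities
            const float mx = logits[best];
            double sum = 0.0;
            probs_out->resize(logits.size());
            for (size_t i = 0; i < logits.size(); ++i) {
                (*probs_out)[i] = std::exp(logits[i] - mx);
                sum += (*probs_out)[i];
            }
            for (float & p : *probs_out) { p = (float)(p / sum); }
        }
        return best; // arg-max either way; only the probability bookkeeping differs
    }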
@@ -14,6 +14,7 @@ typedef struct llama_sampling_params {
     int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t top_k = 40; // <= 0 to use vocab size
     float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
     float tfs_z = 1.00f; // 1.0 = disabled
     float typical_p = 1.00f; // 1.0 = disabled
     float temp = 0.80f; // 1.0 = disabled
@@ -1045,6 +1045,7 @@ struct train_params_common get_default_train_params_common() {
     params.n_batch = 8;
     params.n_gradient_accumulation = 1;
     params.n_epochs = -1;
+    params.n_gpu_layers = 0;
 
     params.custom_n_ctx = false;
 
@@ -1080,6 +1081,7 @@ struct train_params_common get_default_train_params_common() {
     params.adam_beta2 = 0.999f;
     params.adam_gclip = 1.0f;
     params.adam_eps_f = 0.0f;
 
     return params;
 }
@@ -44,6 +44,7 @@ struct train_params_common {
     int n_batch;
     int n_gradient_accumulation;
     int n_epochs;
+    int n_gpu_layers;
 
     bool custom_n_ctx;
 
@@ -163,7 +163,8 @@ gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
 if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
     if "type" in hparams["rope_scaling"]:
         if hparams["rope_scaling"]["type"] == "linear":
-            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+            gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
 
 
 # TOKENIZATION
110 convert.py

@@ -151,8 +151,11 @@ class Params:
     n_head_kv:  int
     f_norm_eps: float
 
+    rope_scaling_type: gguf.RopeScalingType | None = None
     f_rope_freq_base: float | None = None
     f_rope_scale: float | None = None
+    n_orig_ctx: int | None = None
+    rope_finetuned: bool | None = None
 
     ftype: GGMLFileType | None = None
 
@@ -198,20 +201,20 @@ class Params:
     def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
         config = json.load(open(config_path))
 
-        n_vocab          = config["vocab_size"]
-        n_embd           = config["hidden_size"]
-        n_layer          = config["num_hidden_layers"]
-        n_ff             = config["intermediate_size"]
-        n_head           = config["num_attention_heads"]
-        n_head_kv        = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
-        f_norm_eps       = config["rms_norm_eps"]
-        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
+        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
 
         rope_scaling = config.get("rope_scaling")
-        if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
-            f_rope_scale = config["rope_scaling"].get("factor")
+        if rope_scaling is not None and (typ := rope_scaling.get("type")):
+            rope_factor = rope_scaling.get("factor")
+            f_rope_scale = rope_factor
+            if typ == "linear":
+                rope_scaling_type = gguf.RopeScalingType.LINEAR
+            elif typ == "yarn":
+                rope_scaling_type = gguf.RopeScalingType.YARN
+                n_orig_ctx = rope_scaling['original_max_position_embeddings']
+                rope_finetuned = rope_scaling['finetuned']
         else:
-            f_rope_scale = None
+            raise NotImplementedError(f'Unknown rope scaling type: {typ}')
 
         if "max_sequence_length" in config:
             n_ctx = config["max_sequence_length"]
@@ -222,16 +225,19 @@ class Params:
                 "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
 
         return Params(
-            n_vocab          = n_vocab,
-            n_embd           = n_embd,
-            n_layer          = n_layer,
+            n_vocab          = config["vocab_size"],
+            n_embd           = config["hidden_size"],
+            n_layer          = config["num_hidden_layers"],
             n_ctx            = n_ctx,
-            n_ff             = n_ff,
-            n_head           = n_head,
-            n_head_kv        = n_head_kv,
-            f_norm_eps       = f_norm_eps,
-            f_rope_freq_base = f_rope_freq_base,
+            n_ff             = config["intermediate_size"],
+            n_head           = (n_head := config["num_attention_heads"]),
+            n_head_kv        = config.get("num_key_value_heads", n_head),
+            f_norm_eps       = config["rms_norm_eps"],
+            f_rope_freq_base = config.get("rope_theta"),
+            rope_scaling_type = rope_scaling_type,
             f_rope_scale     = f_rope_scale,
+            n_orig_ctx       = n_orig_ctx,
+            rope_finetuned   = rope_finetuned,
         )
 
     # LLaMA v2 70B params.json
@@ -240,17 +246,8 @@ class Params:
     def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
         config = json.load(open(config_path))
 
-        n_vocab          = config["vocab_size"] if "vocab_size" in config else -1
-        n_embd           = config["dim"]
-        n_layer          = config["n_layers"]
-        n_ff             = -1
-        n_head           = config["n_heads"]
-        n_head_kv        = config["n_kv_heads"] if "n_kv_heads" in config else n_head
-        f_norm_eps       = config["norm_eps"]
-        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
-
         # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if f_rope_freq_base == 1000000:
+        if config.get("rope_theta") == 1000000:
             # CodeLlama
             n_ctx = 16384
         elif config["norm_eps"] == 1e-05:
@@ -260,22 +257,16 @@ class Params:
             # LLaMA v1
             n_ctx = 2048
 
-        if n_vocab == -1:
-            n_vocab = model["tok_embeddings.weight"].shape[0]
-
-        if n_ff == -1:
-            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
-
         return Params(
-            n_vocab          = n_vocab,
-            n_embd           = n_embd,
-            n_layer          = n_layer,
+            n_vocab          = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
+            n_embd           = config["dim"],
+            n_layer          = config["n_layers"],
             n_ctx            = n_ctx,
-            n_ff             = n_ff,
-            n_head           = n_head,
-            n_head_kv        = n_head_kv,
-            f_norm_eps       = f_norm_eps,
-            f_rope_freq_base = f_rope_freq_base,
+            n_ff             = model["layers.0.feed_forward.w1.weight"].shape[0],
+            n_head           = (n_head := config["n_heads"]),
+            n_head_kv        = config.get("n_kv_heads", n_head),
+            f_norm_eps       = config["norm_eps"],
+            f_rope_freq_base = config.get("rope_theta"),
        )
 
     @staticmethod
@@ -366,15 +357,18 @@ class SentencePieceVocab:
             added_tokens = {}
 
         vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
 
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list = [text for (text, idx) in items]
-        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids   = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base = vocab_size
+        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
         self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
 
@@ -828,8 +822,16 @@ class OutputFile:
         if params.f_rope_freq_base is not None:
             self.gguf.add_rope_freq_base(params.f_rope_freq_base)
 
-        if params.f_rope_scale is not None:
-            self.gguf.add_rope_scale_linear(params.f_rope_scale)
+        if params.rope_scaling_type:
+            assert params.f_rope_scale is not None
+            self.gguf.add_rope_scaling_type(params.rope_scaling_type)
+            self.gguf.add_rope_scaling_factor(params.f_rope_scale)
+
+        if params.n_orig_ctx is not None:
+            self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
+
+        if params.rope_finetuned is not None:
+            self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
 
         if params.ftype is not None:
             self.gguf.add_file_type(params.ftype)
@@ -154,6 +154,10 @@ int main(int argc, char ** argv) {
         }
     }
 
+    LOG_TEE("\n");
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
+    LOG_TEE("\n");
+
     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
     LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
 
@@ -181,7 +185,7 @@ int main(int argc, char ** argv) {
 
     const auto t_pp_start = ggml_time_us();
 
-    llama_kv_cache_tokens_rm(ctx, -1, -1);
+    llama_kv_cache_clear(ctx);
 
     if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
         LOG_TEE("%s: llama_decode() failed\n", __func__);
@@ -11,7 +11,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
 
     if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN]\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
         return 1 ;
     }
 
@@ -21,6 +21,9 @@ int main(int argc, char ** argv) {
     // total length of the sequences including the prompt
     int n_len = 32;
 
+    // number of layers to offload to the GPU
+    int n_gpu_layers = 0;
+
     if (argc >= 2) {
         params.model = argv[1];
     }
@@ -37,6 +40,10 @@ int main(int argc, char ** argv) {
         n_len = std::atoi(argv[4]);
     }
 
+    if (argc >= 6) {
+        n_gpu_layers = std::atoi(argv[5]);
+    }
+
     if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
     }
@@ -49,7 +56,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = llama_model_default_params();
 
-    // model_params.n_gpu_layers = 99; // offload all layers to the GPU
+    model_params.n_gpu_layers = n_gpu_layers;
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@@ -1,9 +1,6 @@
 set(TARGET benchmark)
 add_executable(${TARGET} benchmark-matmult.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -1,4 +1,3 @@
-#include "build-info.h"
 #include "common.h"
 #include "ggml.h"
 
@@ -3,6 +3,3 @@ add_executable(${TARGET} embedding.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -1,4 +1,3 @@
-#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 
@@ -642,8 +642,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
         const int rope_mode = 0;
 
         return ggml_rope_custom(ctx,
-            t, KQ_pos, n_rot, rope_mode, n_ctx,
-            rope_freq_base, rope_freq_scale);
+            t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
+            rope_freq_base, rope_freq_scale, 0.0f, 0.0f, 0.0f, 0.0f
+        );
     };
 
     set_name(tokens_input, "tokens_input");
@@ -652,7 +653,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
 
     auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type)) {
+        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
             return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
         } else if (a->type == GGML_TYPE_F32) {
             return ggml_add(ctx, a, b);
@@ -1459,6 +1460,17 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
             }
             params->n_rank_w3 = std::stoi(argv[i]);
             params->custom_n_rank_w3 = true;
+        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+            params->common.n_gpu_layers = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             train_print_usage(argc, argv, &default_params);
@@ -1545,6 +1557,7 @@ int main(int argc, char ** argv) {
     srand(params.common.seed);
 
     struct llama_model_params llama_mparams = llama_model_default_params();
+    llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
     llama_mparams.vocab_only = false;
 
     printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
34 examples/finetune/finetune.sh (new file)

@@ -0,0 +1,34 @@
+#!/bin/bash
+cd `dirname $0`
+cd ../..
+
+EXE="./finetune"
+
+if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
+if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
+
+# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
+MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
+
+while getopts "dg" opt; do
+  case $opt in
+    d)
+      DEBUGGER="gdb --args"
+      ;;
+    g)
+      EXE="./build/bin/Release/finetune"
+      GPUARG="--gpu-layers 25"
+      ;;
+  esac
+done
+
+$DEBUGGER $EXE \
+        --model-base $MODEL \
+        $GPUARG \
+        --checkpoint-in chk-ol3b-shakespeare-LATEST.gguf \
+        --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
+        --lora-out lora-ol3b-shakespeare-ITERATION.bin \
+        --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
+        --save-every 10 \
+        --threads 10 --adam-iter 30 --batch 4 --ctx 64 \
+        --use-checkpointing
@@ -3,6 +3,3 @@ add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -2,7 +2,6 @@
 
 #include "console.h"
 #include "llama.h"
-#include "build-info.h"
 #include "grammar-parser.h"
 
 #include <cassert>
@@ -184,8 +183,8 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
 
-    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
+    LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
@@ -3,6 +3,3 @@ add_executable(${TARGET} llama-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -19,7 +19,6 @@
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
-#include "build-info.h"
 #include "ggml-cuda.h"
 
 // utils
@@ -641,8 +640,8 @@ struct test {
     }
 };
 
-const std::string test::build_commit = BUILD_COMMIT;
-const int test::build_number = BUILD_NUMBER;
+const std::string test::build_commit = LLAMA_COMMIT;
+const int test::build_number = LLAMA_BUILD_NUMBER;
 const bool test::cuda = !!ggml_cpu_has_cublas();
 const bool test::opencl = !!ggml_cpu_has_clblast();
 const bool test::metal = !!ggml_cpu_has_metal();
@@ -1037,7 +1036,7 @@ int main(int argc, char ** argv) {
 
         test t(inst, lmodel, ctx);
 
-        llama_kv_cache_tokens_rm(ctx, -1, -1);
+        llama_kv_cache_clear(ctx);
 
         // warmup run
         if (t.n_prompt > 0) {
@@ -1048,7 +1047,7 @@ int main(int argc, char ** argv) {
         }
 
         for (int i = 0; i < params.reps; i++) {
-            llama_kv_cache_tokens_rm(ctx, -1, -1);
+            llama_kv_cache_clear(ctx);
 
             uint64_t t_start = get_time_ns();
             if (t.n_prompt > 0) {
@@ -6,15 +6,9 @@ target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
     target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
 endif()
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
 
 set(TARGET llava)
 add_executable(${TARGET} llava.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -16,6 +16,8 @@ add_library(common OBJECT
     ${_common_path}/console.cpp
     ${_common_path}/grammar-parser.h
     ${_common_path}/grammar-parser.cpp
+    ${_common_path}/sampling.h
+    ${_common_path}/sampling.cpp
     )
 
 # WARNING: because build-info.h is auto-generated, it will only
@@ -3,6 +3,3 @@ add_executable(${TARGET} main.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -208,6 +208,14 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho
 
 Example usage: `--top-p 0.95`
 
+### Min P Sampling
+
+- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.05).
+
+The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.
+
+Example usage: `--min-p 0.05`
+
 ### Tail Free Sampling (TFS)
 
 - `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
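The README paragraph above describes a relative cutoff: a token survives only if its probability is at least min_p times the probability of the most likely token. A self-contained sketch of that rule; the function and variable names here are illustrative, not llama.cpp's internal API:

    #include <algorithm>
    #include <vector>

    // Zero out tokens whose probability falls below min_p times the top probability.
    // With min_p = 0.05 and a best probability of 0.9 the cutoff is 0.045,
    // matching the worked example in the README hunk above.
    std::vector<float> min_p_filter(std::vector<float> probs, float min_p) {
        const float p_max  = *std::max_element(probs.begin(), probs.end());
        const float cutoff = min_p * p_max;
        for (float & p : probs) {
            if (p < cutoff) p = 0.0f; // filtered out; a real sampler would renormalize
        }
        return probs;
    }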
@@ -2,7 +2,6 @@
 
 #include "console.h"
 #include "llama.h"
-#include "build-info.h"
 
 #include <cassert>
 #include <cinttypes>
@@ -153,8 +152,8 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
 
-    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
+    LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
@@ -298,7 +297,7 @@ int main(int argc, char ** argv) {
         }
 
         // remove any "future" tokens that we might have inherited from the previous session
-        llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
+        llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
     }
 
     LOGLN(
@@ -3,6 +3,3 @@ add_executable(${TARGET} parallel.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -1,8 +1,6 @@
 // A basic application simulating a server with multiple clients.
 // The clients submite requests to the server and they are processed in parallel.
 
-#include "build-info.h"
-
 #include "common.h"
 #include "llama.h"
 
@@ -3,6 +3,3 @@ add_executable(${TARGET} perplexity.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -1,4 +1,3 @@
-#include "build-info.h"
 #include "common.h"
 #include "llama.h"
 
@@ -210,7 +209,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     const auto t_start = std::chrono::high_resolution_clock::now();
 
     // clear the KV cache
-    llama_kv_cache_tokens_rm(ctx, -1, -1);
+    llama_kv_cache_clear(ctx);
 
     for (int j = 0; j < num_batches; ++j) {
         const int batch_start = start + j * n_batch;
@@ -339,7 +338,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     const auto t_start = std::chrono::high_resolution_clock::now();
 
     // clear the KV cache
-    llama_kv_cache_tokens_rm(ctx, -1, -1);
+    llama_kv_cache_clear(ctx);
 
     for (int j = 0; j < num_batches; ++j) {
         const int batch_start = start + j * n_batch;
@@ -573,7 +572,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         }
 
         // clear the KV cache
-        llama_kv_cache_tokens_rm(ctx, -1, -1);
+        llama_kv_cache_clear(ctx);
 
         auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
         if (logits.empty()) {
@@ -1,6 +1,6 @@
 set(TARGET quantize-stats)
 add_executable(${TARGET} quantize-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -1,5 +1,4 @@
 #define LLAMA_API_INTERNAL
-#include "build-info.h"
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
@@ -1,9 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
@@ -1,4 +1,3 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"

@@ -18,7 +17,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
#ifdef GGML_USE_K_QUANTS
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },

@@ -31,7 +29,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
#endif
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

@@ -70,13 +67,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
}

// usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf("\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) {
if (it.name != "COPY") {

@@ -103,6 +101,8 @@ int main(int argc, char ** argv) {
params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
params.pure = true;
} else {
usage(argv[0]);
}
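The new flag maps directly onto the pure field of llama_model_quantize_params, so the same behavior is reachable from the C API. A hedged sketch (file names and ftype are illustrative):

    #include "llama.h"

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    qparams.pure  = true; // no k-quant mixtures: every tensor gets the same type
    llama_model_quantize("ggml-model-f16.gguf", "ggml-model-q4_k_m.gguf", &qparams);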
@@ -3,6 +3,3 @@ add_executable(${TARGET} save-load-state.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
@@ -1,4 +1,3 @@
#include "build-info.h"
#include "common.h"
#include "llama.h"
@@ -11,6 +11,3 @@ if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
@@ -1,6 +1,5 @@
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include "grammar-parser.h"

#include "../llava/clip.h"

@@ -149,6 +148,7 @@ struct task_server {
task_type type;
json data;
bool infill_mode = false;
bool embedding_mode = false;
};

struct task_result {

@@ -371,6 +371,7 @@ struct llama_client_slot
std::vector<completion_token_output> generated_token_probs;

bool infill = false;
bool embedding = false;
bool has_next_token = true;
bool truncated = false;
bool stopped_eos = false;

@@ -454,7 +455,7 @@ struct llama_client_slot
}

void release() {
if (state == PROCESSING)
if (state == IDLE || state == PROCESSING)
{
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
command = RELEASE;

@@ -754,6 +755,7 @@ struct llama_server_context
}

slot->params.antiprompt.clear();

const auto &stop = data.find("stop");
if (stop != data.end() && stop->is_array())
{

@@ -856,7 +858,7 @@ struct llama_server_context

void kv_cache_clear() {
// clear the entire KV cache
llama_kv_cache_tokens_rm(ctx, -1, -1);
llama_kv_cache_clear(ctx);
clean_kv_cache = false;
}

@@ -867,7 +869,7 @@ struct llama_server_context

kv_cache_clear();

for (int32_t i = 0; i < batch.n_tokens; ++i)
for (int i = 0; i < (int) system_tokens.size(); ++i)
{
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
}

@@ -894,16 +896,8 @@ struct llama_server_context
{
slot.release();
}
wait_all_are_idle();
all_slots_are_idle = true;

// wait until system prompt load
system_need_update = true;
while (system_need_update)
{
std::this_thread::sleep_for(std::chrono::milliseconds(5));
}
// system prompt loaded, continue
}

void process_system_prompt_data(const json &sys_props) {

@@ -915,26 +909,6 @@ struct llama_server_context
{
notify_system_prompt_changed();
}
else
{
system_need_update = true;
}
}

void wait_all_are_idle() {
bool wait = true;
while (wait)
{
wait = false;
for (auto &slot : slots)
{
if (!slot.available())
{
wait = true;
break;
}
}
}
}

static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,

@@ -965,7 +939,6 @@ struct llama_server_context
slot.has_next_token = false;
}
stop_pos = pos;

}
}
@@ -1272,13 +1245,14 @@ struct llama_server_context
queue_results.push_back(res);
}

int request_completion(json data, bool infill)
int request_completion(json data, bool infill, bool embedding)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
task_server task;
task.id = id_gen++;
task.data = data;
task.infill_mode = infill;
task.embedding_mode = embedding;
task.type = COMPLETION_TASK;
queue_tasks.push_back(task);
return task.id;

@@ -1404,7 +1378,7 @@ struct llama_server_context
{
LOG_TEE("slot unavailable\n");
// send error result
send_error(task.id, "slot unavaliable");
send_error(task.id, "slot unavailable");
return;
}

@@ -1416,6 +1390,7 @@ struct llama_server_context
slot->reset();

slot->infill = task.infill_mode;
slot->embedding = task.embedding_mode;
slot->task_id = task.id;

if (!launch_slot_with_data(slot, task.data))

@@ -1444,7 +1419,7 @@ struct llama_server_context
process_tasks();

// update the system prompt wait until all slots are idle state
if (system_need_update)
if (system_need_update && all_slots_are_idle)
{
LOG_TEE("updating system prompt\n");
update_system_prompt();

@@ -1498,7 +1473,7 @@ struct llama_server_context
for (auto & slot : slots)
{
// release the slot
if (slot.state == PROCESSING && slot.command == RELEASE)
if (slot.command == RELEASE)
{
slot.state = IDLE;
slot.command = NONE;

@@ -1509,7 +1484,7 @@ struct llama_server_context
continue;
}

if (slot.state == IDLE || slot.command == RELEASE)
if (slot.state == IDLE)
{
continue;
}

@@ -1530,6 +1505,17 @@ struct llama_server_context
{
for (auto & slot : slots)
{
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();

// empty prompt passed -> release the slot and send empty response
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
{
slot.release();
slot.print_timings();
send_final_response(slot);
continue;
}

// need process the prompt
if (slot.state == IDLE && slot.command == LOAD_PROMPT)
{

@@ -1712,7 +1698,7 @@ struct llama_server_context
}

// prompt evaluated for embedding
if (params.embedding)
if (slot.embedding)
{
send_embedding(slot);
slot.release();

@@ -1749,8 +1735,8 @@ struct llama_server_context
if (!process_token(result, slot))
{
slot.release();
send_final_response(slot);
slot.print_timings();
send_final_response(slot);
}

slot.i_batch = -1;
@@ -1769,9 +1755,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -h, --help show this help message and exit\n");
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
printf(" --rope-scaling {none,linear,yarn}\n");
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");

@@ -1893,6 +1886,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.n_ctx = std::stoi(argv[i]);
}
else if (arg == "--rope-scaling")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
std::string value(argv[i]);
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
else { invalid_param = true; break; }
}
else if (arg == "--rope-freq-base")
{
if (++i >= argc)

@@ -1911,6 +1917,38 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.rope_freq_scale = std::stof(argv[i]);
}
else if (arg == "--yarn-ext-factor")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_ext_factor = std::stof(argv[i]);
}
else if (arg == "--yarn-attn-factor")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_attn_factor = std::stof(argv[i]);
}
else if (arg == "--yarn-beta-fast")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_fast = std::stof(argv[i]);
}
else if (arg == "--yarn-beta-slow")
{
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_slow = std::stof(argv[i]);
}
else if (arg == "--memory-f32" || arg == "--memory_f32")
{
params.memory_f16 = false;

@@ -1924,6 +1962,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.n_threads = std::stoi(argv[i]);
}
else if (arg == "--threads-batch" || arg == "-tb")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_threads_batch = std::stoi(argv[i]);
}
else if (arg == "-b" || arg == "--batch-size")
{
if (++i >= argc)

@@ -2216,8 +2263,8 @@ int main(int argc, char **argv)

llama_backend_init(params.numa);

LOG_INFO("build info", {{"build", BUILD_NUMBER},
LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
{"commit", BUILD_COMMIT}});
{"commit", LLAMA_COMMIT}});

LOG_INFO("system info", {
{"n_threads", params.n_threads},

@@ -2281,7 +2328,7 @@ int main(int argc, char **argv)
svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
{
json data = json::parse(req.body);
const int task_id = llama.request_completion(data, false);
const int task_id = llama.request_completion(data, false, false);
if (!json_value(data, "stream", false)) {
std::string completion_text;
task_result result = llama.next_result(task_id);

@@ -2336,7 +2383,7 @@ int main(int argc, char **argv)
svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
{
json data = json::parse(req.body);
const int task_id = llama.request_completion(data, true);
const int task_id = llama.request_completion(data, true, false);
if (!json_value(data, "stream", false)) {
std::string completion_text;
task_result result = llama.next_result(task_id);

@@ -2440,7 +2487,7 @@ int main(int argc, char **argv)
{
prompt = "";
}
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false);
const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
task_result result = llama.next_result(task_id);
return res.set_content(result.result_json.dump(), "application/json");
});
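The /embedding route is the one place that passes embedding = true; the flag now rides on the task and its slot instead of the process-wide params.embedding, so a single server can serve completions and embeddings concurrently. A hedged sketch of the call shape introduced here (prompt comes from the request body):

    // infill = false, embedding = true: evaluate the prompt, return only the embedding
    const int task_id = llama.request_completion(
        { {"prompt", prompt}, {"n_predict", 0} },
        /* infill    */ false,
        /* embedding */ true);
    task_result result = llama.next_result(task_id);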
@@ -95,13 +95,8 @@ int main(int argc, char ** argv) {
llama_batch batch = llama_batch_init(512, 0, 1);

// evaluate the initial prompt
batch.n_tokens = tokens_list.size();
for (size_t i = 0; i < tokens_list.size(); i++) {
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
for (int32_t i = 0; i < batch.n_tokens; i++) {
batch.token[i] = tokens_list[i];
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}

// llama_decode will output logits only for the last token of the prompt

@@ -148,15 +143,10 @@ int main(int argc, char ** argv) {
fflush(stdout);

// prepare the next batch
batch.n_tokens = 0;
llama_batch_clear(batch);

// push this new token for next evaluation
batch.token [batch.n_tokens] = new_token_id;
llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
batch.pos [batch.n_tokens] = n_cur;
batch.seq_id[batch.n_tokens] = 0;
batch.logits[batch.n_tokens] = true;

batch.n_tokens += 1;

n_decode += 1;
}
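The example now drives llama_batch through the common.h helpers instead of writing its fields by hand. A condensed sketch of the pattern, assuming tokens, ctx and n_cur are in scope as in the example:

    // queue the prompt; only the last token needs logits
    llama_batch batch = llama_batch_init(512, 0, 1);
    for (size_t i = 0; i < tokens.size(); i++) {
        llama_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, false);
    }
    batch.logits[batch.n_tokens - 1] = true;
    llama_decode(ctx, batch);

    // between generation steps:
    llama_batch_clear(batch);                          // resets n_tokens to 0
    llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);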
@@ -3,6 +3,3 @@ add_executable(${TARGET} speculative.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
@@ -1,5 +1,3 @@
#include "build-info.h"

#include "common.h"
#include "llama.h"

@@ -8,6 +6,9 @@
#include <string>
#include <vector>

#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

struct seq_draft {
bool active = false;
bool drafting = false;

@@ -36,9 +37,11 @@ int main(int argc, char ** argv) {
// max number of parallel drafting sequences (i.e. tree branches)
const int n_seq_dft = params.n_parallel;

// TODO: make this configurable
// probability threshold for accepting a token from the draft model
const float p_accept = 0.80f;
const float p_accept = params.p_accept;
const float p_split = 0.10f;

// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
const float p_split = params.p_split;

#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("speculative", "log"));

@@ -64,6 +67,33 @@ int main(int argc, char ** argv) {
params.n_gpu_layers = params.n_gpu_layers_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);

{
const int n_vocab_tgt = llama_n_vocab(model_tgt);
const int n_vocab_dft = llama_n_vocab(model_dft);
const int vocab_diff = n_vocab_tgt > n_vocab_dft
? n_vocab_tgt - n_vocab_dft
: n_vocab_dft - n_vocab_tgt;

if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
return 1;
}

for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
const char * token_text_dft = llama_token_get_text(model_dft, i);
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
llama_token_to_piece(ctx_tgt, i).c_str(),
llama_token_to_piece(ctx_dft, i).c_str());
return 1;
}
}
}

// tokenize the prompt
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);

@@ -118,7 +148,7 @@ int main(int argc, char ** argv) {
std::vector<seq_draft> drafts(n_seq_dft);

params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
params.sparams.temp = std::max(0.01f, params.sparams.temp);
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model

for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);

@@ -227,6 +257,7 @@ int main(int argc, char ** argv) {
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);

llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
llama_decode (ctx_dft, batch_dft);

++n_past_dft;

@@ -370,7 +401,7 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
}

//LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
llama_decode(ctx_tgt, batch_tgt);
++n_past_tgt;
}
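With p_accept and p_split now read from gpt_params, both thresholds are tunable per run. A hedged, simplified sketch of what they control in the drafting loop (the real code tracks several draft sequences and samplers):

    // cur_p: draft model candidates for the current position, sorted by probability
    if (cur_p[0].p < p_accept) {
        break; // draft model is not confident enough, stop extending this branch
    }
    if (n_seq_cur < n_seq_dft && cur_p[1].p > p_split) {
        // runner-up token is plausible too: fork another draft sequence for it
    }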
@@ -349,9 +349,9 @@ static struct ggml_tensor * llama_build_train_graphs(
// not capturing these, to silcence warnings
const int rope_mode = 0;

return ggml_rope_custom(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx,
rope_freq_base, rope_freq_scale);
return ggml_rope_custom(
ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};

set_name(tokens_input, "tokens_input");
12
flake.lock
generated
@@ -5,11 +5,11 @@
"systems": "systems"
},
"locked": {
"lastModified": 1692799911,
"lastModified": 1694529238,
"narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=",
"narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44",
"rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
"type": "github"
},
"original": {

@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1692913444,
"lastModified": 1698318101,
"narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=",
"narHash": "sha256-gUihHt3yPD7bVqg+k/UVHgngyaJ3DMEBchbymBMvK1E=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "18324978d632ffc55ef1d928e81630c620f4f447",
"rev": "63678e9f3d3afecfeafa0acead6239cdb447574c",
"type": "github"
},
"original": {
17
flake.nix
@@ -11,8 +11,7 @@
meta.mainProgram = "llama";
inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
buildInputs = with pkgs; [ openmpi ];
osSpecific = with pkgs; buildInputs ++
(
osSpecific = with pkgs; buildInputs ++ (
if isAarch64 && isDarwin then
with pkgs.darwin.apple_sdk_11_0.frameworks; [
Accelerate

@@ -51,6 +50,9 @@
};
llama-python =
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
llama-python-extra =
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece torchWithoutCuda transformers ]);
postPatch = ''
substituteInPlace ./ggml-metal.m \
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"

@@ -93,12 +95,15 @@
};
packages.rocm = pkgs.stdenv.mkDerivation {
inherit name src meta postPatch nativeBuildInputs postInstall;
buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
buildInputs = with pkgs.rocmPackages; buildInputs ++ [ clr hipblas rocblas ];
cmakeFlags = cmakeFlags ++ [
"-DLLAMA_HIPBLAS=1"
"-DCMAKE_C_COMPILER=hipcc"
"-DCMAKE_CXX_COMPILER=hipcc"
"-DCMAKE_POSITION_INDEPENDENT_CODE=ON"
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
# in github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
# and select the line that matches the current nixpkgs version of rocBLAS.
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
];
};
apps.llama-server = {

@@ -126,5 +131,9 @@
buildInputs = [ llama-python ];
packages = nativeBuildInputs ++ osSpecific;
};
devShells.extra = pkgs.mkShell {
buildInputs = [ llama-python-extra ];
packages = nativeBuildInputs ++ osSpecific;
};
});
}
672
ggml-cuda.cu
File diff suppressed because it is too large
237
ggml-impl.h
Normal file
@@ -0,0 +1,237 @@
#pragma once

#include "ggml.h"

// GGML internal header

#include <assert.h>
#include <stddef.h>
#include <stdbool.h>
#include <string.h> // memcpy
#include <math.h> // fabsf

#ifdef __cplusplus
extern "C" {
#endif

// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif

// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
#define __FMA__
#endif
#ifndef __F16C__
#define __F16C__
#endif
#ifndef __SSE3__
#define __SSE3__
#endif
#endif

#undef MIN
#undef MAX

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON) && !defined(_MSC_VER)

// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>

#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
#define GGML_COMPUTE_FP32_TO_FP16(x) (x)

#define GGML_FP16_TO_FP32(x) ((float) (x))
#define GGML_FP32_TO_FP16(x) (x)

#else

#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#ifdef __POWER9_VECTOR__
#include <altivec.h>
#undef bool
#define bool _Bool
#else
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif

#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif

#ifdef __F16C__

#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif

#elif defined(__POWER9_VECTOR__)

#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)

static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
register float f;
register double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}

static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
register double d;
register ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}

#else

// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16

static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
float as_value;
} fp32;
fp32.as_bits = w;
return fp32.as_value;
}

static inline uint32_t fp32_to_bits(float f) {
union {
float as_value;
uint32_t as_bits;
} fp32;
fp32.as_value = f;
return fp32.as_bits;
}

static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;

const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float exp_scale = 0x1.0p-112f;
#else
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}

static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
#else
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;

const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}

base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}

#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

#endif // __F16C__

#endif // __ARM_NEON

// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];

// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)

inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}

#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)

#endif

// TODO: backend v2 PR

#ifdef __cplusplus
}
#endif
46
ggml-metal.m
@@ -210,6 +210,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
if (sourcePath == nil) {
GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
sourcePath = @"ggml-metal.metal";
}
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
if (error) {

@@ -234,12 +238,15 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
// load kernels
{
NSError * error = nil;
#define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
/*
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
(int) ctx->pipeline_##name.threadExecutionWidth); \
*/
#define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
if (error) { \
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \

@@ -994,11 +1001,15 @@ void ggml_metal_graph_compute(
} break;
case GGML_OP_SOFT_MAX:
{
const int nth = MIN(32, ne00);
int nth = 32; // SIMD width

if (ne00%4 == 0) {
[encoder setComputePipelineState:ctx->pipeline_soft_max_4];
} else {
do {
nth *= 2;
} while (nth <= ne00 && nth <= 1024);
nth /= 2;
[encoder setComputePipelineState:ctx->pipeline_soft_max];
}
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];

@@ -1006,8 +1017,9 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0];

[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_DIAG_MASK_INF:
{

@@ -1336,7 +1348,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
[encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0];

const int64_t nrows = ggml_nrows(src0);

@@ -1391,11 +1403,16 @@ void ggml_metal_graph_compute(
const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
// skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];

float freq_base;
float freq_scale;
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

switch (src0->type) {
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break;

@@ -1425,8 +1442,13 @@ void ggml_metal_graph_compute(
[encoder setBytes:&n_past length:sizeof( int) atIndex:19];
[encoder setBytes:&n_dims length:sizeof( int) atIndex:20];
[encoder setBytes:&mode length:sizeof( int) atIndex:21];
[encoder setBytes:&freq_base length:sizeof(float) atIndex:22];
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:23];
[encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:22];
[encoder setBytes:&freq_base length:sizeof( float) atIndex:23];
[encoder setBytes:&freq_scale length:sizeof( float) atIndex:24];
[encoder setBytes:&ext_factor length:sizeof( float) atIndex:25];
[encoder setBytes:&attn_factor length:sizeof( float) atIndex:26];
[encoder setBytes:&beta_fast length:sizeof( float) atIndex:27];
[encoder setBytes:&beta_slow length:sizeof( float) atIndex:28];

[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
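The scalar soft_max path now scales its threadgroup up to the row length instead of capping at 32 threads. A small C++ restatement of the selection rule above, with example values (1024 is the Metal threadgroup limit assumed by the kernel):

    int pick_nth(int ne00) {
        int nth = 32;                                 // start at SIMD width
        do { nth *= 2; } while (nth <= ne00 && nth <= 1024);
        return nth / 2;                               // largest size that still fit
    }
    // pick_nth(100)  == 64
    // pick_nth(4096) == 1024 (clamped by the threadgroup limit)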
194
ggml-metal.metal
194
ggml-metal.metal
|
@ -184,36 +184,73 @@ kernel void kernel_soft_max(
|
||||||
constant int64_t & ne00,
|
constant int64_t & ne00,
|
||||||
constant int64_t & ne01,
|
constant int64_t & ne01,
|
||||||
constant int64_t & ne02,
|
constant int64_t & ne02,
|
||||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
threadgroup float * buf [[threadgroup(0)]],
|
||||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
uint tgpig[[threadgroup_position_in_grid]],
|
||||||
uint3 ntg[[threads_per_threadgroup]]) {
|
uint tpitg[[thread_position_in_threadgroup]],
|
||||||
const int64_t i03 = tgpig[2];
|
uint sgitg[[simdgroup_index_in_threadgroup]],
|
||||||
const int64_t i02 = tgpig[1];
|
uint tiisg[[thread_index_in_simdgroup]],
|
||||||
const int64_t i01 = tgpig[0];
|
uint ntg[[threads_per_threadgroup]]) {
|
||||||
|
const int64_t i03 = (tgpig) / (ne02*ne01);
|
||||||
|
const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
|
||||||
|
const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
|
||||||
|
|
||||||
device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||||
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||||
|
|
||||||
// parallel max
|
// parallel max
|
||||||
float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
|
float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY;
|
||||||
for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
|
|
||||||
|
for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) {
|
||||||
lmax = MAX(lmax, psrc0[i00]);
|
lmax = MAX(lmax, psrc0[i00]);
|
||||||
}
|
}
|
||||||
const float max = simd_max(lmax);
|
|
||||||
|
float max = simd_max(lmax);
|
||||||
|
if (tiisg == 0) {
|
||||||
|
buf[sgitg] = max;
|
||||||
|
}
|
||||||
|
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
|
||||||
|
// broadcast, simd group number is ntg / 32
|
||||||
|
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
|
||||||
|
if (tpitg < i) {
|
||||||
|
buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
|
||||||
|
max = buf[0];
|
||||||
|
|
||||||
// parallel sum
|
// parallel sum
|
||||||
float lsum = 0.0f;
|
float lsum = 0.0f;
|
||||||
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||||
const float exp_psrc0 = exp(psrc0[i00] - max);
|
const float exp_psrc0 = exp(psrc0[i00] - max);
|
||||||
lsum += exp_psrc0;
|
lsum += exp_psrc0;
|
||||||
// Remember the result of exp here. exp is expensive, so we really do not
|
// Remember the result of exp here. exp is expensive, so we really do not
|
||||||
// whish to compute it twice.
|
// wish to compute it twice.
|
||||||
pdst[i00] = exp_psrc0;
|
pdst[i00] = exp_psrc0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const float sum = simd_sum(lsum);
|
float sum = simd_sum(lsum);
|
||||||
|
if (tiisg == 0) {
|
||||||
|
buf[sgitg] = sum;
|
||||||
|
}
|
||||||
|
|
||||||
for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
|
||||||
|
// broadcast, simd group number is ntg / 32
|
||||||
|
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
|
||||||
|
if (tpitg < i) {
|
||||||
|
buf[tpitg] += buf[tpitg + i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||||
|
|
||||||
|
sum = buf[0];
|
||||||
|
|
||||||
|
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
||||||
pdst[i00] /= sum;
|
pdst[i00] /= sum;
|
||||||
}
|
}
|
||||||
}
|
}
|
@@ -224,37 +261,73 @@ kernel void kernel_soft_max_4(
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
-        uint3 tgpig[[threadgroup_position_in_grid]],
-        uint3 tpitg[[thread_position_in_threadgroup]],
-        uint3   ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = tgpig[2];
-    const int64_t i02 = tgpig[1];
-    const int64_t i01 = tgpig[0];
+        threadgroup float * buf [[threadgroup(0)]],
+        uint  tgpig[[threadgroup_position_in_grid]],
+        uint  tpitg[[thread_position_in_threadgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint    ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = (tgpig) / (ne02*ne01);
+    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
+    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
 
     device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
     device       float4 * pdst4 = (device       float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
 
     // parallel max
-    float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
-    for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
+    float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY;
+
+    for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) {
         lmax4 = fmax(lmax4, psrc4[i00]);
     }
-    float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
 
-    const float max = simd_max(lmax);
+    const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
+    float max = simd_max(lmax);
+    if (tiisg == 0) {
+        buf[sgitg] = max;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // broadcast, simd group number is ntg / 32
+    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
+        }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    max = buf[0];
 
     // parallel sum
     float4 lsum4 = 0.0f;
-    for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
         const float4 exp_psrc4 = exp(psrc4[i00] - max);
         lsum4 += exp_psrc4;
         pdst4[i00] = exp_psrc4;
     }
-    float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
 
-    const float sum = simd_sum(lsum);
+    const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
+    float sum = simd_sum(lsum);
+    if (tiisg == 0) {
+        buf[sgitg] = sum;
+    }
+
-    for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // broadcast, simd group number is ntg / 32
+    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            buf[tpitg] += buf[tpitg + i];
+        }
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    sum = buf[0];
+
+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
         pdst4[i00] /= sum;
     }
 }
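Both kernels subtract the row maximum before exponentiating. This is the standard numerically stable softmax identity (a well-known fact, not something introduced by this patch):

    \operatorname{softmax}(x)_i \;=\; \frac{e^{x_i}}{\sum_j e^{x_j}} \;=\; \frac{e^{x_i - m}}{\sum_j e^{x_j - m}}, \qquad m = \max_j x_j

Since the shift cancels exactly, the kernels can reduce the max first and reuse it; every exponent is then at most zero, so exp() cannot overflow in float.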
@@ -988,6 +1061,45 @@ kernel void kernel_alibi_f32(
         }
     }
 }
 
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static void rope_yarn(
+    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
+    thread float * cos_theta, thread float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
+    }
+    *cos_theta = cos(theta) * mscale;
+    *sin_theta = sin(theta) * mscale;
+}
+
+// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
+// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
+static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
+    return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base));
+}
+
+static void rope_yarn_corr_dims(
+    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
+) {
+    // start and end correction dims
+    dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
+    dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
+}
+
 typedef void (rope_t)(
         device const    void * src0,
         device const int32_t * src1,
@@ -1011,8 +1123,13 @@ typedef void (rope_t)(
         constant         int & n_past,
         constant         int & n_dims,
         constant         int & mode,
+        constant         int & n_orig_ctx,
         constant       float & freq_base,
         constant       float & freq_scale,
+        constant       float & ext_factor,
+        constant       float & attn_factor,
+        constant       float & beta_fast,
+        constant       float & beta_slow,
         uint  tiitg[[thread_index_in_threadgroup]],
         uint3  tptg[[threads_per_threadgroup]],
         uint3 tgpig[[threadgroup_position_in_grid]]);
@@ -1041,8 +1158,13 @@ kernel void kernel_rope(
         constant         int & n_past,
         constant         int & n_dims,
         constant         int & mode,
+        constant         int & n_orig_ctx,
         constant       float & freq_base,
         constant       float & freq_scale,
+        constant       float & ext_factor,
+        constant       float & attn_factor,
+        constant       float & beta_fast,
+        constant       float & beta_slow,
         uint  tiitg[[thread_index_in_threadgroup]],
         uint3  tptg[[threads_per_threadgroup]],
         uint3 tgpig[[threadgroup_position_in_grid]]) {
@@ -1052,19 +1174,22 @@ kernel void kernel_rope(
 
     const bool is_neox = mode & 2;
 
+    float corr_dims[2];
+    rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
+
     device const int32_t * pos = src1;
 
     const int64_t p = pos[i2];
 
-    const float theta_0 = freq_scale * (float)p;
+    const float theta_0 = (float)p;
     const float inv_ndims = -1.f/n_dims;
 
     if (!is_neox) {
         for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
 
             const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
-            const float cos_theta = cos(theta);
-            const float sin_theta = sin(theta);
+            float cos_theta, sin_theta;
+            rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
             device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
             device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
@@ -1079,9 +1204,12 @@ kernel void kernel_rope(
         for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
             for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {
 
-                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
-                const float cos_theta = cos(theta);
-                const float sin_theta = sin(theta);
+                // simplified from `(ib * n_dims + ic) * inv_ndims`
+                const float cur_rot = inv_ndims*ic - ib;
+
+                const float theta = theta_0 * pow(freq_base, cur_rot);
+                float cos_theta, sin_theta;
+                rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
 
                 const int64_t i0 = ib*n_dims + ic/2;
 
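A sketch of where the "apparently solving" comment in rope_yarn_corr_factor comes from (our reading, not part of the patch): dimension pair $i$ advances by an angle of $p \cdot \mathrm{base}^{-2i/n_{\mathrm{dims}}}$ at position $p$, so over the original context length it completes

    n_{\mathrm{rot}} \;=\; \frac{n_{\mathrm{orig\_ctx}}}{2\pi \, \mathrm{base}^{2i/n_{\mathrm{dims}}}}

full rotations. Solving for the dimension index $i$ at which exactly $n_{\mathrm{rot}}$ rotations occur gives

    i \;=\; \frac{n_{\mathrm{dims}}}{2\ln(\mathrm{base})} \,\ln\!\left(\frac{n_{\mathrm{orig\_ctx}}}{2\pi\, n_{\mathrm{rot}}}\right)

which is rope_yarn_corr_factor as implemented; beta_fast and beta_slow are the two n_rot thresholds that bound the ramp built by rope_yarn_ramp.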
File diff suppressed because it is too large
@@ -1,11 +1,63 @@
 #pragma once
 
-#include "ggml.h"
+#include "ggml-impl.h"
 
+// GGML internal header
+
 #include <stdint.h>
-#include <assert.h>
 #include <stddef.h>
 
+#define QK4_0 32
+typedef struct {
+    ggml_fp16_t d;          // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+#define QK4_1 32
+typedef struct {
+    ggml_fp16_t d;          // delta
+    ggml_fp16_t m;          // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+#define QK5_0 32
+typedef struct {
+    ggml_fp16_t d;          // delta
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_0 / 2];  // nibbles / quants
+} block_q5_0;
+static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+#define QK5_1 32
+typedef struct {
+    ggml_fp16_t d;          // delta
+    ggml_fp16_t m;          // min
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_1 / 2];  // nibbles / quants
+} block_q5_1;
+static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
+#define QK8_0 32
+typedef struct {
+    ggml_fp16_t d;          // delta
+    int8_t qs[QK8_0];       // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
+
+#define QK8_1 32
+typedef struct {
+    float d;                // delta
+    float s;                // d * sum(qs[i])
+    int8_t qs[QK8_1];       // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
+
+//
+// Super-block quantization structures
+//
+
 // Super-block size
 #ifdef GGML_QKK_64
 #define QK_K 64
@@ -15,18 +67,6 @@
 #define K_SCALE_SIZE 12
 #endif
 
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-
-//
-// Super-block quantization structures
-//
-
 // 2-bit quantization
 // weight is represented as x = a * q + b
 // 16 blocks of 16 elements each
@@ -127,6 +167,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
 
 
 // Quantization
+void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
+void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
+void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
+void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
+void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
+void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
+
 void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
 void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
 void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
@@ -134,6 +181,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
 void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
 
+void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
+void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
+void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
+void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
+void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
+
 void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
@@ -142,6 +196,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
 void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
 
 // Dequantization
+void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
+void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
+void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
+void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
+void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
+//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
+
 void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
 void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
 void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
@@ -150,16 +211,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
 void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
 
 // Dot product
+void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+
 void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 
-// Quantization with histogram collection
-size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
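The block layouts above encode everything needed to decode a row. As an illustration (not the library's implementation, which lives in the suppressed diff above), q4_0 packs two 4-bit quants per byte and reconstructs each weight as x = d * (q - 8). A hedged sketch, with the fp16 scale passed in pre-converted to keep it self-contained:

    #include <cstdint>

    #define QK4_0 32

    // Sketch only: the real block stores the scale as ggml_fp16_t; here it is
    // passed in already converted to float. Nibble ordering follows ggml's
    // convention at the time (low nibbles first half, high nibbles second half).
    typedef struct {
        uint8_t qs[QK4_0 / 2];  // nibbles / quants
    } q4_0_nibbles;

    static void dequantize_q4_0_sketch(const q4_0_nibbles * b, float d, float * y) {
        for (int j = 0; j < QK4_0/2; ++j) {
            const int q0 = (b->qs[j] & 0x0F) - 8;  // low nibble  -> elements 0..15
            const int q1 = (b->qs[j] >>   4) - 8;  // high nibble -> elements 16..31
            y[j]           = q0 * d;
            y[j + QK4_0/2] = q1 * d;
        }
    }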
42 ggml.h
@@ -219,7 +219,7 @@
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
 #define GGML_MAX_NAME          64
-#define GGML_MAX_OP_PARAMS     32
+#define GGML_MAX_OP_PARAMS     64
 #define GGML_DEFAULT_N_THREADS 4
 
 #if UINTPTR_MAX == 0xFFFFFFFF
@@ -401,15 +401,16 @@ extern "C" {
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
         GGML_OP_CONV_1D,
-        GGML_OP_CONV_2D,
+        GGML_OP_CONV_1D_STAGE_0,  // internal
+        GGML_OP_CONV_1D_STAGE_1,  // internal
         GGML_OP_CONV_TRANSPOSE_1D,
+        GGML_OP_CONV_2D,
+        GGML_OP_CONV_2D_STAGE_0,  // internal
+        GGML_OP_CONV_2D_STAGE_1,  // internal
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
 
-        GGML_OP_CONV_1D_STAGE_0,  // internal
-        GGML_OP_CONV_1D_STAGE_1,  // internal
-
         GGML_OP_UPSCALE, // nearest interpolate
 
         GGML_OP_FLASH_ATTN,
@@ -1020,9 +1021,9 @@ extern "C" {
             struct ggml_tensor  * b,
             float                 eps);
 
-    // A: n columns, m rows
-    // B: n columns, p rows  (i.e. we transpose it internally)
-    // result is m columns, p rows
+    // A: k columns, n rows => [ne03, ne02, n, k]
+    // B: k columns, m rows  (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
+    // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
     GGML_API struct ggml_tensor * ggml_mul_mat(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1325,8 +1326,13 @@ extern "C" {
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
+            int                   n_orig_ctx,
             float                 freq_base,
-            float                 freq_scale);
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
 
     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
@@ -1336,8 +1342,17 @@ extern "C" {
             int                   n_dims,
             int                   mode,
             int                   n_ctx,
+            int                   n_orig_ctx,
             float                 freq_base,
-            float                 freq_scale);
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
+    // compute correction dims for YaRN RoPE scaling
+    void ggml_rope_yarn_corr_dims(
+        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
 
     // xPos RoPE, in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
@@ -1929,12 +1944,19 @@ extern "C" {
     // quantization
     //
 
+    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
 
+    GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
     //
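With the extended signature, callers supply the YaRN knobs alongside the existing frequency parameters. A hedged sketch of a call, with argument order taken from the header hunk above; `ctx`, `cur` (the tensor to rotate) and `inp_pos` (positions) are assumed to already exist, and every numeric value is a placeholder, not a recommendation:

    // Hypothetical call to the extended RoPE; new arguments are marked.
    struct ggml_tensor * rotated = ggml_rope_custom(
        ctx, cur, inp_pos,
        /*n_dims      =*/ 128,
        /*mode        =*/ 0,
        /*n_ctx       =*/ 4096,
        /*n_orig_ctx  =*/ 4096,    // new: original training context
        /*freq_base   =*/ 10000.0f,
        /*freq_scale  =*/ 1.0f,
        /*ext_factor  =*/ 0.0f,    // new: 0 disables the YaRN ramp (see rope_yarn above)
        /*attn_factor =*/ 1.0f,    // new: magnitude scaling
        /*beta_fast   =*/ 32.0f,   // new: low correction dim
        /*beta_slow   =*/ 1.0f);   // new: high correction dim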
@@ -7,7 +7,7 @@ import shutil
 import struct
 import sys
 import tempfile
-from enum import IntEnum, auto
+from enum import Enum, IntEnum, auto
 from io import BufferedWriter
 from pathlib import Path
 from typing import IO, Any, BinaryIO, Callable, Sequence
@@ -55,7 +55,10 @@ KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
 # RoPE
 KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
 KEY_ROPE_FREQ_BASE       = "{arch}.rope.freq_base"
-KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"
+KEY_ROPE_SCALING_TYPE         = "{arch}.rope.scaling.type"
+KEY_ROPE_SCALING_FACTOR       = "{arch}.rope.scaling.factor"
+KEY_ROPE_SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
+KEY_ROPE_SCALING_FINETUNED    = "{arch}.rope.scaling.finetuned"
 
 # tokenization
 KEY_TOKENIZER_MODEL      = "tokenizer.ggml.model"
@@ -408,6 +411,7 @@ class TensorNameMap:
             "layers.{bid}.attention_norm",                          # llama-pth
             "encoder.layer.{bid}.attention.output.LayerNorm",       # bert
             "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
+            "model.layers.{bid}.ln1",                               # yi
         ),
 
         # Attention norm 2
@@ -479,6 +483,7 @@ class TensorNameMap:
             "layers.{bid}.ffn_norm",                                         # llama-pth
             "encoder.layer.{bid}.output.LayerNorm",                          # bert
             "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
+            "model.layers.{bid}.ln2",                                        # yi
         ),
 
         # Feed-forward up
@@ -595,6 +600,11 @@ class TokenType(IntEnum):
     UNUSED    = 5
     BYTE      = 6
 
+
+class RopeScalingType(Enum):
+    NONE   = 'none'
+    LINEAR = 'linear'
+    YARN   = 'yarn'
+
 #
 # implementation
 #
@@ -966,8 +976,17 @@ class GGUFWriter:
     def add_rope_freq_base(self, value: float):
         self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)
 
-    def add_rope_scale_linear(self, value: float):
-        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
+    def add_rope_scaling_type(self, value: RopeScalingType):
+        self.add_string(KEY_ROPE_SCALING_TYPE.format(arch=self.arch), value.value)
+
+    def add_rope_scaling_factor(self, value: float):
+        self.add_float32(KEY_ROPE_SCALING_FACTOR.format(arch=self.arch), value)
+
+    def add_rope_scaling_orig_ctx_len(self, value: int):
+        self.add_uint32(KEY_ROPE_SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)
+
+    def add_rope_scaling_finetuned(self, value: bool):
+        self.add_bool(KEY_ROPE_SCALING_FINETUNED.format(arch=self.arch), value)
 
     def add_tokenizer_model(self, model: str):
         self.add_string(KEY_TOKENIZER_MODEL, model)
36 llama.h
@@ -106,6 +106,14 @@ extern "C" {
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
 
+    enum llama_rope_scaling_type {
+        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
+        LLAMA_ROPE_SCALING_NONE        = 0,
+        LLAMA_ROPE_SCALING_LINEAR      = 1,
+        LLAMA_ROPE_SCALING_YARN        = 2,
+        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -172,13 +180,19 @@ extern "C" {
         uint32_t n_batch;         // prompt processing maximum batch size
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base;  // RoPE base frequency, 0 = from model
         float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
+        float yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float yarn_attn_factor; // YaRN magnitude scaling factor
+        float yarn_beta_fast;   // YaRN low correction dim
+        float yarn_beta_slow;   // YaRN high correction dim
+        uint32_t yarn_orig_ctx; // YaRN original context size
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool embedding;  // embedding mode only
@@ -191,6 +205,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
     } llama_model_quantize_params;
 
     // grammar types
@@ -333,15 +348,12 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
             "avoid using this, it will be removed in the future, instead - count the tokens in user code");
 
-    // Remove all tokens data of cells in [c0, c1)
-    // c0 < 0 : [0, c1]
-    // c1 < 0 : [c0, inf)
-    LLAMA_API void llama_kv_cache_tokens_rm(
-            struct llama_context * ctx,
-                         int32_t   c0,
-                         int32_t   c1);
+    // Clear the KV cache
+    LLAMA_API void llama_kv_cache_clear(
+            struct llama_context * ctx);
 
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // seq_id < 0 : match any sequence
     // p0 < 0 : [0, p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_cache_seq_rm(
@@ -600,6 +612,13 @@ extern "C" {
             float   p,
             size_t  min_keep);
 
+    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    LLAMA_API void llama_sample_min_p(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+                           float   p,
+                          size_t   min_keep);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
@@ -658,6 +677,7 @@ extern "C" {
             float * mu);
 
     /// @details Selects the token with the highest probability.
+    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
     LLAMA_API llama_token llama_sample_token_greedy(
            struct llama_context * ctx,
          llama_token_data_array * candidates);
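Taken together, the new public surface looks roughly like this from a caller's side. A hedged sketch only: llama_context_default_params() and the sampling loop around `candidates` are assumed from the existing API, and the numeric values are placeholders, not recommendations:

    // Hypothetical usage of the new context-params fields and sampler.
    struct llama_context_params cparams = llama_context_default_params();
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; // int8_t, from enum llama_rope_scaling_type
    cparams.yarn_orig_ctx     = 4096;   // original training context (placeholder)
    cparams.yarn_attn_factor  = 1.0f;   // YaRN magnitude scaling
    cparams.yarn_beta_fast    = 32.0f;  // low correction dim
    cparams.yarn_beta_slow    = 1.0f;   // high correction dim

    // New sampler: drops tokens whose probability is below p times that of the
    // most likely token (see PR #3841); `ctx` and `candidates` assumed set up.
    llama_sample_min_p(ctx, &candidates, 0.05f, 1);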
Binary file not shown.
@@ -1,5 +1,5 @@
-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
-set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
+set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
 set(BUILD_NUMBER 0)
 set(BUILD_COMMIT "unknown")
 set(BUILD_COMPILER "unknown")
@@ -24,16 +24,22 @@ if(Git_FOUND)
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
         OUTPUT_VARIABLE HEAD
         OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE RES
     )
+    if (RES EQUAL 0)
+        set(BUILD_COMMIT ${HEAD})
+    endif()
     execute_process(
         COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
         WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
         OUTPUT_VARIABLE COUNT
         OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE RES
     )
-    set(BUILD_COMMIT ${HEAD})
+    if (RES EQUAL 0)
         set(BUILD_NUMBER ${COUNT})
     endif()
+endif()
 
 if(MSVC)
     set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
@@ -53,22 +59,22 @@ else()
     set(BUILD_TARGET ${OUT})
 endif()
 
-# Only write the header if it's changed to prevent unnecessary recompilation
-if(EXISTS ${HEADER_FILE})
-    file(READ ${HEADER_FILE} CONTENTS)
-    string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS})
+# Only write the build info if it changed
+if(EXISTS ${OUTPUT_FILE})
+    file(READ ${OUTPUT_FILE} CONTENTS)
+    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
     set(OLD_COMMIT ${CMAKE_MATCH_1})
-    string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS})
+    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
     set(OLD_COMPILER ${CMAKE_MATCH_1})
-    string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS})
+    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
     set(OLD_TARGET ${CMAKE_MATCH_1})
     if (
        NOT OLD_COMMIT   STREQUAL BUILD_COMMIT   OR
        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
        NOT OLD_TARGET   STREQUAL BUILD_TARGET
    )
-        configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
+        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
    endif()
 else()
-    configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
+    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
 endif()

@@ -1,9 +0,0 @@
-#ifndef BUILD_INFO_H
-#define BUILD_INFO_H
-
-#define BUILD_NUMBER @BUILD_NUMBER@
-#define BUILD_COMMIT "@BUILD_COMMIT@"
-#define BUILD_COMPILER "@BUILD_COMPILER@"
-#define BUILD_TARGET "@BUILD_TARGET@"
-
-#endif // BUILD_INFO_H

@@ -24,12 +24,7 @@ if out=$($CC -dumpmachine); then
     build_target=$out
 fi
 
-echo "#ifndef BUILD_INFO_H"
-echo "#define BUILD_INFO_H"
-echo
-echo "#define BUILD_NUMBER $build_number"
-echo "#define BUILD_COMMIT \"$build_commit\""
-echo "#define BUILD_COMPILER \"$build_compiler\""
-echo "#define BUILD_TARGET \"$build_target\""
-echo
-echo "#endif // BUILD_INFO_H"
+echo "int LLAMA_BUILD_NUMBER = ${build_number};"
+echo "char const *LLAMA_COMMIT = \"${build_commit}\";"
+echo "char const *LLAMA_COMPILER = \"${build_compiler}\";"
+echo "char const *LLAMA_BUILD_TARGET = \"${build_target}\";"
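With the generated header gone, the build info is now emitted as a C++ translation unit and consumers link against it. A sketch of how a program might consume the globals the script above emits (in-tree code would get these declarations from a shared header rather than redeclaring them locally):

    #include <cstdio>

    // Declarations matching the globals emitted by the updated script.
    extern int LLAMA_BUILD_NUMBER;
    extern char const * LLAMA_COMMIT;
    extern char const * LLAMA_COMPILER;
    extern char const * LLAMA_BUILD_TARGET;

    int main() {
        std::printf("build %d (%s), compiler %s, target %s\n",
                    LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
        return 0;
    }

Because the values live in a .cpp file instead of a header, changing the commit hash no longer forces a rebuild of every file that used to include build-info.h.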
391 scripts/server-llm.sh Normal file
@@ -0,0 +1,391 @@
+#!/bin/bash
+#
+# Helper script for deploying llama.cpp server with a single Bash command
+#
+# - Works on Linux and macOS
+# - Supports: CPU, CUDA, Metal, OpenCL
+# - Can run all GGUF models from HuggingFace
+# - Can serve requests in parallel
+# - Always builds latest llama.cpp from GitHub
+#
+# Limitations
+#
+# - Chat templates are poorly supported (base models recommended)
+# - Might be unstable!
+#
+# Usage:
+#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
+#
+#   --port:       port number, default is 8888
+#   --repo:       path to a repo containing GGUF model files
+#   --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input
+#   --backend:    cpu, cuda, metal, opencl, depends on the OS
+#   --gpu-id:     gpu id, default is 0
+#   --n-parallel: number of parallel requests, default is 8
+#   --n-kv:       KV cache size, default is 4096
+#   --verbose:    verbose output
+#
+# Example:
+#
+#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
+#
+
+set -e
+
+# required utils: curl, git, make
+if ! command -v curl &> /dev/null; then
+    printf "[-] curl not found\n"
+    exit 1
+fi
+if ! command -v git &> /dev/null; then
+    printf "[-] git not found\n"
+    exit 1
+fi
+if ! command -v make &> /dev/null; then
+    printf "[-] make not found\n"
+    exit 1
+fi
+
+# parse arguments
+port=8888
+repo=""
+wtype=""
+backend="cpu"
+
+# if macOS, use metal backend by default
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    backend="metal"
+elif command -v nvcc &> /dev/null; then
+    backend="cuda"
+fi
+
+gpu_id=0
+n_parallel=8
+n_kv=4096
+verbose=0
+
+function print_usage {
+    printf "Usage:\n"
+    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
+    printf "  --port:       port number, default is 8888\n"
+    printf "  --repo:       path to a repo containing GGUF model files\n"
+    printf "  --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
+    printf "  --backend:    cpu, cuda, metal, opencl, depends on the OS\n"
+    printf "  --gpu-id:     gpu id, default is 0\n"
+    printf "  --n-parallel: number of parallel requests, default is 8\n"
+    printf "  --n-kv:       KV cache size, default is 4096\n"
+    printf "  --verbose:    verbose output\n\n"
+    printf "Example:\n\n"
+    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
+}
+
+while [[ $# -gt 0 ]]; do
+    key="$1"
+    case $key in
+        --port)
+            port="$2"
+            shift
+            shift
+            ;;
+        --repo)
+            repo="$2"
+            shift
+            shift
+            ;;
+        --wtype)
+            wtype="$2"
+            shift
+            shift
+            ;;
+        --backend)
+            backend="$2"
+            shift
+            shift
+            ;;
+        --gpu-id)
+            gpu_id="$2"
+            shift
+            shift
+            ;;
+        --n-parallel)
+            n_parallel="$2"
+            shift
+            shift
+            ;;
+        --n-kv)
+            n_kv="$2"
+            shift
+            shift
+            ;;
+        --verbose)
+            verbose=1
+            shift
+            ;;
+        --help)
+            print_usage
+            exit 0
+            ;;
+        *)
+            echo "Unknown argument: $key"
+            print_usage
+            exit 1
+            ;;
+    esac
+done
+
+# available weights types
+wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
+
+wfiles=()
+for wt in "${wtypes[@]}"; do
+    wfiles+=("")
+done
+
+# sample repos
+repos=(
+    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
+    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
+    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
+    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
+    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
+    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
+    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
+    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
+)
+
+printf "\n"
+printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
+printf "    Based on the options that follow, the script might download a model file\n"
+printf "    from the internet, which can be a few GBs in size. The script will also\n"
+printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
+printf "\n"
+printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
+printf "    model using llama.cpp for demonstration purposes.\n"
+printf "\n"
+printf "    Please note:\n"
+printf "\n"
+printf "    - All new data will be stored in the current folder\n"
+printf "    - The server will be listening on all network interfaces\n"
+printf "    - The server will run with default settings which are not always optimal\n"
+printf "    - Do not judge the quality of a model based on the results from this script\n"
+printf "    - Do not use this script to benchmark llama.cpp\n"
+printf "    - Do not use this script in production\n"
+printf "    - This script is only for demonstration purposes\n"
+printf "\n"
+printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
+printf "\n"
+printf "    Press Enter to continue ...\n\n"
+
+read
+
+if [[ -z "$repo" ]]; then
+    printf "[+] No repo provided from the command line\n"
+    printf "    Please select a number from the list below or enter an URL:\n\n"
+
+    is=0
+    for r in "${repos[@]}"; do
+        printf "    %2d) %s\n" $is "$r"
+        is=$((is+1))
+    done
+
+    # ask for repo until index of sample repo is provided or an URL
+    while [[ -z "$repo" ]]; do
+        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
+        read -p "[+] Select repo: " repo
+
+        # check if the input is a number
+        if [[ "$repo" =~ ^[0-9]+$ ]]; then
+            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
+                repo="${repos[$repo]}"
+            else
+                printf "[-] Invalid repo index: %s\n" "$repo"
+                repo=""
+            fi
+        elif [[ "$repo" =~ ^https?:// ]]; then
+            repo="$repo"
+        else
+            printf "[-] Invalid repo URL: %s\n" "$repo"
+            repo=""
+        fi
+    done
+fi
+
+# remove suffix
+repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
+
+printf "[+] Checking for GGUF model files in %s\n" "$repo"
+
+# find GGUF files in the source
+# TODO: better logic
+model_tree="${repo%/}/tree/main"
+model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
+
+# list all files in the provided git repo
+printf "[+] Model files:\n\n"
+for file in $model_files; do
+    # determine iw by grepping the filename with wtypes
+    iw=-1
+    is=0
+    for wt in "${wtypes[@]}"; do
+        # uppercase
+        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
+        if [[ "$ufile" =~ "$wt" ]]; then
+            iw=$is
+            break
+        fi
+        is=$((is+1))
+    done
+
+    if [[ $iw -eq -1 ]]; then
+        continue
+    fi
+
+    wfiles[$iw]="$file"
+
+    have=" "
+    if [[ -f "$file" ]]; then
+        have="*"
+    fi
+
+    printf "    %2d) %s %s\n" $iw "$have" "$file"
+done
+
+# ask for weights type until provided and available
+while [[ -z "$wtype" ]]; do
+    printf "\n"
+    read -p "[+] Select weight type: " wtype
+    wfile="${wfiles[$wtype]}"
+
+    if [[ -z "$wfile" ]]; then
+        printf "[-] Invalid weight type: %s\n" "$wtype"
+        wtype=""
+    fi
+done
+
+printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
+
+url="${repo%/}/resolve/main/$wfile"
+
+# check file if the model has been downloaded before
+chk="$wfile.chk"
+
+# check if we should download the file
+# - if $wfile does not exist
+# - if $wfile exists but $chk does not exist
+# - if $wfile exists and $chk exists but $wfile is newer than $chk
+# TODO: better logic using git lfs info
+
+do_download=0
+
+if [[ ! -f "$wfile" ]]; then
+    do_download=1
+elif [[ ! -f "$chk" ]]; then
+    do_download=1
+elif [[ "$wfile" -nt "$chk" ]]; then
+    do_download=1
+fi
+
+if [[ $do_download -eq 1 ]]; then
+    printf "[+] Downloading weights from %s\n" "$url"
+
+    # download the weights file
+    curl -o "$wfile" -# -L "$url"
+
+    # create a check file if successful
+    if [[ $? -eq 0 ]]; then
+        printf "[+] Creating check file %s \n" "$chk"
+        touch "$chk"
+    fi
+else
+    printf "[+] Using cached weights %s\n" "$wfile"
+fi
+
+# get latest llama.cpp and build
+
+printf "[+] Downloading latest llama.cpp\n"
+
+llama_cpp_dir="__llama_cpp_port_${port}__"
+
+if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
+    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
+    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
+    printf "[-] Please remove it and try again\n"
+    exit 1
+elif [[ -d "$llama_cpp_dir" ]]; then
+    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
+    printf "[+] Using cached llama.cpp\n"
+
+    cd "$llama_cpp_dir"
+    git reset --hard
+    git fetch
+    git checkout origin/master
+
+    cd ..
+else
+    printf "[+] Cloning llama.cpp\n"
+
+    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
+fi
+
+# mark that that the directory is made by this script
+touch "$llama_cpp_dir/__ggml_script__"
+
+if [[ $verbose -eq 1 ]]; then
+    set -x
+fi
+
+# build
+cd "$llama_cpp_dir"
+
+make clean
+
+log="--silent"
+if [[ $verbose -eq 1 ]]; then
+    log=""
+fi
+
+if [[ "$backend" == "cuda" ]]; then
+    printf "[+] Building with CUDA backend\n"
+    LLAMA_CUBLAS=1 make -j server $log
+elif [[ "$backend" == "cpu" ]]; then
+    printf "[+] Building with CPU backend\n"
+    make -j server $log
+elif [[ "$backend" == "metal" ]]; then
+    printf "[+] Building with Metal backend\n"
+    make -j server $log
+elif [[ "$backend" == "opencl" ]]; then
+    printf "[+] Building with OpenCL backend\n"
+    LLAMA_CLBLAST=1 make -j server $log
+else
+    printf "[-] Unknown backend: %s\n" "$backend"
+    exit 1
+fi
+
+# run the server
+
+printf "[+] Running server\n"
+
+args=""
+if [[ "$backend" == "cuda" ]]; then
+    export CUDA_VISIBLE_DEVICES=$gpu_id
+    args="-ngl 999"
+elif [[ "$backend" == "cpu" ]]; then
+    args="-ngl 0"
+elif [[ "$backend" == "metal" ]]; then
+    args="-ngl 999"
+elif [[ "$backend" == "opencl" ]]; then
+    args="-ngl 999"
+else
+    printf "[-] Unknown backend: %s\n" "$backend"
+    exit 1
+fi
+
+if [[ $verbose -eq 1 ]]; then
+    args="$args --verbose"
+fi
+
+./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
+
+exit 0
@@ -4,7 +4,7 @@
 
 #undef NDEBUG
 #include <cassert>
-#if !defined(__riscv) && !defined(__s390__)
+#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON)
 #include <immintrin.h>
 #endif
 #include <cmath>
@@ -129,6 +129,13 @@ int main(int argc, char * argv[]) {
         ggml_type type = (ggml_type) i;
         ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
 
+        // deprecated - skip
+        if (qfns.blck_size == 0) {
+            continue;
+        }
+
+        printf("Testing %s\n", ggml_type_name((ggml_type) i));
+
         if (qfns.from_float && qfns.to_float) {
             const float total_error = total_quantization_error(qfns, test_size, test_data.data());
             const float max_quantization_error =
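For reference, a round-trip error metric of the kind total_quantization_error computes can be sketched as follows; this is an assumption about its shape (quantize, dequantize, compare), and the real helper in the test file may normalize differently:

    #include <cmath>
    #include <cstddef>

    // Sketch: RMSE between the original row and its quantize/dequantize
    // round trip. Illustrative only; not the test's actual implementation.
    static float round_trip_rmse(const float * x, const float * x_roundtrip, size_t n) {
        double acc = 0.0;
        for (size_t i = 0; i < n; ++i) {
            const double d = (double) x[i] - (double) x_roundtrip[i];
            acc += d * d;
        }
        return (float) std::sqrt(acc / (double) n);
    }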